00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 15. July 2011 00005 * $Revision: V1.0.10 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_q7.c 00009 * 00010 * Description: Partial convolution of Q7 sequences. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0 00013 * 00014 * Version 1.0.10 2011/7/15 00015 * Big Endian support added and Merged M0 and M3/M4 Source code. 00016 * 00017 * Version 1.0.3 2010/11/29 00018 * Re-organized the CMSIS folders and updated documentation. 00019 * 00020 * Version 1.0.2 2010/11/11 00021 * Documentation updated. 00022 * 00023 * Version 1.0.1 2010/10/05 00024 * Production release and review comments incorporated. 00025 * 00026 * Version 1.0.0 2010/09/20 00027 * Production release and review comments incorporated 00028 * 00029 * Version 0.0.7 2010/06/10 00030 * Misra-C changes done 00031 * 00032 * -------------------------------------------------------------------- */ 00033 00034 #include "arm_math.h" 00035 00058 arm_status arm_conv_partial_q7( 00059 q7_t * pSrcA, 00060 uint32_t srcALen, 00061 q7_t * pSrcB, 00062 uint32_t srcBLen, 00063 q7_t * pDst, 00064 uint32_t firstIndex, 00065 uint32_t numPoints) 00066 { 00067 00068 00069 #ifndef ARM_MATH_CM0 00070 00071 /* Run the below code for Cortex-M4 and Cortex-M3 */ 00072 00073 q7_t *pIn1; /* inputA pointer */ 00074 q7_t *pIn2; /* inputB pointer */ 00075 q7_t *pOut = pDst; /* output pointer */ 00076 q7_t *px; /* Intermediate inputA pointer */ 00077 q7_t *py; /* Intermediate inputB pointer */ 00078 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00079 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */ 00080 q31_t input1, input2; 00081 q15_t in1, in2; 00082 q7_t x0, x1, x2, x3, c0, c1; 00083 uint32_t j, k, count, check, blkCnt; 00084 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */ 00085 arm_status status; 00086 00087 00088 /* Check for range of output samples to be calculated */ 00089 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00090 { 00091 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00092 status = ARM_MATH_ARGUMENT_ERROR; 00093 } 00094 else 00095 { 00096 00097 /* The algorithm implementation is based on the lengths of the inputs. */ 00098 /* srcB is always made to slide across srcA. */ 00099 /* So srcBLen is always considered as shorter or equal to srcALen */ 00100 if(srcALen >= srcBLen) 00101 { 00102 /* Initialization of inputA pointer */ 00103 pIn1 = pSrcA; 00104 00105 /* Initialization of inputB pointer */ 00106 pIn2 = pSrcB; 00107 } 00108 else 00109 { 00110 /* Initialization of inputA pointer */ 00111 pIn1 = pSrcB; 00112 00113 /* Initialization of inputB pointer */ 00114 pIn2 = pSrcA; 00115 00116 /* srcBLen is always considered as shorter or equal to srcALen */ 00117 j = srcBLen; 00118 srcBLen = srcALen; 00119 srcALen = j; 00120 } 00121 00122 /* Conditions to check which loopCounter holds 00123 * the first and last indices of the output samples to be calculated. */ 00124 check = firstIndex + numPoints; 00125 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00126 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00127 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00128 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00129 (int32_t) numPoints) : 0; 00130 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00131 (int32_t) firstIndex); 00132 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00133 00134 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00135 /* The function is internally 00136 * divided into three stages according to the number of multiplications that has to be 00137 * taken place between inputA samples and inputB samples. In the first stage of the 00138 * algorithm, the multiplications increase by one for every iteration. 00139 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00140 * In the third stage of the algorithm, the multiplications decrease by one 00141 * for every iteration. */ 00142 00143 /* Set the output pointer to point to the firstIndex 00144 * of the output sample to be calculated. */ 00145 pOut = pDst + firstIndex; 00146 00147 /* -------------------------- 00148 * Initializations of stage1 00149 * -------------------------*/ 00150 00151 /* sum = x[0] * y[0] 00152 * sum = x[0] * y[1] + x[1] * y[0] 00153 * .... 00154 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00155 */ 00156 00157 /* In this stage the MAC operations are increased by 1 for every iteration. 00158 The count variable holds the number of MAC operations performed. 00159 Since the partial convolution starts from from firstIndex 00160 Number of Macs to be performed is firstIndex + 1 */ 00161 count = 1u + firstIndex; 00162 00163 /* Working pointer of inputA */ 00164 px = pIn1; 00165 00166 /* Working pointer of inputB */ 00167 pSrc2 = pIn2 + firstIndex; 00168 py = pSrc2; 00169 00170 /* ------------------------ 00171 * Stage1 process 00172 * ----------------------*/ 00173 00174 /* The first stage starts here */ 00175 while(blockSize1 > 0) 00176 { 00177 /* Accumulator is made zero for every iteration */ 00178 sum = 0; 00179 00180 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00181 k = count >> 2u; 00182 00183 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00184 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00185 while(k > 0u) 00186 { 00187 /* x[0] , x[1] */ 00188 in1 = (q15_t) * px++; 00189 in2 = (q15_t) * px++; 00190 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00191 00192 /* y[srcBLen - 1] , y[srcBLen - 2] */ 00193 in1 = (q15_t) * py--; 00194 in2 = (q15_t) * py--; 00195 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00196 00197 /* x[0] * y[srcBLen - 1] */ 00198 /* x[1] * y[srcBLen - 2] */ 00199 sum = __SMLAD(input1, input2, sum); 00200 00201 /* x[2] , x[3] */ 00202 in1 = (q15_t) * px++; 00203 in2 = (q15_t) * px++; 00204 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00205 00206 /* y[srcBLen - 3] , y[srcBLen - 4] */ 00207 in1 = (q15_t) * py--; 00208 in2 = (q15_t) * py--; 00209 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00210 00211 /* x[2] * y[srcBLen - 3] */ 00212 /* x[3] * y[srcBLen - 4] */ 00213 sum = __SMLAD(input1, input2, sum); 00214 00215 /* Decrement the loop counter */ 00216 k--; 00217 } 00218 00219 /* If the count is not a multiple of 4, compute any remaining MACs here. 00220 ** No loop unrolling is used. */ 00221 k = count % 0x4u; 00222 00223 while(k > 0u) 00224 { 00225 /* Perform the multiply-accumulates */ 00226 sum += ((q31_t) * px++ * *py--); 00227 00228 /* Decrement the loop counter */ 00229 k--; 00230 } 00231 00232 /* Store the result in the accumulator in the destination buffer. */ 00233 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00234 00235 /* Update the inputA and inputB pointers for next MAC calculation */ 00236 py = ++pSrc2; 00237 px = pIn1; 00238 00239 /* Increment the MAC count */ 00240 count++; 00241 00242 /* Decrement the loop counter */ 00243 blockSize1--; 00244 } 00245 00246 /* -------------------------- 00247 * Initializations of stage2 00248 * ------------------------*/ 00249 00250 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00251 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00252 * .... 00253 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00254 */ 00255 00256 /* Working pointer of inputA */ 00257 px = pIn1; 00258 00259 /* Working pointer of inputB */ 00260 pSrc2 = pIn2 + (srcBLen - 1u); 00261 py = pSrc2; 00262 00263 /* count is index by which the pointer pIn1 to be incremented */ 00264 count = 1u; 00265 00266 /* ------------------- 00267 * Stage2 process 00268 * ------------------*/ 00269 00270 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00271 * So, to loop unroll over blockSize2, 00272 * srcBLen should be greater than or equal to 4 */ 00273 if(srcBLen >= 4u) 00274 { 00275 /* Loop unroll over blockSize2, by 4 */ 00276 blkCnt = ((uint32_t) blockSize2 >> 2u); 00277 00278 while(blkCnt > 0u) 00279 { 00280 /* Set all accumulators to zero */ 00281 acc0 = 0; 00282 acc1 = 0; 00283 acc2 = 0; 00284 acc3 = 0; 00285 00286 /* read x[0], x[1], x[2] samples */ 00287 x0 = *(px++); 00288 x1 = *(px++); 00289 x2 = *(px++); 00290 00291 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00292 k = srcBLen >> 2u; 00293 00294 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00295 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00296 do 00297 { 00298 /* Read y[srcBLen - 1] sample */ 00299 c0 = *(py--); 00300 /* Read y[srcBLen - 2] sample */ 00301 c1 = *(py--); 00302 00303 /* Read x[3] sample */ 00304 x3 = *(px++); 00305 00306 /* x[0] and x[1] are packed */ 00307 in1 = (q15_t) x0; 00308 in2 = (q15_t) x1; 00309 00310 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00311 00312 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */ 00313 in1 = (q15_t) c0; 00314 in2 = (q15_t) c1; 00315 00316 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00317 00318 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */ 00319 acc0 = __SMLAD(input1, input2, acc0); 00320 00321 /* x[1] and x[2] are packed */ 00322 in1 = (q15_t) x1; 00323 in2 = (q15_t) x2; 00324 00325 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00326 00327 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */ 00328 acc1 = __SMLAD(input1, input2, acc1); 00329 00330 /* x[2] and x[3] are packed */ 00331 in1 = (q15_t) x2; 00332 in2 = (q15_t) x3; 00333 00334 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00335 00336 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */ 00337 acc2 = __SMLAD(input1, input2, acc2); 00338 00339 /* Read x[4] sample */ 00340 x0 = *(px++); 00341 00342 /* x[3] and x[4] are packed */ 00343 in1 = (q15_t) x3; 00344 in2 = (q15_t) x0; 00345 00346 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00347 00348 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */ 00349 acc3 = __SMLAD(input1, input2, acc3); 00350 00351 /* Read y[srcBLen - 3] sample */ 00352 c0 = *(py--); 00353 /* Read y[srcBLen - 4] sample */ 00354 c1 = *(py--); 00355 00356 /* Read x[5] sample */ 00357 x1 = *(px++); 00358 00359 /* x[2] and x[3] are packed */ 00360 in1 = (q15_t) x2; 00361 in2 = (q15_t) x3; 00362 00363 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00364 00365 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */ 00366 in1 = (q15_t) c0; 00367 in2 = (q15_t) c1; 00368 00369 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00370 00371 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */ 00372 acc0 = __SMLAD(input1, input2, acc0); 00373 00374 /* x[3] and x[4] are packed */ 00375 in1 = (q15_t) x3; 00376 in2 = (q15_t) x0; 00377 00378 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00379 00380 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */ 00381 acc1 = __SMLAD(input1, input2, acc1); 00382 00383 /* x[4] and x[5] are packed */ 00384 in1 = (q15_t) x0; 00385 in2 = (q15_t) x1; 00386 00387 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00388 00389 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */ 00390 acc2 = __SMLAD(input1, input2, acc2); 00391 00392 /* Read x[6] sample */ 00393 x2 = *(px++); 00394 00395 /* x[5] and x[6] are packed */ 00396 in1 = (q15_t) x1; 00397 in2 = (q15_t) x2; 00398 00399 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00400 00401 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */ 00402 acc3 = __SMLAD(input1, input2, acc3); 00403 00404 } while(--k); 00405 00406 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00407 ** No loop unrolling is used. */ 00408 k = srcBLen % 0x4u; 00409 00410 while(k > 0u) 00411 { 00412 /* Read y[srcBLen - 5] sample */ 00413 c0 = *(py--); 00414 00415 /* Read x[7] sample */ 00416 x3 = *(px++); 00417 00418 /* Perform the multiply-accumulates */ 00419 /* acc0 += x[4] * y[srcBLen - 5] */ 00420 acc0 += ((q31_t) x0 * c0); 00421 /* acc1 += x[5] * y[srcBLen - 5] */ 00422 acc1 += ((q31_t) x1 * c0); 00423 /* acc2 += x[6] * y[srcBLen - 5] */ 00424 acc2 += ((q31_t) x2 * c0); 00425 /* acc3 += x[7] * y[srcBLen - 5] */ 00426 acc3 += ((q31_t) x3 * c0); 00427 00428 /* Reuse the present samples for the next MAC */ 00429 x0 = x1; 00430 x1 = x2; 00431 x2 = x3; 00432 00433 /* Decrement the loop counter */ 00434 k--; 00435 } 00436 00437 /* Store the result in the accumulator in the destination buffer. */ 00438 *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8)); 00439 *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8)); 00440 *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8)); 00441 *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8)); 00442 00443 /* Update the inputA and inputB pointers for next MAC calculation */ 00444 px = pIn1 + count * 4u; 00445 py = pSrc2; 00446 00447 /* Increment the pointer pIn1 index, count by 1 */ 00448 count++; 00449 00450 /* Decrement the loop counter */ 00451 blkCnt--; 00452 } 00453 00454 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00455 ** No loop unrolling is used. */ 00456 blkCnt = (uint32_t) blockSize2 % 0x4u; 00457 00458 while(blkCnt > 0u) 00459 { 00460 /* Accumulator is made zero for every iteration */ 00461 sum = 0; 00462 00463 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00464 k = srcBLen >> 2u; 00465 00466 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00467 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00468 while(k > 0u) 00469 { 00470 00471 /* Reading two inputs of SrcA buffer and packing */ 00472 in1 = (q15_t) * px++; 00473 in2 = (q15_t) * px++; 00474 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00475 00476 /* Reading two inputs of SrcB buffer and packing */ 00477 in1 = (q15_t) * py--; 00478 in2 = (q15_t) * py--; 00479 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00480 00481 /* Perform the multiply-accumulates */ 00482 sum = __SMLAD(input1, input2, sum); 00483 00484 /* Reading two inputs of SrcA buffer and packing */ 00485 in1 = (q15_t) * px++; 00486 in2 = (q15_t) * px++; 00487 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00488 00489 /* Reading two inputs of SrcB buffer and packing */ 00490 in1 = (q15_t) * py--; 00491 in2 = (q15_t) * py--; 00492 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00493 00494 /* Perform the multiply-accumulates */ 00495 sum = __SMLAD(input1, input2, sum); 00496 00497 /* Decrement the loop counter */ 00498 k--; 00499 } 00500 00501 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00502 ** No loop unrolling is used. */ 00503 k = srcBLen % 0x4u; 00504 00505 while(k > 0u) 00506 { 00507 /* Perform the multiply-accumulates */ 00508 sum += ((q31_t) * px++ * *py--); 00509 00510 /* Decrement the loop counter */ 00511 k--; 00512 } 00513 00514 /* Store the result in the accumulator in the destination buffer. */ 00515 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00516 00517 /* Update the inputA and inputB pointers for next MAC calculation */ 00518 px = pIn1 + count; 00519 py = pSrc2; 00520 00521 /* Increment the pointer pIn1 index, count by 1 */ 00522 count++; 00523 00524 /* Decrement the loop counter */ 00525 blkCnt--; 00526 } 00527 } 00528 else 00529 { 00530 /* If the srcBLen is not a multiple of 4, 00531 * the blockSize2 loop cannot be unrolled by 4 */ 00532 blkCnt = (uint32_t) blockSize2; 00533 00534 while(blkCnt > 0u) 00535 { 00536 /* Accumulator is made zero for every iteration */ 00537 sum = 0; 00538 00539 /* srcBLen number of MACS should be performed */ 00540 k = srcBLen; 00541 00542 while(k > 0u) 00543 { 00544 /* Perform the multiply-accumulate */ 00545 sum += ((q31_t) * px++ * *py--); 00546 00547 /* Decrement the loop counter */ 00548 k--; 00549 } 00550 00551 /* Store the result in the accumulator in the destination buffer. */ 00552 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00553 00554 /* Update the inputA and inputB pointers for next MAC calculation */ 00555 px = pIn1 + count; 00556 py = pSrc2; 00557 00558 /* Increment the MAC count */ 00559 count++; 00560 00561 /* Decrement the loop counter */ 00562 blkCnt--; 00563 } 00564 } 00565 00566 00567 /* -------------------------- 00568 * Initializations of stage3 00569 * -------------------------*/ 00570 00571 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00572 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00573 * .... 00574 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00575 * sum += x[srcALen-1] * y[srcBLen-1] 00576 */ 00577 00578 /* In this stage the MAC operations are decreased by 1 for every iteration. 00579 The count variable holds the number of MAC operations performed */ 00580 count = srcBLen - 1u; 00581 00582 /* Working pointer of inputA */ 00583 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00584 px = pSrc1; 00585 00586 /* Working pointer of inputB */ 00587 pSrc2 = pIn2 + (srcBLen - 1u); 00588 py = pSrc2; 00589 00590 /* ------------------- 00591 * Stage3 process 00592 * ------------------*/ 00593 00594 while(blockSize3 > 0) 00595 { 00596 /* Accumulator is made zero for every iteration */ 00597 sum = 0; 00598 00599 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00600 k = count >> 2u; 00601 00602 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00603 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00604 while(k > 0u) 00605 { 00606 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */ 00607 in1 = (q15_t) * px++; 00608 in2 = (q15_t) * px++; 00609 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00610 00611 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */ 00612 in1 = (q15_t) * py--; 00613 in2 = (q15_t) * py--; 00614 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00615 00616 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00617 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00618 sum = __SMLAD(input1, input2, sum); 00619 00620 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */ 00621 in1 = (q15_t) * px++; 00622 in2 = (q15_t) * px++; 00623 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00624 00625 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */ 00626 in1 = (q15_t) * py--; 00627 in2 = (q15_t) * py--; 00628 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16); 00629 00630 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00631 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00632 sum = __SMLAD(input1, input2, sum); 00633 00634 /* Decrement the loop counter */ 00635 k--; 00636 } 00637 00638 /* If the count is not a multiple of 4, compute any remaining MACs here. 00639 ** No loop unrolling is used. */ 00640 k = count % 0x4u; 00641 00642 while(k > 0u) 00643 { 00644 /* Perform the multiply-accumulates */ 00645 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00646 sum += ((q31_t) * px++ * *py--); 00647 00648 /* Decrement the loop counter */ 00649 k--; 00650 } 00651 00652 /* Store the result in the accumulator in the destination buffer. */ 00653 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8)); 00654 00655 /* Update the inputA and inputB pointers for next MAC calculation */ 00656 px = ++pSrc1; 00657 py = pSrc2; 00658 00659 /* Decrement the MAC count */ 00660 count--; 00661 00662 /* Decrement the loop counter */ 00663 blockSize3--; 00664 00665 } 00666 00667 /* set status as ARM_MATH_SUCCESS */ 00668 status = ARM_MATH_SUCCESS; 00669 } 00670 00671 /* Return to application */ 00672 return (status); 00673 00674 #else 00675 00676 /* Run the below code for Cortex-M0 */ 00677 00678 q7_t *pIn1 = pSrcA; /* inputA pointer */ 00679 q7_t *pIn2 = pSrcB; /* inputB pointer */ 00680 q31_t sum; /* Accumulator */ 00681 uint32_t i, j; /* loop counters */ 00682 arm_status status; /* status of Partial convolution */ 00683 00684 /* Check for range of output samples to be calculated */ 00685 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00686 { 00687 /* Set status as ARM_ARGUMENT_ERROR */ 00688 status = ARM_MATH_ARGUMENT_ERROR; 00689 } 00690 else 00691 { 00692 /* Loop to calculate convolution for output length number of values */ 00693 for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++) 00694 { 00695 /* Initialize sum with zero to carry on MAC operations */ 00696 sum = 0; 00697 00698 /* Loop to perform MAC operations according to convolution equation */ 00699 for (j = 0; j <= i; j++) 00700 { 00701 /* Check the array limitations */ 00702 if(((i - j) < srcBLen) && (j < srcALen)) 00703 { 00704 /* z[i] += x[i-j] * y[j] */ 00705 sum += ((q15_t) pIn1[j] * (pIn2[i - j])); 00706 } 00707 } 00708 00709 /* Store the output in the destination buffer */ 00710 pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u); 00711 } 00712 /* set status as ARM_SUCCESS as there are no argument errors */ 00713 status = ARM_MATH_SUCCESS; 00714 } 00715 return (status); 00716 00717 #endif /* #ifndef ARM_MATH_CM0 */ 00718 00719 } 00720