CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details
Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more
arm_correlate_q15.c
00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_q15.c 00009 * 00010 * Description: Q15 Correlation. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00033 /** 00034 * @ingroup groupFilters 00035 */ 00036 00037 /** 00038 * @addtogroup Corr 00039 * @{ 00040 */ 00041 00042 /** 00043 * @brief Correlation of Q15 sequences 00044 * @param[in] *pSrcA points to the first input sequence. 00045 * @param[in] srcALen length of the first input sequence. 00046 * @param[in] *pSrcB points to the second input sequence. 00047 * @param[in] srcBLen length of the second input sequence. 00048 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1. 00049 * @return none. 00050 * 00051 * @details 00052 * <b>Scaling and Overflow Behavior:</b> 00053 * 00054 * \par 00055 * The function is implemented using a 64-bit internal accumulator. 00056 * Both inputs are in 1.15 format and multiplications yield a 2.30 result. 00057 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format. 00058 * This approach provides 33 guard bits and there is no risk of overflow. 00059 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format. 00060 * 00061 * \par 00062 * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function. 00063 */ 00064 00065 void arm_correlate_q15( 00066 q15_t * pSrcA, 00067 uint32_t srcALen, 00068 q15_t * pSrcB, 00069 uint32_t srcBLen, 00070 q15_t * pDst) 00071 { 00072 q15_t *pIn1; /* inputA pointer */ 00073 q15_t *pIn2; /* inputB pointer */ 00074 q15_t *pOut = pDst; /* output pointer */ 00075 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00076 q15_t *px; /* Intermediate inputA pointer */ 00077 q15_t *py; /* Intermediate inputB pointer */ 00078 q15_t *pSrc1; /* Intermediate pointers */ 00079 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00080 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00081 int32_t inc = 1; /* Destination address modifier */ 00082 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00083 00084 00085 /* The algorithm implementation is based on the lengths of the inputs. */ 00086 /* srcB is always made to slide across srcA. */ 00087 /* So srcBLen is always considered as shorter or equal to srcALen */ 00088 /* But CORR(x, y) is reverse of CORR(y, x) */ 00089 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00090 /* and the destination pointer modifier, inc is set to -1 */ 00091 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00092 /* But to improve the performance, 00093 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00094 /* If srcALen > srcBLen, 00095 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00096 /* If srcALen < srcBLen, 00097 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00098 if(srcALen >= srcBLen) 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = (pSrcA); 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = (pSrcB); 00105 00106 /* Number of output samples is calculated */ 00107 outBlockSize = (2u * srcALen) - 1u; 00108 00109 /* When srcALen > srcBLen, zero padding is done to srcB 00110 * to make their lengths equal. 00111 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00112 * number of output samples are made zero */ 00113 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00114 00115 while(j > 0u) 00116 { 00117 /* Zero is stored in the destination buffer */ 00118 *pOut++ = 0; 00119 00120 /* Decrement the loop counter */ 00121 j--; 00122 } 00123 00124 } 00125 else 00126 { 00127 /* Initialization of inputA pointer */ 00128 pIn1 = (pSrcB); 00129 00130 /* Initialization of inputB pointer */ 00131 pIn2 = (pSrcA); 00132 00133 /* srcBLen is always considered as shorter or equal to srcALen */ 00134 j = srcBLen; 00135 srcBLen = srcALen; 00136 srcALen = j; 00137 00138 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00139 /* Hence set the destination pointer to point to the last output sample */ 00140 pOut = pDst + ((srcALen + srcBLen) - 2u); 00141 00142 /* Destination address modifier is set to -1 */ 00143 inc = -1; 00144 00145 } 00146 00147 /* The function is internally 00148 * divided into three parts according to the number of multiplications that has to be 00149 * taken place between inputA samples and inputB samples. In the first part of the 00150 * algorithm, the multiplications increase by one for every iteration. 00151 * In the second part of the algorithm, srcBLen number of multiplications are done. 00152 * In the third part of the algorithm, the multiplications decrease by one 00153 * for every iteration.*/ 00154 /* The algorithm is implemented in three stages. 00155 * The loop counters of each stage is initiated here. */ 00156 blockSize1 = srcBLen - 1u; 00157 blockSize2 = srcALen - (srcBLen - 1u); 00158 blockSize3 = blockSize1; 00159 00160 /* -------------------------- 00161 * Initializations of stage1 00162 * -------------------------*/ 00163 00164 /* sum = x[0] * y[srcBlen - 1] 00165 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00166 * .... 00167 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00168 */ 00169 00170 /* In this stage the MAC operations are increased by 1 for every iteration. 00171 The count variable holds the number of MAC operations performed */ 00172 count = 1u; 00173 00174 /* Working pointer of inputA */ 00175 px = pIn1; 00176 00177 /* Working pointer of inputB */ 00178 pSrc1 = pIn2 + (srcBLen - 1u); 00179 py = pSrc1; 00180 00181 /* ------------------------ 00182 * Stage1 process 00183 * ----------------------*/ 00184 00185 /* The first loop starts here */ 00186 while(blockSize1 > 0u) 00187 { 00188 /* Accumulator is made zero for every iteration */ 00189 sum = 0; 00190 00191 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00192 k = count >> 2; 00193 00194 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00195 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00196 while(k > 0u) 00197 { 00198 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ 00199 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00200 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ 00201 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00202 00203 /* Decrement the loop counter */ 00204 k--; 00205 } 00206 00207 /* If the count is not a multiple of 4, compute any remaining MACs here. 00208 ** No loop unrolling is used. */ 00209 k = count % 0x4u; 00210 00211 while(k > 0u) 00212 { 00213 /* Perform the multiply-accumulates */ 00214 /* x[0] * y[srcBLen - 1] */ 00215 sum = __SMLALD(*px++, *py++, sum); 00216 00217 /* Decrement the loop counter */ 00218 k--; 00219 } 00220 00221 /* Store the result in the accumulator in the destination buffer. */ 00222 *pOut = (q15_t) (__SSAT((sum >> 15), 16)); 00223 /* Destination pointer is updated according to the address modifier, inc */ 00224 pOut += inc; 00225 00226 /* Update the inputA and inputB pointers for next MAC calculation */ 00227 py = pSrc1 - count; 00228 px = pIn1; 00229 00230 /* Increment the MAC count */ 00231 count++; 00232 00233 /* Decrement the loop counter */ 00234 blockSize1--; 00235 } 00236 00237 /* -------------------------- 00238 * Initializations of stage2 00239 * ------------------------*/ 00240 00241 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00242 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00243 * .... 00244 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00245 */ 00246 00247 /* Working pointer of inputA */ 00248 px = pIn1; 00249 00250 /* Working pointer of inputB */ 00251 py = pIn2; 00252 00253 /* Initialize inputB pointer of type q31 */ 00254 pb = (q31_t *) (py); 00255 00256 /* count is index by which the pointer pIn1 to be incremented */ 00257 count = 0u; 00258 00259 /* ------------------- 00260 * Stage2 process 00261 * ------------------*/ 00262 00263 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00264 * So, to loop unroll over blockSize2, 00265 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 00266 if(srcBLen >= 4u) 00267 { 00268 /* Loop unroll over blockSize2, by 4 */ 00269 blkCnt = blockSize2 >> 2u; 00270 00271 while(blkCnt > 0u) 00272 { 00273 /* Set all accumulators to zero */ 00274 acc0 = 0; 00275 acc1 = 0; 00276 acc2 = 0; 00277 acc3 = 0; 00278 00279 /* read x[0], x[1] samples */ 00280 x0 = *(q31_t *) (px++); 00281 /* read x[1], x[2] samples */ 00282 x1 = *(q31_t *) (px++); 00283 00284 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00285 k = srcBLen >> 2u; 00286 00287 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00288 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00289 do 00290 { 00291 /* Read the first two inputB samples using SIMD: 00292 * y[0] and y[1] */ 00293 c0 = *(pb++); 00294 00295 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00296 acc0 = __SMLALD(x0, c0, acc0); 00297 00298 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00299 acc1 = __SMLALD(x1, c0, acc1); 00300 00301 /* Read x[2], x[3] */ 00302 x2 = *(q31_t *) (px++); 00303 00304 /* Read x[3], x[4] */ 00305 x3 = *(q31_t *) (px++); 00306 00307 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00308 acc2 = __SMLALD(x2, c0, acc2); 00309 00310 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00311 acc3 = __SMLALD(x3, c0, acc3); 00312 00313 /* Read y[2] and y[3] */ 00314 c0 = *(pb++); 00315 00316 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00317 acc0 = __SMLALD(x2, c0, acc0); 00318 00319 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00320 acc1 = __SMLALD(x3, c0, acc1); 00321 00322 /* Read x[4], x[5] */ 00323 x0 = *(q31_t *) (px++); 00324 00325 /* Read x[5], x[6] */ 00326 x1 = *(q31_t *) (px++); 00327 00328 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00329 acc2 = __SMLALD(x0, c0, acc2); 00330 00331 /* acc3 += x[5] * y[2] + x[6] * y[3] */ 00332 acc3 = __SMLALD(x1, c0, acc3); 00333 00334 } while(--k); 00335 00336 /* For the next MAC operations, SIMD is not used 00337 * So, the 16 bit pointer if inputB, py is updated */ 00338 py = (q15_t *) (pb); 00339 00340 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00341 ** No loop unrolling is used. */ 00342 k = srcBLen % 0x4u; 00343 00344 if(k == 1u) 00345 { 00346 /* Read y[4] */ 00347 c0 = *py; 00348 c0 = c0 & 0x0000FFFF; 00349 00350 /* Read x[7] */ 00351 x3 = *(q31_t *) px++; 00352 00353 /* Perform the multiply-accumulates */ 00354 acc0 = __SMLALD(x0, c0, acc0); 00355 acc1 = __SMLALD(x1, c0, acc1); 00356 acc2 = __SMLALDX(x1, c0, acc2); 00357 acc3 = __SMLALDX(x3, c0, acc3); 00358 } 00359 00360 if(k == 2u) 00361 { 00362 /* Read y[4], y[5] */ 00363 c0 = *(pb); 00364 00365 /* Read x[7], x[8] */ 00366 x3 = *(q31_t *) px++; 00367 00368 /* Read x[9] */ 00369 x2 = *(q31_t *) px++; 00370 00371 /* Perform the multiply-accumulates */ 00372 acc0 = __SMLALD(x0, c0, acc0); 00373 acc1 = __SMLALD(x1, c0, acc1); 00374 acc2 = __SMLALD(x3, c0, acc2); 00375 acc3 = __SMLALD(x2, c0, acc3); 00376 } 00377 00378 if(k == 3u) 00379 { 00380 /* Read y[4], y[5] */ 00381 c0 = *pb++; 00382 00383 /* Read x[7], x[8] */ 00384 x3 = *(q31_t *) px++; 00385 00386 /* Read x[9] */ 00387 x2 = *(q31_t *) px++; 00388 00389 /* Perform the multiply-accumulates */ 00390 acc0 = __SMLALD(x0, c0, acc0); 00391 acc1 = __SMLALD(x1, c0, acc1); 00392 acc2 = __SMLALD(x3, c0, acc2); 00393 acc3 = __SMLALD(x2, c0, acc3); 00394 00395 /* Read y[6] */ 00396 c0 = (q15_t) (*pb); 00397 c0 = c0 & 0x0000FFFF; 00398 00399 /* Read x[10] */ 00400 x3 = *(q31_t *) px++; 00401 00402 /* Perform the multiply-accumulates */ 00403 acc0 = __SMLALDX(x1, c0, acc0); 00404 acc1 = __SMLALD(x2, c0, acc1); 00405 acc2 = __SMLALDX(x2, c0, acc2); 00406 acc3 = __SMLALDX(x3, c0, acc3); 00407 } 00408 00409 /* Store the result in the accumulator in the destination buffer. */ 00410 *pOut = (q15_t) (__SSAT(acc0 >> 15, 16)); 00411 /* Destination pointer is updated according to the address modifier, inc */ 00412 pOut += inc; 00413 00414 *pOut = (q15_t) (__SSAT(acc1 >> 15, 16)); 00415 pOut += inc; 00416 00417 *pOut = (q15_t) (__SSAT(acc2 >> 15, 16)); 00418 pOut += inc; 00419 00420 *pOut = (q15_t) (__SSAT(acc3 >> 15, 16)); 00421 pOut += inc; 00422 00423 /* Increment the count by 4 as 4 output values are computed */ 00424 count += 4u; 00425 00426 /* Update the inputA and inputB pointers for next MAC calculation */ 00427 px = pIn1 + count; 00428 py = pIn2; 00429 pb = (q31_t *) (py); 00430 00431 00432 /* Decrement the loop counter */ 00433 blkCnt--; 00434 } 00435 00436 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00437 ** No loop unrolling is used. */ 00438 blkCnt = blockSize2 % 0x4u; 00439 00440 while(blkCnt > 0u) 00441 { 00442 /* Accumulator is made zero for every iteration */ 00443 sum = 0; 00444 00445 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00446 k = srcBLen >> 2u; 00447 00448 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00449 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00450 while(k > 0u) 00451 { 00452 /* Perform the multiply-accumulates */ 00453 sum += ((q63_t) * px++ * *py++); 00454 sum += ((q63_t) * px++ * *py++); 00455 sum += ((q63_t) * px++ * *py++); 00456 sum += ((q63_t) * px++ * *py++); 00457 00458 /* Decrement the loop counter */ 00459 k--; 00460 } 00461 00462 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00463 ** No loop unrolling is used. */ 00464 k = srcBLen % 0x4u; 00465 00466 while(k > 0u) 00467 { 00468 /* Perform the multiply-accumulates */ 00469 sum += ((q63_t) * px++ * *py++); 00470 00471 /* Decrement the loop counter */ 00472 k--; 00473 } 00474 00475 /* Store the result in the accumulator in the destination buffer. */ 00476 *pOut = (q15_t) (__SSAT(sum >> 15, 16)); 00477 /* Destination pointer is updated according to the address modifier, inc */ 00478 pOut += inc; 00479 00480 /* Increment count by 1, as one output value is computed */ 00481 count++; 00482 00483 /* Update the inputA and inputB pointers for next MAC calculation */ 00484 px = pIn1 + count; 00485 py = pIn2; 00486 00487 /* Decrement the loop counter */ 00488 blkCnt--; 00489 } 00490 } 00491 else 00492 { 00493 /* If the srcBLen is not a multiple of 4, 00494 * the blockSize2 loop cannot be unrolled by 4 */ 00495 blkCnt = blockSize2; 00496 00497 while(blkCnt > 0u) 00498 { 00499 /* Accumulator is made zero for every iteration */ 00500 sum = 0; 00501 00502 /* Loop over srcBLen */ 00503 k = srcBLen; 00504 00505 while(k > 0u) 00506 { 00507 /* Perform the multiply-accumulate */ 00508 sum += ((q63_t) * px++ * *py++); 00509 00510 /* Decrement the loop counter */ 00511 k--; 00512 } 00513 00514 /* Store the result in the accumulator in the destination buffer. */ 00515 *pOut = (q15_t) (__SSAT(sum >> 15, 16)); 00516 /* Destination pointer is updated according to the address modifier, inc */ 00517 pOut += inc; 00518 00519 /* Increment the MAC count */ 00520 count++; 00521 00522 /* Update the inputA and inputB pointers for next MAC calculation */ 00523 px = pIn1 + count; 00524 py = pIn2; 00525 00526 /* Decrement the loop counter */ 00527 blkCnt--; 00528 } 00529 } 00530 00531 /* -------------------------- 00532 * Initializations of stage3 00533 * -------------------------*/ 00534 00535 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00536 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00537 * .... 00538 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00539 * sum += x[srcALen-1] * y[0] 00540 */ 00541 00542 /* In this stage the MAC operations are decreased by 1 for every iteration. 00543 The count variable holds the number of MAC operations performed */ 00544 count = srcBLen - 1u; 00545 00546 /* Working pointer of inputA */ 00547 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00548 px = pSrc1; 00549 00550 /* Working pointer of inputB */ 00551 py = pIn2; 00552 00553 /* ------------------- 00554 * Stage3 process 00555 * ------------------*/ 00556 00557 while(blockSize3 > 0u) 00558 { 00559 /* Accumulator is made zero for every iteration */ 00560 sum = 0; 00561 00562 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00563 k = count >> 2u; 00564 00565 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00566 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00567 while(k > 0u) 00568 { 00569 /* Perform the multiply-accumulates */ 00570 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ 00571 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00572 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ 00573 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00574 00575 /* Decrement the loop counter */ 00576 k--; 00577 } 00578 00579 /* If the count is not a multiple of 4, compute any remaining MACs here. 00580 ** No loop unrolling is used. */ 00581 k = count % 0x4u; 00582 00583 while(k > 0u) 00584 { 00585 /* Perform the multiply-accumulates */ 00586 sum = __SMLALD(*px++, *py++, sum); 00587 00588 /* Decrement the loop counter */ 00589 k--; 00590 } 00591 00592 /* Store the result in the accumulator in the destination buffer. */ 00593 *pOut = (q15_t) (__SSAT((sum >> 15), 16)); 00594 /* Destination pointer is updated according to the address modifier, inc */ 00595 pOut += inc; 00596 00597 /* Update the inputA and inputB pointers for next MAC calculation */ 00598 px = ++pSrc1; 00599 py = pIn2; 00600 00601 /* Decrement the MAC count */ 00602 count--; 00603 00604 /* Decrement the loop counter */ 00605 blockSize3--; 00606 } 00607 00608 } 00609 00610 /** 00611 * @} end of Corr group 00612 */
Generated on Tue Jul 12 2022 14:13:52 by 1.7.2