CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Committer:
simon
Date:
Thu Mar 10 15:07:50 2011 +0000
Revision:
0:1014af42efd9

        

Who changed what in which revision?

User | Revision | Line number | New contents of line
simon 0:1014af42efd9 1 /* ----------------------------------------------------------------------
simon 0:1014af42efd9 2 * Copyright (C) 2010 ARM Limited. All rights reserved.
simon 0:1014af42efd9 3 *
simon 0:1014af42efd9 4 * $Date: 29. November 2010
simon 0:1014af42efd9 5 * $Revision: V1.0.3
simon 0:1014af42efd9 6 *
simon 0:1014af42efd9 7 * Project: CMSIS DSP Library
simon 0:1014af42efd9 8 * Title: arm_correlate_q15.c
simon 0:1014af42efd9 9 *
simon 0:1014af42efd9 10 * Description: Q15 Correlation.
simon 0:1014af42efd9 11 *
simon 0:1014af42efd9 12 * Target Processor: Cortex-M4/Cortex-M3
simon 0:1014af42efd9 13 *
simon 0:1014af42efd9 14 * Version 1.0.3 2010/11/29
simon 0:1014af42efd9 15 * Re-organized the CMSIS folders and updated documentation.
simon 0:1014af42efd9 16 *
simon 0:1014af42efd9 17 * Version 1.0.2 2010/11/11
simon 0:1014af42efd9 18 * Documentation updated.
simon 0:1014af42efd9 19 *
simon 0:1014af42efd9 20 * Version 1.0.1 2010/10/05
simon 0:1014af42efd9 21 * Production release and review comments incorporated.
simon 0:1014af42efd9 22 *
simon 0:1014af42efd9 23 * Version 1.0.0 2010/09/20
simon 0:1014af42efd9 24 * Production release and review comments incorporated
simon 0:1014af42efd9 25 *
simon 0:1014af42efd9 26 * Version 0.0.7 2010/06/10
simon 0:1014af42efd9 27 * Misra-C changes done
simon 0:1014af42efd9 28 *
simon 0:1014af42efd9 29 * -------------------------------------------------------------------- */
simon 0:1014af42efd9 30
simon 0:1014af42efd9 31 #include "arm_math.h"
simon 0:1014af42efd9 32
simon 0:1014af42efd9 33 /**
simon 0:1014af42efd9 34 * @ingroup groupFilters
simon 0:1014af42efd9 35 */
simon 0:1014af42efd9 36
simon 0:1014af42efd9 37 /**
simon 0:1014af42efd9 38 * @addtogroup Corr
simon 0:1014af42efd9 39 * @{
simon 0:1014af42efd9 40 */
simon 0:1014af42efd9 41
simon 0:1014af42efd9 42 /**
simon 0:1014af42efd9 43 * @brief Correlation of Q15 sequences
simon 0:1014af42efd9 44 * @param[in] *pSrcA points to the first input sequence.
simon 0:1014af42efd9 45 * @param[in] srcALen length of the first input sequence.
simon 0:1014af42efd9 46 * @param[in] *pSrcB points to the second input sequence.
simon 0:1014af42efd9 47 * @param[in] srcBLen length of the second input sequence.
simon 0:1014af42efd9 48 * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
simon 0:1014af42efd9 49 * @return none.
simon 0:1014af42efd9 50 *
simon 0:1014af42efd9 51 * @details
simon 0:1014af42efd9 52 * <b>Scaling and Overflow Behavior:</b>
simon 0:1014af42efd9 53 *
simon 0:1014af42efd9 54 * \par
simon 0:1014af42efd9 55 * The function is implemented using a 64-bit internal accumulator.
simon 0:1014af42efd9 56 * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
simon 0:1014af42efd9 57 * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
simon 0:1014af42efd9 58 * This approach provides 33 guard bits and there is no risk of overflow.
simon 0:1014af42efd9 59 * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
simon 0:1014af42efd9 60 *
simon 0:1014af42efd9 61 * \par
simon 0:1014af42efd9 62 * Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function.
simon 0:1014af42efd9 63 */
simon 0:1014af42efd9 64
simon 0:1014af42efd9 65 void arm_correlate_q15(
simon 0:1014af42efd9 66 q15_t * pSrcA,
simon 0:1014af42efd9 67 uint32_t srcALen,
simon 0:1014af42efd9 68 q15_t * pSrcB,
simon 0:1014af42efd9 69 uint32_t srcBLen,
simon 0:1014af42efd9 70 q15_t * pDst)
simon 0:1014af42efd9 71 {
simon 0:1014af42efd9 72 q15_t *pIn1; /* inputA pointer */
simon 0:1014af42efd9 73 q15_t *pIn2; /* inputB pointer */
simon 0:1014af42efd9 74 q15_t *pOut = pDst; /* output pointer */
simon 0:1014af42efd9 75 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
simon 0:1014af42efd9 76 q15_t *px; /* Intermediate inputA pointer */
simon 0:1014af42efd9 77 q15_t *py; /* Intermediate inputB pointer */
simon 0:1014af42efd9 78 q15_t *pSrc1; /* Intermediate pointers */
simon 0:1014af42efd9 79 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
simon 0:1014af42efd9 80 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
simon 0:1014af42efd9 81 int32_t inc = 1; /* Destination address modifier */
simon 0:1014af42efd9 82 q31_t *pb; /* 32 bit pointer for inputB buffer */
simon 0:1014af42efd9 83
simon 0:1014af42efd9 84
simon 0:1014af42efd9 85 /* The algorithm implementation is based on the lengths of the inputs. */
simon 0:1014af42efd9 86 /* srcB is always made to slide across srcA. */
simon 0:1014af42efd9 87 /* So srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 88 /* But CORR(x, y) is reverse of CORR(y, x) */
simon 0:1014af42efd9 89 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
simon 0:1014af42efd9 90 /* and the destination pointer modifier, inc is set to -1 */
simon 0:1014af42efd9 91 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
simon 0:1014af42efd9 92 /* But to improve the performance,
simon 0:1014af42efd9 93 * we include zeroes in the output instead of zero padding either of the the inputs*/
simon 0:1014af42efd9 94 /* If srcALen > srcBLen,
simon 0:1014af42efd9 95 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
simon 0:1014af42efd9 96 /* If srcALen < srcBLen,
simon 0:1014af42efd9 97 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
simon 0:1014af42efd9 98 if(srcALen >= srcBLen)
simon 0:1014af42efd9 99 {
simon 0:1014af42efd9 100 /* Initialization of inputA pointer */
simon 0:1014af42efd9 101 pIn1 = (pSrcA);
simon 0:1014af42efd9 102
simon 0:1014af42efd9 103 /* Initialization of inputB pointer */
simon 0:1014af42efd9 104 pIn2 = (pSrcB);
simon 0:1014af42efd9 105
simon 0:1014af42efd9 106 /* Number of output samples is calculated */
simon 0:1014af42efd9 107 outBlockSize = (2u * srcALen) - 1u;
simon 0:1014af42efd9 108
simon 0:1014af42efd9 109 /* When srcALen > srcBLen, zero padding is done to srcB
simon 0:1014af42efd9 110 * to make their lengths equal.
simon 0:1014af42efd9 111 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
simon 0:1014af42efd9 112 * number of output samples are made zero */
simon 0:1014af42efd9 113 j = outBlockSize - (srcALen + (srcBLen - 1u));
simon 0:1014af42efd9 114
simon 0:1014af42efd9 115 while(j > 0u)
simon 0:1014af42efd9 116 {
simon 0:1014af42efd9 117 /* Zero is stored in the destination buffer */
simon 0:1014af42efd9 118 *pOut++ = 0;
simon 0:1014af42efd9 119
simon 0:1014af42efd9 120 /* Decrement the loop counter */
simon 0:1014af42efd9 121 j--;
simon 0:1014af42efd9 122 }
simon 0:1014af42efd9 123
simon 0:1014af42efd9 124 }
simon 0:1014af42efd9 125 else
simon 0:1014af42efd9 126 {
simon 0:1014af42efd9 127 /* Initialization of inputA pointer */
simon 0:1014af42efd9 128 pIn1 = (pSrcB);
simon 0:1014af42efd9 129
simon 0:1014af42efd9 130 /* Initialization of inputB pointer */
simon 0:1014af42efd9 131 pIn2 = (pSrcA);
simon 0:1014af42efd9 132
simon 0:1014af42efd9 133 /* srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 134 j = srcBLen;
simon 0:1014af42efd9 135 srcBLen = srcALen;
simon 0:1014af42efd9 136 srcALen = j;
simon 0:1014af42efd9 137
simon 0:1014af42efd9 138 /* CORR(x, y) = Reverse order(CORR(y, x)) */
simon 0:1014af42efd9 139 /* Hence set the destination pointer to point to the last output sample */
simon 0:1014af42efd9 140 pOut = pDst + ((srcALen + srcBLen) - 2u);
simon 0:1014af42efd9 141
simon 0:1014af42efd9 142 /* Destination address modifier is set to -1 */
simon 0:1014af42efd9 143 inc = -1;
simon 0:1014af42efd9 144
simon 0:1014af42efd9 145 }
simon 0:1014af42efd9 146
simon 0:1014af42efd9 147 /* The function is internally
simon 0:1014af42efd9 148 * divided into three parts according to the number of multiplications that has to be
simon 0:1014af42efd9 149 * taken place between inputA samples and inputB samples. In the first part of the
simon 0:1014af42efd9 150 * algorithm, the multiplications increase by one for every iteration.
simon 0:1014af42efd9 151 * In the second part of the algorithm, srcBLen number of multiplications are done.
simon 0:1014af42efd9 152 * In the third part of the algorithm, the multiplications decrease by one
simon 0:1014af42efd9 153 * for every iteration.*/
simon 0:1014af42efd9 154 /* The algorithm is implemented in three stages.
simon 0:1014af42efd9 155 * The loop counters of each stage is initiated here. */
simon 0:1014af42efd9 156 blockSize1 = srcBLen - 1u;
simon 0:1014af42efd9 157 blockSize2 = srcALen - (srcBLen - 1u);
simon 0:1014af42efd9 158 blockSize3 = blockSize1;
simon 0:1014af42efd9 159
simon 0:1014af42efd9 160 /* --------------------------
simon 0:1014af42efd9 161 * Initializations of stage1
simon 0:1014af42efd9 162 * -------------------------*/
simon 0:1014af42efd9 163
simon 0:1014af42efd9 164 /* sum = x[0] * y[srcBlen - 1]
simon 0:1014af42efd9 165 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
simon 0:1014af42efd9 166 * ....
simon 0:1014af42efd9 167 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
simon 0:1014af42efd9 168 */
simon 0:1014af42efd9 169
simon 0:1014af42efd9 170 /* In this stage the MAC operations are increased by 1 for every iteration.
simon 0:1014af42efd9 171 The count variable holds the number of MAC operations performed */
simon 0:1014af42efd9 172 count = 1u;
simon 0:1014af42efd9 173
simon 0:1014af42efd9 174 /* Working pointer of inputA */
simon 0:1014af42efd9 175 px = pIn1;
simon 0:1014af42efd9 176
simon 0:1014af42efd9 177 /* Working pointer of inputB */
simon 0:1014af42efd9 178 pSrc1 = pIn2 + (srcBLen - 1u);
simon 0:1014af42efd9 179 py = pSrc1;
simon 0:1014af42efd9 180
simon 0:1014af42efd9 181 /* ------------------------
simon 0:1014af42efd9 182 * Stage1 process
simon 0:1014af42efd9 183 * ----------------------*/
simon 0:1014af42efd9 184
simon 0:1014af42efd9 185 /* The first loop starts here */
simon 0:1014af42efd9 186 while(blockSize1 > 0u)
simon 0:1014af42efd9 187 {
simon 0:1014af42efd9 188 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 189 sum = 0;
simon 0:1014af42efd9 190
simon 0:1014af42efd9 191 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 192 k = count >> 2;
simon 0:1014af42efd9 193
simon 0:1014af42efd9 194 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 195 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 196 while(k > 0u)
simon 0:1014af42efd9 197 {
simon 0:1014af42efd9 198 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
simon 0:1014af42efd9 199 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon 0:1014af42efd9 200 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
simon 0:1014af42efd9 201 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon 0:1014af42efd9 202
simon 0:1014af42efd9 203 /* Decrement the loop counter */
simon 0:1014af42efd9 204 k--;
simon 0:1014af42efd9 205 }
simon 0:1014af42efd9 206
simon 0:1014af42efd9 207 /* If the count is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 208 ** No loop unrolling is used. */
simon 0:1014af42efd9 209 k = count % 0x4u;
simon 0:1014af42efd9 210
simon 0:1014af42efd9 211 while(k > 0u)
simon 0:1014af42efd9 212 {
simon 0:1014af42efd9 213 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 214 /* x[0] * y[srcBLen - 1] */
simon 0:1014af42efd9 215 sum = __SMLALD(*px++, *py++, sum);
simon 0:1014af42efd9 216
simon 0:1014af42efd9 217 /* Decrement the loop counter */
simon 0:1014af42efd9 218 k--;
simon 0:1014af42efd9 219 }
simon 0:1014af42efd9 220
simon 0:1014af42efd9 221 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 222 *pOut = (q15_t) (__SSAT((sum >> 15), 16));
simon 0:1014af42efd9 223 /* Destination pointer is updated according to the address modifier, inc */
simon 0:1014af42efd9 224 pOut += inc;
simon 0:1014af42efd9 225
simon 0:1014af42efd9 226 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 227 py = pSrc1 - count;
simon 0:1014af42efd9 228 px = pIn1;
simon 0:1014af42efd9 229
simon 0:1014af42efd9 230 /* Increment the MAC count */
simon 0:1014af42efd9 231 count++;
simon 0:1014af42efd9 232
simon 0:1014af42efd9 233 /* Decrement the loop counter */
simon 0:1014af42efd9 234 blockSize1--;
simon 0:1014af42efd9 235 }
simon 0:1014af42efd9 236
simon 0:1014af42efd9 237 /* --------------------------
simon 0:1014af42efd9 238 * Initializations of stage2
simon 0:1014af42efd9 239 * ------------------------*/
simon 0:1014af42efd9 240
simon 0:1014af42efd9 241 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
simon 0:1014af42efd9 242 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
simon 0:1014af42efd9 243 * ....
simon 0:1014af42efd9 244 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
simon 0:1014af42efd9 245 */
simon 0:1014af42efd9 246
simon 0:1014af42efd9 247 /* Working pointer of inputA */
simon 0:1014af42efd9 248 px = pIn1;
simon 0:1014af42efd9 249
simon 0:1014af42efd9 250 /* Working pointer of inputB */
simon 0:1014af42efd9 251 py = pIn2;
simon 0:1014af42efd9 252
simon 0:1014af42efd9 253 /* Initialize inputB pointer of type q31 */
simon 0:1014af42efd9 254 pb = (q31_t *) (py);
simon 0:1014af42efd9 255
simon 0:1014af42efd9 256 /* count is index by which the pointer pIn1 to be incremented */
simon 0:1014af42efd9 257 count = 0u;
simon 0:1014af42efd9 258
simon 0:1014af42efd9 259 /* -------------------
simon 0:1014af42efd9 260 * Stage2 process
simon 0:1014af42efd9 261 * ------------------*/
simon 0:1014af42efd9 262
simon 0:1014af42efd9 263 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon 0:1014af42efd9 264 * So, to loop unroll over blockSize2,
simon 0:1014af42efd9 265 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
simon 0:1014af42efd9 266 if(srcBLen >= 4u)
simon 0:1014af42efd9 267 {
simon 0:1014af42efd9 268 /* Loop unroll over blockSize2, by 4 */
simon 0:1014af42efd9 269 blkCnt = blockSize2 >> 2u;
simon 0:1014af42efd9 270
simon 0:1014af42efd9 271 while(blkCnt > 0u)
simon 0:1014af42efd9 272 {
simon 0:1014af42efd9 273 /* Set all accumulators to zero */
simon 0:1014af42efd9 274 acc0 = 0;
simon 0:1014af42efd9 275 acc1 = 0;
simon 0:1014af42efd9 276 acc2 = 0;
simon 0:1014af42efd9 277 acc3 = 0;
simon 0:1014af42efd9 278
simon 0:1014af42efd9 279 /* read x[0], x[1] samples */
simon 0:1014af42efd9 280 x0 = *(q31_t *) (px++);
simon 0:1014af42efd9 281 /* read x[1], x[2] samples */
simon 0:1014af42efd9 282 x1 = *(q31_t *) (px++);
simon 0:1014af42efd9 283
simon 0:1014af42efd9 284 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 285 k = srcBLen >> 2u;
simon 0:1014af42efd9 286
simon 0:1014af42efd9 287 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 288 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 289 do
simon 0:1014af42efd9 290 {
simon 0:1014af42efd9 291 /* Read the first two inputB samples using SIMD:
simon 0:1014af42efd9 292 * y[0] and y[1] */
simon 0:1014af42efd9 293 c0 = *(pb++);
simon 0:1014af42efd9 294
simon 0:1014af42efd9 295 /* acc0 += x[0] * y[0] + x[1] * y[1] */
simon 0:1014af42efd9 296 acc0 = __SMLALD(x0, c0, acc0);
simon 0:1014af42efd9 297
simon 0:1014af42efd9 298 /* acc1 += x[1] * y[0] + x[2] * y[1] */
simon 0:1014af42efd9 299 acc1 = __SMLALD(x1, c0, acc1);
simon 0:1014af42efd9 300
simon 0:1014af42efd9 301 /* Read x[2], x[3] */
simon 0:1014af42efd9 302 x2 = *(q31_t *) (px++);
simon 0:1014af42efd9 303
simon 0:1014af42efd9 304 /* Read x[3], x[4] */
simon 0:1014af42efd9 305 x3 = *(q31_t *) (px++);
simon 0:1014af42efd9 306
simon 0:1014af42efd9 307 /* acc2 += x[2] * y[0] + x[3] * y[1] */
simon 0:1014af42efd9 308 acc2 = __SMLALD(x2, c0, acc2);
simon 0:1014af42efd9 309
simon 0:1014af42efd9 310 /* acc3 += x[3] * y[0] + x[4] * y[1] */
simon 0:1014af42efd9 311 acc3 = __SMLALD(x3, c0, acc3);
simon 0:1014af42efd9 312
simon 0:1014af42efd9 313 /* Read y[2] and y[3] */
simon 0:1014af42efd9 314 c0 = *(pb++);
simon 0:1014af42efd9 315
simon 0:1014af42efd9 316 /* acc0 += x[2] * y[2] + x[3] * y[3] */
simon 0:1014af42efd9 317 acc0 = __SMLALD(x2, c0, acc0);
simon 0:1014af42efd9 318
simon 0:1014af42efd9 319 /* acc1 += x[3] * y[2] + x[4] * y[3] */
simon 0:1014af42efd9 320 acc1 = __SMLALD(x3, c0, acc1);
simon 0:1014af42efd9 321
simon 0:1014af42efd9 322 /* Read x[4], x[5] */
simon 0:1014af42efd9 323 x0 = *(q31_t *) (px++);
simon 0:1014af42efd9 324
simon 0:1014af42efd9 325 /* Read x[5], x[6] */
simon 0:1014af42efd9 326 x1 = *(q31_t *) (px++);
simon 0:1014af42efd9 327
simon 0:1014af42efd9 328 /* acc2 += x[4] * y[2] + x[5] * y[3] */
simon 0:1014af42efd9 329 acc2 = __SMLALD(x0, c0, acc2);
simon 0:1014af42efd9 330
simon 0:1014af42efd9 331 /* acc3 += x[5] * y[2] + x[6] * y[3] */
simon 0:1014af42efd9 332 acc3 = __SMLALD(x1, c0, acc3);
simon 0:1014af42efd9 333
simon 0:1014af42efd9 334 } while(--k);
simon 0:1014af42efd9 335
simon 0:1014af42efd9 336 /* For the next MAC operations, SIMD is not used
simon 0:1014af42efd9 337 * So, the 16 bit pointer if inputB, py is updated */
simon 0:1014af42efd9 338 py = (q15_t *) (pb);
simon 0:1014af42efd9 339
simon 0:1014af42efd9 340 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 341 ** No loop unrolling is used. */
simon 0:1014af42efd9 342 k = srcBLen % 0x4u;
simon 0:1014af42efd9 343
simon 0:1014af42efd9 344 if(k == 1u)
simon 0:1014af42efd9 345 {
simon 0:1014af42efd9 346 /* Read y[4] */
simon 0:1014af42efd9 347 c0 = *py;
simon 0:1014af42efd9 348 c0 = c0 & 0x0000FFFF;
simon 0:1014af42efd9 349
simon 0:1014af42efd9 350 /* Read x[7] */
simon 0:1014af42efd9 351 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 352
simon 0:1014af42efd9 353 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 354 acc0 = __SMLALD(x0, c0, acc0);
simon 0:1014af42efd9 355 acc1 = __SMLALD(x1, c0, acc1);
simon 0:1014af42efd9 356 acc2 = __SMLALDX(x1, c0, acc2);
simon 0:1014af42efd9 357 acc3 = __SMLALDX(x3, c0, acc3);
simon 0:1014af42efd9 358 }
simon 0:1014af42efd9 359
simon 0:1014af42efd9 360 if(k == 2u)
simon 0:1014af42efd9 361 {
simon 0:1014af42efd9 362 /* Read y[4], y[5] */
simon 0:1014af42efd9 363 c0 = *(pb);
simon 0:1014af42efd9 364
simon 0:1014af42efd9 365 /* Read x[7], x[8] */
simon 0:1014af42efd9 366 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 367
simon 0:1014af42efd9 368 /* Read x[9] */
simon 0:1014af42efd9 369 x2 = *(q31_t *) px++;
simon 0:1014af42efd9 370
simon 0:1014af42efd9 371 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 372 acc0 = __SMLALD(x0, c0, acc0);
simon 0:1014af42efd9 373 acc1 = __SMLALD(x1, c0, acc1);
simon 0:1014af42efd9 374 acc2 = __SMLALD(x3, c0, acc2);
simon 0:1014af42efd9 375 acc3 = __SMLALD(x2, c0, acc3);
simon 0:1014af42efd9 376 }
simon 0:1014af42efd9 377
simon 0:1014af42efd9 378 if(k == 3u)
simon 0:1014af42efd9 379 {
simon 0:1014af42efd9 380 /* Read y[4], y[5] */
simon 0:1014af42efd9 381 c0 = *pb++;
simon 0:1014af42efd9 382
simon 0:1014af42efd9 383 /* Read x[7], x[8] */
simon 0:1014af42efd9 384 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 385
simon 0:1014af42efd9 386 /* Read x[9] */
simon 0:1014af42efd9 387 x2 = *(q31_t *) px++;
simon 0:1014af42efd9 388
simon 0:1014af42efd9 389 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 390 acc0 = __SMLALD(x0, c0, acc0);
simon 0:1014af42efd9 391 acc1 = __SMLALD(x1, c0, acc1);
simon 0:1014af42efd9 392 acc2 = __SMLALD(x3, c0, acc2);
simon 0:1014af42efd9 393 acc3 = __SMLALD(x2, c0, acc3);
simon 0:1014af42efd9 394
simon 0:1014af42efd9 395 /* Read y[6] */
simon 0:1014af42efd9 396 c0 = (q15_t) (*pb);
simon 0:1014af42efd9 397 c0 = c0 & 0x0000FFFF;
simon 0:1014af42efd9 398
simon 0:1014af42efd9 399 /* Read x[10] */
simon 0:1014af42efd9 400 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 401
simon 0:1014af42efd9 402 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 403 acc0 = __SMLALDX(x1, c0, acc0);
simon 0:1014af42efd9 404 acc1 = __SMLALD(x2, c0, acc1);
simon 0:1014af42efd9 405 acc2 = __SMLALDX(x2, c0, acc2);
simon 0:1014af42efd9 406 acc3 = __SMLALDX(x3, c0, acc3);
simon 0:1014af42efd9 407 }
simon 0:1014af42efd9 408
simon 0:1014af42efd9 409 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 410 *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
simon 0:1014af42efd9 411 /* Destination pointer is updated according to the address modifier, inc */
simon 0:1014af42efd9 412 pOut += inc;
simon 0:1014af42efd9 413
simon 0:1014af42efd9 414 *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
simon 0:1014af42efd9 415 pOut += inc;
simon 0:1014af42efd9 416
simon 0:1014af42efd9 417 *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
simon 0:1014af42efd9 418 pOut += inc;
simon 0:1014af42efd9 419
simon 0:1014af42efd9 420 *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
simon 0:1014af42efd9 421 pOut += inc;
simon 0:1014af42efd9 422
simon 0:1014af42efd9 423 /* Increment the count by 4 as 4 output values are computed */
simon 0:1014af42efd9 424 count += 4u;
simon 0:1014af42efd9 425
simon 0:1014af42efd9 426 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 427 px = pIn1 + count;
simon 0:1014af42efd9 428 py = pIn2;
simon 0:1014af42efd9 429 pb = (q31_t *) (py);
simon 0:1014af42efd9 430
simon 0:1014af42efd9 431
simon 0:1014af42efd9 432 /* Decrement the loop counter */
simon 0:1014af42efd9 433 blkCnt--;
simon 0:1014af42efd9 434 }
simon 0:1014af42efd9 435
simon 0:1014af42efd9 436 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon 0:1014af42efd9 437 ** No loop unrolling is used. */
simon 0:1014af42efd9 438 blkCnt = blockSize2 % 0x4u;
simon 0:1014af42efd9 439
simon 0:1014af42efd9 440 while(blkCnt > 0u)
simon 0:1014af42efd9 441 {
simon 0:1014af42efd9 442 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 443 sum = 0;
simon 0:1014af42efd9 444
simon 0:1014af42efd9 445 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 446 k = srcBLen >> 2u;
simon 0:1014af42efd9 447
simon 0:1014af42efd9 448 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 449 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 450 while(k > 0u)
simon 0:1014af42efd9 451 {
simon 0:1014af42efd9 452 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 453 sum += ((q63_t) * px++ * *py++);
simon 0:1014af42efd9 454 sum += ((q63_t) * px++ * *py++);
simon 0:1014af42efd9 455 sum += ((q63_t) * px++ * *py++);
simon 0:1014af42efd9 456 sum += ((q63_t) * px++ * *py++);
simon 0:1014af42efd9 457
simon 0:1014af42efd9 458 /* Decrement the loop counter */
simon 0:1014af42efd9 459 k--;
simon 0:1014af42efd9 460 }
simon 0:1014af42efd9 461
simon 0:1014af42efd9 462 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 463 ** No loop unrolling is used. */
simon 0:1014af42efd9 464 k = srcBLen % 0x4u;
simon 0:1014af42efd9 465
simon 0:1014af42efd9 466 while(k > 0u)
simon 0:1014af42efd9 467 {
simon 0:1014af42efd9 468 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 469 sum += ((q63_t) * px++ * *py++);
simon 0:1014af42efd9 470
simon 0:1014af42efd9 471 /* Decrement the loop counter */
simon 0:1014af42efd9 472 k--;
simon 0:1014af42efd9 473 }
simon 0:1014af42efd9 474
simon 0:1014af42efd9 475 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 476 *pOut = (q15_t) (__SSAT(sum >> 15, 16));
simon 0:1014af42efd9 477 /* Destination pointer is updated according to the address modifier, inc */
simon 0:1014af42efd9 478 pOut += inc;
simon 0:1014af42efd9 479
simon 0:1014af42efd9 480 /* Increment count by 1, as one output value is computed */
simon 0:1014af42efd9 481 count++;
simon 0:1014af42efd9 482
simon 0:1014af42efd9 483 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 484 px = pIn1 + count;
simon 0:1014af42efd9 485 py = pIn2;
simon 0:1014af42efd9 486
simon 0:1014af42efd9 487 /* Decrement the loop counter */
simon 0:1014af42efd9 488 blkCnt--;
simon 0:1014af42efd9 489 }
simon 0:1014af42efd9 490 }
simon 0:1014af42efd9 491 else
simon 0:1014af42efd9 492 {
simon 0:1014af42efd9 493 /* If the srcBLen is not a multiple of 4,
simon 0:1014af42efd9 494 * the blockSize2 loop cannot be unrolled by 4 */
simon 0:1014af42efd9 495 blkCnt = blockSize2;
simon 0:1014af42efd9 496
simon 0:1014af42efd9 497 while(blkCnt > 0u)
simon 0:1014af42efd9 498 {
simon 0:1014af42efd9 499 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 500 sum = 0;
simon 0:1014af42efd9 501
simon 0:1014af42efd9 502 /* Loop over srcBLen */
simon 0:1014af42efd9 503 k = srcBLen;
simon 0:1014af42efd9 504
simon 0:1014af42efd9 505 while(k > 0u)
simon 0:1014af42efd9 506 {
simon 0:1014af42efd9 507 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 508 sum += ((q63_t) * px++ * *py++);
simon 0:1014af42efd9 509
simon 0:1014af42efd9 510 /* Decrement the loop counter */
simon 0:1014af42efd9 511 k--;
simon 0:1014af42efd9 512 }
simon 0:1014af42efd9 513
simon 0:1014af42efd9 514 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 515 *pOut = (q15_t) (__SSAT(sum >> 15, 16));
simon 0:1014af42efd9 516 /* Destination pointer is updated according to the address modifier, inc */
simon 0:1014af42efd9 517 pOut += inc;
simon 0:1014af42efd9 518
simon 0:1014af42efd9 519 /* Increment the MAC count */
simon 0:1014af42efd9 520 count++;
simon 0:1014af42efd9 521
simon 0:1014af42efd9 522 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 523 px = pIn1 + count;
simon 0:1014af42efd9 524 py = pIn2;
simon 0:1014af42efd9 525
simon 0:1014af42efd9 526 /* Decrement the loop counter */
simon 0:1014af42efd9 527 blkCnt--;
simon 0:1014af42efd9 528 }
simon 0:1014af42efd9 529 }
simon 0:1014af42efd9 530
simon 0:1014af42efd9 531 /* --------------------------
simon 0:1014af42efd9 532 * Initializations of stage3
simon 0:1014af42efd9 533 * -------------------------*/
simon 0:1014af42efd9 534
simon 0:1014af42efd9 535 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
simon 0:1014af42efd9 536 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
simon 0:1014af42efd9 537 * ....
simon 0:1014af42efd9 538 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
simon 0:1014af42efd9 539 * sum += x[srcALen-1] * y[0]
simon 0:1014af42efd9 540 */
simon 0:1014af42efd9 541
simon 0:1014af42efd9 542 /* In this stage the MAC operations are decreased by 1 for every iteration.
simon 0:1014af42efd9 543 The count variable holds the number of MAC operations performed */
simon 0:1014af42efd9 544 count = srcBLen - 1u;
simon 0:1014af42efd9 545
simon 0:1014af42efd9 546 /* Working pointer of inputA */
simon 0:1014af42efd9 547 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon 0:1014af42efd9 548 px = pSrc1;
simon 0:1014af42efd9 549
simon 0:1014af42efd9 550 /* Working pointer of inputB */
simon 0:1014af42efd9 551 py = pIn2;
simon 0:1014af42efd9 552
simon 0:1014af42efd9 553 /* -------------------
simon 0:1014af42efd9 554 * Stage3 process
simon 0:1014af42efd9 555 * ------------------*/
simon 0:1014af42efd9 556
simon 0:1014af42efd9 557 while(blockSize3 > 0u)
simon 0:1014af42efd9 558 {
simon 0:1014af42efd9 559 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 560 sum = 0;
simon 0:1014af42efd9 561
simon 0:1014af42efd9 562 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 563 k = count >> 2u;
simon 0:1014af42efd9 564
simon 0:1014af42efd9 565 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 566 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 567 while(k > 0u)
simon 0:1014af42efd9 568 {
simon 0:1014af42efd9 569 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 570 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
simon 0:1014af42efd9 571 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon 0:1014af42efd9 572 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
simon 0:1014af42efd9 573 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon 0:1014af42efd9 574
simon 0:1014af42efd9 575 /* Decrement the loop counter */
simon 0:1014af42efd9 576 k--;
simon 0:1014af42efd9 577 }
simon 0:1014af42efd9 578
simon 0:1014af42efd9 579 /* If the count is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 580 ** No loop unrolling is used. */
simon 0:1014af42efd9 581 k = count % 0x4u;
simon 0:1014af42efd9 582
simon 0:1014af42efd9 583 while(k > 0u)
simon 0:1014af42efd9 584 {
simon 0:1014af42efd9 585 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 586 sum = __SMLALD(*px++, *py++, sum);
simon 0:1014af42efd9 587
simon 0:1014af42efd9 588 /* Decrement the loop counter */
simon 0:1014af42efd9 589 k--;
simon 0:1014af42efd9 590 }
simon 0:1014af42efd9 591
simon 0:1014af42efd9 592 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 593 *pOut = (q15_t) (__SSAT((sum >> 15), 16));
simon 0:1014af42efd9 594 /* Destination pointer is updated according to the address modifier, inc */
simon 0:1014af42efd9 595 pOut += inc;
simon 0:1014af42efd9 596
simon 0:1014af42efd9 597 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 598 px = ++pSrc1;
simon 0:1014af42efd9 599 py = pIn2;
simon 0:1014af42efd9 600
simon 0:1014af42efd9 601 /* Decrement the MAC count */
simon 0:1014af42efd9 602 count--;
simon 0:1014af42efd9 603
simon 0:1014af42efd9 604 /* Decrement the loop counter */
simon 0:1014af42efd9 605 blockSize3--;
simon 0:1014af42efd9 606 }
simon 0:1014af42efd9 607
simon 0:1014af42efd9 608 }
simon 0:1014af42efd9 609
simon 0:1014af42efd9 610 /**
simon 0:1014af42efd9 611 * @} end of Corr group
simon 0:1014af42efd9 612 */