CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Committer:
simon
Date:
Thu Mar 10 15:07:50 2011 +0000
Revision:
0:1014af42efd9

        

Who changed what in which revision?

User | Revision | Line number | New contents of line
simon 0:1014af42efd9 1 /* ----------------------------------------------------------------------
simon 0:1014af42efd9 2 * Copyright (C) 2010 ARM Limited. All rights reserved.
simon 0:1014af42efd9 3 *
simon 0:1014af42efd9 4 * $Date: 29. November 2010
simon 0:1014af42efd9 5 * $Revision: V1.0.3
simon 0:1014af42efd9 6 *
simon 0:1014af42efd9 7 * Project: CMSIS DSP Library
simon 0:1014af42efd9 8 * Title: arm_conv_partial_q31.c
simon 0:1014af42efd9 9 *
simon 0:1014af42efd9 10 * Description: Q31 Partial convolution.
simon 0:1014af42efd9 11 *
simon 0:1014af42efd9 12 * Target Processor: Cortex-M4/Cortex-M3
simon 0:1014af42efd9 13 *
simon 0:1014af42efd9 14 * Version 1.0.3 2010/11/29
simon 0:1014af42efd9 15 * Re-organized the CMSIS folders and updated documentation.
simon 0:1014af42efd9 16 *
simon 0:1014af42efd9 17 * Version 1.0.2 2010/11/11
simon 0:1014af42efd9 18 * Documentation updated.
simon 0:1014af42efd9 19 *
simon 0:1014af42efd9 20 * Version 1.0.1 2010/10/05
simon 0:1014af42efd9 21 * Production release and review comments incorporated.
simon 0:1014af42efd9 22 *
simon 0:1014af42efd9 23 * Version 1.0.0 2010/09/20
simon 0:1014af42efd9 24 * Production release and review comments incorporated
simon 0:1014af42efd9 25 *
simon 0:1014af42efd9 26 * Version 0.0.7 2010/06/10
simon 0:1014af42efd9 27 * Misra-C changes done
simon 0:1014af42efd9 28 *
simon 0:1014af42efd9 29 * -------------------------------------------------------------------- */
simon 0:1014af42efd9 30
simon 0:1014af42efd9 31 #include "arm_math.h"
simon 0:1014af42efd9 32
simon 0:1014af42efd9 33 /**
simon 0:1014af42efd9 34 * @ingroup groupFilters
simon 0:1014af42efd9 35 */
simon 0:1014af42efd9 36
simon 0:1014af42efd9 37 /**
simon 0:1014af42efd9 38 * @addtogroup PartialConv
simon 0:1014af42efd9 39 * @{
simon 0:1014af42efd9 40 */
simon 0:1014af42efd9 41
simon 0:1014af42efd9 42 /**
simon 0:1014af42efd9 43 * @brief Partial convolution of Q31 sequences.
simon 0:1014af42efd9 44 * @param[in] *pSrcA points to the first input sequence.
simon 0:1014af42efd9 45 * @param[in] srcALen length of the first input sequence.
simon 0:1014af42efd9 46 * @param[in] *pSrcB points to the second input sequence.
simon 0:1014af42efd9 47 * @param[in] srcBLen length of the second input sequence.
simon 0:1014af42efd9 48 * @param[out] *pDst points to the location where the output result is written.
simon 0:1014af42efd9 49 * @param[in] firstIndex is the first output sample to start with.
simon 0:1014af42efd9 50 * @param[in] numPoints is the number of output points to be computed.
simon 0:1014af42efd9 51 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
simon 0:1014af42efd9 52 *
simon 0:1014af42efd9 53 * See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function.
simon 0:1014af42efd9 54 */
simon 0:1014af42efd9 55
simon 0:1014af42efd9 56 arm_status arm_conv_partial_q31(
simon 0:1014af42efd9 57 q31_t * pSrcA,
simon 0:1014af42efd9 58 uint32_t srcALen,
simon 0:1014af42efd9 59 q31_t * pSrcB,
simon 0:1014af42efd9 60 uint32_t srcBLen,
simon 0:1014af42efd9 61 q31_t * pDst,
simon 0:1014af42efd9 62 uint32_t firstIndex,
simon 0:1014af42efd9 63 uint32_t numPoints)
simon 0:1014af42efd9 64 {
simon 0:1014af42efd9 65 q31_t *pIn1; /* inputA pointer */
simon 0:1014af42efd9 66 q31_t *pIn2; /* inputB pointer */
simon 0:1014af42efd9 67 q31_t *pOut = pDst; /* output pointer */
simon 0:1014af42efd9 68 q31_t *px; /* Intermediate inputA pointer */
simon 0:1014af42efd9 69 q31_t *py; /* Intermediate inputB pointer */
simon 0:1014af42efd9 70 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
simon 0:1014af42efd9 71 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
simon 0:1014af42efd9 72 q31_t x0, x1, x2, x3, c0;
simon 0:1014af42efd9 73 uint32_t j, k, count, check, blkCnt;
simon 0:1014af42efd9 74 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
simon 0:1014af42efd9 75 arm_status status; /* status of Partial convolution */
simon 0:1014af42efd9 76
simon 0:1014af42efd9 77
simon 0:1014af42efd9 78 /* Check for range of output samples to be calculated */
simon 0:1014af42efd9 79 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
simon 0:1014af42efd9 80 {
simon 0:1014af42efd9 81 /* Set status as ARM_MATH_ARGUMENT_ERROR */
simon 0:1014af42efd9 82 status = ARM_MATH_ARGUMENT_ERROR;
simon 0:1014af42efd9 83 }
simon 0:1014af42efd9 84 else
simon 0:1014af42efd9 85 {
simon 0:1014af42efd9 86
simon 0:1014af42efd9 87 /* The algorithm implementation is based on the lengths of the inputs. */
simon 0:1014af42efd9 88 /* srcB is always made to slide across srcA. */
simon 0:1014af42efd9 89 /* So srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 90 if(srcALen >= srcBLen)
simon 0:1014af42efd9 91 {
simon 0:1014af42efd9 92 /* Initialization of inputA pointer */
simon 0:1014af42efd9 93 pIn1 = pSrcA;
simon 0:1014af42efd9 94
simon 0:1014af42efd9 95 /* Initialization of inputB pointer */
simon 0:1014af42efd9 96 pIn2 = pSrcB;
simon 0:1014af42efd9 97 }
simon 0:1014af42efd9 98 else
simon 0:1014af42efd9 99 {
simon 0:1014af42efd9 100 /* Initialization of inputA pointer */
simon 0:1014af42efd9 101 pIn1 = pSrcB;
simon 0:1014af42efd9 102
simon 0:1014af42efd9 103 /* Initialization of inputB pointer */
simon 0:1014af42efd9 104 pIn2 = pSrcA;
simon 0:1014af42efd9 105
simon 0:1014af42efd9 106 /* srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 107 j = srcBLen;
simon 0:1014af42efd9 108 srcBLen = srcALen;
simon 0:1014af42efd9 109 srcALen = j;
simon 0:1014af42efd9 110 }
simon 0:1014af42efd9 111
simon 0:1014af42efd9 112 /* Conditions to check which loopCounter holds
simon 0:1014af42efd9 113 * the first and last indices of the output samples to be calculated. */
simon 0:1014af42efd9 114 check = firstIndex + numPoints;
simon 0:1014af42efd9 115 blockSize3 = ((int32_t) check - (int32_t) srcALen);
simon 0:1014af42efd9 116 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
simon 0:1014af42efd9 117 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
simon 0:1014af42efd9 118 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
simon 0:1014af42efd9 119 (int32_t) numPoints) : 0;
simon 0:1014af42efd9 120 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
simon 0:1014af42efd9 121 (int32_t) firstIndex);
simon 0:1014af42efd9 122 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
simon 0:1014af42efd9 123
simon 0:1014af42efd9 124 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
simon 0:1014af42efd9 125 /* The function is internally
simon 0:1014af42efd9 126 * divided into three stages according to the number of multiplications that has to be
simon 0:1014af42efd9 127 * taken place between inputA samples and inputB samples. In the first stage of the
simon 0:1014af42efd9 128 * algorithm, the multiplications increase by one for every iteration.
simon 0:1014af42efd9 129 * In the second stage of the algorithm, srcBLen number of multiplications are done.
simon 0:1014af42efd9 130 * In the third stage of the algorithm, the multiplications decrease by one
simon 0:1014af42efd9 131 * for every iteration. */
simon 0:1014af42efd9 132
simon 0:1014af42efd9 133 /* Set the output pointer to point to the firstIndex
simon 0:1014af42efd9 134 * of the output sample to be calculated. */
simon 0:1014af42efd9 135 pOut = pDst + firstIndex;
simon 0:1014af42efd9 136
simon 0:1014af42efd9 137 /* --------------------------
simon 0:1014af42efd9 138 * Initializations of stage1
simon 0:1014af42efd9 139 * -------------------------*/
simon 0:1014af42efd9 140
simon 0:1014af42efd9 141 /* sum = x[0] * y[0]
simon 0:1014af42efd9 142 * sum = x[0] * y[1] + x[1] * y[0]
simon 0:1014af42efd9 143 * ....
simon 0:1014af42efd9 144 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
simon 0:1014af42efd9 145 */
simon 0:1014af42efd9 146
simon 0:1014af42efd9 147 /* In this stage the MAC operations are increased by 1 for every iteration.
simon 0:1014af42efd9 148 The count variable holds the number of MAC operations performed.
simon 0:1014af42efd9 149 Since the partial convolution starts from firstIndex
simon 0:1014af42efd9 150 Number of Macs to be performed is firstIndex + 1 */
simon 0:1014af42efd9 151 count = 1u + firstIndex;
simon 0:1014af42efd9 152
simon 0:1014af42efd9 153 /* Working pointer of inputA */
simon 0:1014af42efd9 154 px = pIn1;
simon 0:1014af42efd9 155
simon 0:1014af42efd9 156 /* Working pointer of inputB */
simon 0:1014af42efd9 157 pSrc2 = pIn2 + firstIndex;
simon 0:1014af42efd9 158 py = pSrc2;
simon 0:1014af42efd9 159
simon 0:1014af42efd9 160 /* ------------------------
simon 0:1014af42efd9 161 * Stage1 process
simon 0:1014af42efd9 162 * ----------------------*/
simon 0:1014af42efd9 163
simon 0:1014af42efd9 164 /* The first loop starts here */
simon 0:1014af42efd9 165 while(blockSize1 > 0)
simon 0:1014af42efd9 166 {
simon 0:1014af42efd9 167 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 168 sum = 0;
simon 0:1014af42efd9 169
simon 0:1014af42efd9 170 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 171 k = count >> 2u;
simon 0:1014af42efd9 172
simon 0:1014af42efd9 173 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 174 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 175 while(k > 0u)
simon 0:1014af42efd9 176 {
simon 0:1014af42efd9 177 /* x[0] * y[srcBLen - 1] */
simon 0:1014af42efd9 178 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 179 /* x[1] * y[srcBLen - 2] */
simon 0:1014af42efd9 180 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 181 /* x[2] * y[srcBLen - 3] */
simon 0:1014af42efd9 182 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 183 /* x[3] * y[srcBLen - 4] */
simon 0:1014af42efd9 184 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 185
simon 0:1014af42efd9 186 /* Decrement the loop counter */
simon 0:1014af42efd9 187 k--;
simon 0:1014af42efd9 188 }
simon 0:1014af42efd9 189
simon 0:1014af42efd9 190 /* If the count is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 191 ** No loop unrolling is used. */
simon 0:1014af42efd9 192 k = count % 0x4u;
simon 0:1014af42efd9 193
simon 0:1014af42efd9 194 while(k > 0u)
simon 0:1014af42efd9 195 {
simon 0:1014af42efd9 196 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 197 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 198
simon 0:1014af42efd9 199 /* Decrement the loop counter */
simon 0:1014af42efd9 200 k--;
simon 0:1014af42efd9 201 }
simon 0:1014af42efd9 202
simon 0:1014af42efd9 203 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 204 *pOut++ = (q31_t) (sum >> 31);
simon 0:1014af42efd9 205
simon 0:1014af42efd9 206 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 207 py = ++pSrc2;
simon 0:1014af42efd9 208 px = pIn1;
simon 0:1014af42efd9 209
simon 0:1014af42efd9 210 /* Increment the MAC count */
simon 0:1014af42efd9 211 count++;
simon 0:1014af42efd9 212
simon 0:1014af42efd9 213 /* Decrement the loop counter */
simon 0:1014af42efd9 214 blockSize1--;
simon 0:1014af42efd9 215 }
simon 0:1014af42efd9 216
simon 0:1014af42efd9 217 /* --------------------------
simon 0:1014af42efd9 218 * Initializations of stage2
simon 0:1014af42efd9 219 * ------------------------*/
simon 0:1014af42efd9 220
simon 0:1014af42efd9 221 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
simon 0:1014af42efd9 222 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
simon 0:1014af42efd9 223 * ....
simon 0:1014af42efd9 224 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
simon 0:1014af42efd9 225 */
simon 0:1014af42efd9 226
simon 0:1014af42efd9 227 /* Working pointer of inputA */
simon 0:1014af42efd9 228 px = pIn1;
simon 0:1014af42efd9 229
simon 0:1014af42efd9 230 /* Working pointer of inputB */
simon 0:1014af42efd9 231 pSrc2 = pIn2 + (srcBLen - 1u);
simon 0:1014af42efd9 232 py = pSrc2;
simon 0:1014af42efd9 233
simon 0:1014af42efd9 234 /* count is index by which the pointer pIn1 to be incremented */
simon 0:1014af42efd9 235 count = 1u;
simon 0:1014af42efd9 236
simon 0:1014af42efd9 237 /* -------------------
simon 0:1014af42efd9 238 * Stage2 process
simon 0:1014af42efd9 239 * ------------------*/
simon 0:1014af42efd9 240
simon 0:1014af42efd9 241 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon 0:1014af42efd9 242 * So, to loop unroll over blockSize2,
simon 0:1014af42efd9 243 * srcBLen should be greater than or equal to 4 */
simon 0:1014af42efd9 244 if(srcBLen >= 4u)
simon 0:1014af42efd9 245 {
simon 0:1014af42efd9 246 /* Loop unroll over blockSize2 */
simon 0:1014af42efd9 247 blkCnt = ((uint32_t) blockSize2 >> 2u);
simon 0:1014af42efd9 248
simon 0:1014af42efd9 249 while(blkCnt > 0u)
simon 0:1014af42efd9 250 {
simon 0:1014af42efd9 251 /* Set all accumulators to zero */
simon 0:1014af42efd9 252 acc0 = 0;
simon 0:1014af42efd9 253 acc1 = 0;
simon 0:1014af42efd9 254 acc2 = 0;
simon 0:1014af42efd9 255 acc3 = 0;
simon 0:1014af42efd9 256
simon 0:1014af42efd9 257 /* read x[0], x[1], x[2] samples */
simon 0:1014af42efd9 258 x0 = *(px++);
simon 0:1014af42efd9 259 x1 = *(px++);
simon 0:1014af42efd9 260 x2 = *(px++);
simon 0:1014af42efd9 261
simon 0:1014af42efd9 262 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 263 k = srcBLen >> 2u;
simon 0:1014af42efd9 264
simon 0:1014af42efd9 265 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 266 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 267 do
simon 0:1014af42efd9 268 {
simon 0:1014af42efd9 269 /* Read y[srcBLen - 1] sample */
simon 0:1014af42efd9 270 c0 = *(py--);
simon 0:1014af42efd9 271
simon 0:1014af42efd9 272 /* Read x[3] sample */
simon 0:1014af42efd9 273 x3 = *(px++);
simon 0:1014af42efd9 274
simon 0:1014af42efd9 275 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 276 /* acc0 += x[0] * y[srcBLen - 1] */
simon 0:1014af42efd9 277 acc0 += (q63_t) x0 *c0;
simon 0:1014af42efd9 278 /* acc1 += x[1] * y[srcBLen - 1] */
simon 0:1014af42efd9 279 acc1 += (q63_t) x1 *c0;
simon 0:1014af42efd9 280 /* acc2 += x[2] * y[srcBLen - 1] */
simon 0:1014af42efd9 281 acc2 += (q63_t) x2 *c0;
simon 0:1014af42efd9 282 /* acc3 += x[3] * y[srcBLen - 1] */
simon 0:1014af42efd9 283 acc3 += (q63_t) x3 *c0;
simon 0:1014af42efd9 284
simon 0:1014af42efd9 285 /* Read y[srcBLen - 2] sample */
simon 0:1014af42efd9 286 c0 = *(py--);
simon 0:1014af42efd9 287
simon 0:1014af42efd9 288 /* Read x[4] sample */
simon 0:1014af42efd9 289 x0 = *(px++);
simon 0:1014af42efd9 290
simon 0:1014af42efd9 291 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 292 /* acc0 += x[1] * y[srcBLen - 2] */
simon 0:1014af42efd9 293 acc0 += (q63_t) x1 *c0;
simon 0:1014af42efd9 294 /* acc1 += x[2] * y[srcBLen - 2] */
simon 0:1014af42efd9 295 acc1 += (q63_t) x2 *c0;
simon 0:1014af42efd9 296 /* acc2 += x[3] * y[srcBLen - 2] */
simon 0:1014af42efd9 297 acc2 += (q63_t) x3 *c0;
simon 0:1014af42efd9 298 /* acc3 += x[4] * y[srcBLen - 2] */
simon 0:1014af42efd9 299 acc3 += (q63_t) x0 *c0;
simon 0:1014af42efd9 300
simon 0:1014af42efd9 301 /* Read y[srcBLen - 3] sample */
simon 0:1014af42efd9 302 c0 = *(py--);
simon 0:1014af42efd9 303
simon 0:1014af42efd9 304 /* Read x[5] sample */
simon 0:1014af42efd9 305 x1 = *(px++);
simon 0:1014af42efd9 306
simon 0:1014af42efd9 307 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 308 /* acc0 += x[2] * y[srcBLen - 3] */
simon 0:1014af42efd9 309 acc0 += (q63_t) x2 *c0;
simon 0:1014af42efd9 310 /* acc1 += x[3] * y[srcBLen - 2] */
simon 0:1014af42efd9 311 acc1 += (q63_t) x3 *c0;
simon 0:1014af42efd9 312 /* acc2 += x[4] * y[srcBLen - 2] */
simon 0:1014af42efd9 313 acc2 += (q63_t) x0 *c0;
simon 0:1014af42efd9 314 /* acc3 += x[5] * y[srcBLen - 2] */
simon 0:1014af42efd9 315 acc3 += (q63_t) x1 *c0;
simon 0:1014af42efd9 316
simon 0:1014af42efd9 317 /* Read y[srcBLen - 4] sample */
simon 0:1014af42efd9 318 c0 = *(py--);
simon 0:1014af42efd9 319
simon 0:1014af42efd9 320 /* Read x[6] sample */
simon 0:1014af42efd9 321 x2 = *(px++);
simon 0:1014af42efd9 322
simon 0:1014af42efd9 323 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 324 /* acc0 += x[3] * y[srcBLen - 4] */
simon 0:1014af42efd9 325 acc0 += (q63_t) x3 *c0;
simon 0:1014af42efd9 326 /* acc1 += x[4] * y[srcBLen - 4] */
simon 0:1014af42efd9 327 acc1 += (q63_t) x0 *c0;
simon 0:1014af42efd9 328 /* acc2 += x[5] * y[srcBLen - 4] */
simon 0:1014af42efd9 329 acc2 += (q63_t) x1 *c0;
simon 0:1014af42efd9 330 /* acc3 += x[6] * y[srcBLen - 4] */
simon 0:1014af42efd9 331 acc3 += (q63_t) x2 *c0;
simon 0:1014af42efd9 332
simon 0:1014af42efd9 333 } while(--k);
simon 0:1014af42efd9 334
simon 0:1014af42efd9 335 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 336 ** No loop unrolling is used. */
simon 0:1014af42efd9 337 k = srcBLen % 0x4u;
simon 0:1014af42efd9 338
simon 0:1014af42efd9 339 while(k > 0u)
simon 0:1014af42efd9 340 {
simon 0:1014af42efd9 341 /* Read y[srcBLen - 5] sample */
simon 0:1014af42efd9 342 c0 = *(py--);
simon 0:1014af42efd9 343
simon 0:1014af42efd9 344 /* Read x[7] sample */
simon 0:1014af42efd9 345 x3 = *(px++);
simon 0:1014af42efd9 346
simon 0:1014af42efd9 347 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 348 /* acc0 += x[4] * y[srcBLen - 5] */
simon 0:1014af42efd9 349 acc0 += (q63_t) x0 *c0;
simon 0:1014af42efd9 350 /* acc1 += x[5] * y[srcBLen - 5] */
simon 0:1014af42efd9 351 acc1 += (q63_t) x1 *c0;
simon 0:1014af42efd9 352 /* acc2 += x[6] * y[srcBLen - 5] */
simon 0:1014af42efd9 353 acc2 += (q63_t) x2 *c0;
simon 0:1014af42efd9 354 /* acc3 += x[7] * y[srcBLen - 5] */
simon 0:1014af42efd9 355 acc3 += (q63_t) x3 *c0;
simon 0:1014af42efd9 356
simon 0:1014af42efd9 357 /* Reuse the present samples for the next MAC */
simon 0:1014af42efd9 358 x0 = x1;
simon 0:1014af42efd9 359 x1 = x2;
simon 0:1014af42efd9 360 x2 = x3;
simon 0:1014af42efd9 361
simon 0:1014af42efd9 362 /* Decrement the loop counter */
simon 0:1014af42efd9 363 k--;
simon 0:1014af42efd9 364 }
simon 0:1014af42efd9 365
simon 0:1014af42efd9 366 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 367 *pOut++ = (q31_t) (acc0 >> 31);
simon 0:1014af42efd9 368 *pOut++ = (q31_t) (acc1 >> 31);
simon 0:1014af42efd9 369 *pOut++ = (q31_t) (acc2 >> 31);
simon 0:1014af42efd9 370 *pOut++ = (q31_t) (acc3 >> 31);
simon 0:1014af42efd9 371
simon 0:1014af42efd9 372 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 373 px = pIn1 + (count * 4u);
simon 0:1014af42efd9 374 py = pSrc2;
simon 0:1014af42efd9 375
simon 0:1014af42efd9 376 /* Increment the pointer pIn1 index, count by 1 */
simon 0:1014af42efd9 377 count++;
simon 0:1014af42efd9 378
simon 0:1014af42efd9 379 /* Decrement the loop counter */
simon 0:1014af42efd9 380 blkCnt--;
simon 0:1014af42efd9 381 }
simon 0:1014af42efd9 382
simon 0:1014af42efd9 383 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon 0:1014af42efd9 384 ** No loop unrolling is used. */
simon 0:1014af42efd9 385 blkCnt = (uint32_t) blockSize2 % 0x4u;
simon 0:1014af42efd9 386
simon 0:1014af42efd9 387 while(blkCnt > 0u)
simon 0:1014af42efd9 388 {
simon 0:1014af42efd9 389 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 390 sum = 0;
simon 0:1014af42efd9 391
simon 0:1014af42efd9 392 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 393 k = srcBLen >> 2u;
simon 0:1014af42efd9 394
simon 0:1014af42efd9 395 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 396 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 397 while(k > 0u)
simon 0:1014af42efd9 398 {
simon 0:1014af42efd9 399 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 400 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 401 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 402 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 403 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 404
simon 0:1014af42efd9 405 /* Decrement the loop counter */
simon 0:1014af42efd9 406 k--;
simon 0:1014af42efd9 407 }
simon 0:1014af42efd9 408
simon 0:1014af42efd9 409 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 410 ** No loop unrolling is used. */
simon 0:1014af42efd9 411 k = srcBLen % 0x4u;
simon 0:1014af42efd9 412
simon 0:1014af42efd9 413 while(k > 0u)
simon 0:1014af42efd9 414 {
simon 0:1014af42efd9 415 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 416 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 417
simon 0:1014af42efd9 418 /* Decrement the loop counter */
simon 0:1014af42efd9 419 k--;
simon 0:1014af42efd9 420 }
simon 0:1014af42efd9 421
simon 0:1014af42efd9 422 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 423 *pOut++ = (q31_t) (sum >> 31);
simon 0:1014af42efd9 424
simon 0:1014af42efd9 425 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 426 px = pIn1 + count;
simon 0:1014af42efd9 427 py = pSrc2;
simon 0:1014af42efd9 428
simon 0:1014af42efd9 429 /* Increment the MAC count */
simon 0:1014af42efd9 430 count++;
simon 0:1014af42efd9 431
simon 0:1014af42efd9 432 /* Decrement the loop counter */
simon 0:1014af42efd9 433 blkCnt--;
simon 0:1014af42efd9 434 }
simon 0:1014af42efd9 435 }
simon 0:1014af42efd9 436 else
simon 0:1014af42efd9 437 {
simon 0:1014af42efd9 438 /* If the srcBLen is not a multiple of 4,
simon 0:1014af42efd9 439 * the blockSize2 loop cannot be unrolled by 4 */
simon 0:1014af42efd9 440 blkCnt = (uint32_t) blockSize2;
simon 0:1014af42efd9 441
simon 0:1014af42efd9 442 while(blkCnt > 0u)
simon 0:1014af42efd9 443 {
simon 0:1014af42efd9 444 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 445 sum = 0;
simon 0:1014af42efd9 446
simon 0:1014af42efd9 447 /* srcBLen number of MACS should be performed */
simon 0:1014af42efd9 448 k = srcBLen;
simon 0:1014af42efd9 449
simon 0:1014af42efd9 450 while(k > 0u)
simon 0:1014af42efd9 451 {
simon 0:1014af42efd9 452 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 453 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 454
simon 0:1014af42efd9 455 /* Decrement the loop counter */
simon 0:1014af42efd9 456 k--;
simon 0:1014af42efd9 457 }
simon 0:1014af42efd9 458
simon 0:1014af42efd9 459 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 460 *pOut++ = (q31_t) (sum >> 31);
simon 0:1014af42efd9 461
simon 0:1014af42efd9 462 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 463 px = pIn1 + count;
simon 0:1014af42efd9 464 py = pSrc2;
simon 0:1014af42efd9 465
simon 0:1014af42efd9 466 /* Increment the MAC count */
simon 0:1014af42efd9 467 count++;
simon 0:1014af42efd9 468
simon 0:1014af42efd9 469 /* Decrement the loop counter */
simon 0:1014af42efd9 470 blkCnt--;
simon 0:1014af42efd9 471 }
simon 0:1014af42efd9 472 }
simon 0:1014af42efd9 473
simon 0:1014af42efd9 474
simon 0:1014af42efd9 475 /* --------------------------
simon 0:1014af42efd9 476 * Initializations of stage3
simon 0:1014af42efd9 477 * -------------------------*/
simon 0:1014af42efd9 478
simon 0:1014af42efd9 479 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
simon 0:1014af42efd9 480 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
simon 0:1014af42efd9 481 * ....
simon 0:1014af42efd9 482 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
simon 0:1014af42efd9 483 * sum += x[srcALen-1] * y[srcBLen-1]
simon 0:1014af42efd9 484 */
simon 0:1014af42efd9 485
simon 0:1014af42efd9 486 /* In this stage the MAC operations are decreased by 1 for every iteration.
simon 0:1014af42efd9 487 The blockSize3 variable holds the number of MAC operations performed */
simon 0:1014af42efd9 488 count = srcBLen - 1u;
simon 0:1014af42efd9 489
simon 0:1014af42efd9 490 /* Working pointer of inputA */
simon 0:1014af42efd9 491 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon 0:1014af42efd9 492 px = pSrc1;
simon 0:1014af42efd9 493
simon 0:1014af42efd9 494 /* Working pointer of inputB */
simon 0:1014af42efd9 495 pSrc2 = pIn2 + (srcBLen - 1u);
simon 0:1014af42efd9 496 py = pSrc2;
simon 0:1014af42efd9 497
simon 0:1014af42efd9 498 /* -------------------
simon 0:1014af42efd9 499 * Stage3 process
simon 0:1014af42efd9 500 * ------------------*/
simon 0:1014af42efd9 501
simon 0:1014af42efd9 502 while(blockSize3 > 0)
simon 0:1014af42efd9 503 {
simon 0:1014af42efd9 504 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 505 sum = 0;
simon 0:1014af42efd9 506
simon 0:1014af42efd9 507 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 508 k = count >> 2u;
simon 0:1014af42efd9 509
simon 0:1014af42efd9 510 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 511 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 512 while(k > 0u)
simon 0:1014af42efd9 513 {
simon 0:1014af42efd9 514 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 515 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 516 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 517 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 518
simon 0:1014af42efd9 519 /* Decrement the loop counter */
simon 0:1014af42efd9 520 k--;
simon 0:1014af42efd9 521 }
simon 0:1014af42efd9 522
simon 0:1014af42efd9 523 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 524 ** No loop unrolling is used. */
simon 0:1014af42efd9 525 k = count % 0x4u;
simon 0:1014af42efd9 526
simon 0:1014af42efd9 527 while(k > 0u)
simon 0:1014af42efd9 528 {
simon 0:1014af42efd9 529 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 530 sum += (q63_t) * px++ * (*py--);
simon 0:1014af42efd9 531
simon 0:1014af42efd9 532 /* Decrement the loop counter */
simon 0:1014af42efd9 533 k--;
simon 0:1014af42efd9 534 }
simon 0:1014af42efd9 535
simon 0:1014af42efd9 536 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 537 *pOut++ = (q31_t) (sum >> 31);
simon 0:1014af42efd9 538
simon 0:1014af42efd9 539 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 540 px = ++pSrc1;
simon 0:1014af42efd9 541 py = pSrc2;
simon 0:1014af42efd9 542
simon 0:1014af42efd9 543 /* Decrement the MAC count */
simon 0:1014af42efd9 544 count--;
simon 0:1014af42efd9 545
simon 0:1014af42efd9 546 /* Decrement the loop counter */
simon 0:1014af42efd9 547 blockSize3--;
simon 0:1014af42efd9 548
simon 0:1014af42efd9 549 }
simon 0:1014af42efd9 550
simon 0:1014af42efd9 551 /* set status as ARM_MATH_SUCCESS */
simon 0:1014af42efd9 552 status = ARM_MATH_SUCCESS;
simon 0:1014af42efd9 553 }
simon 0:1014af42efd9 554
simon 0:1014af42efd9 555 /* Return to application */
simon 0:1014af42efd9 556 return (status);
simon 0:1014af42efd9 557
simon 0:1014af42efd9 558 }
simon 0:1014af42efd9 559
simon 0:1014af42efd9 560 /**
simon 0:1014af42efd9 561 * @} end of PartialConv group
simon 0:1014af42efd9 562 */