CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Committer:
simon
Date:
Thu Mar 10 15:07:50 2011 +0000
Revision:
0:1014af42efd9

        

Who changed what in which revision?

User | Revision | Line number | New contents of line
simon 0:1014af42efd9 1 /* ----------------------------------------------------------------------
simon 0:1014af42efd9 2 * Copyright (C) 2010 ARM Limited. All rights reserved.
simon 0:1014af42efd9 3 *
simon 0:1014af42efd9 4 * $Date: 29. November 2010
simon 0:1014af42efd9 5 * $Revision: V1.0.3
simon 0:1014af42efd9 6 *
simon 0:1014af42efd9 7 * Project: CMSIS DSP Library
simon 0:1014af42efd9 8 * Title: arm_conv_partial_q15.c
simon 0:1014af42efd9 9 *
simon 0:1014af42efd9 10 * Description: Q15 Partial convolution.
simon 0:1014af42efd9 11 *
simon 0:1014af42efd9 12 * Target Processor: Cortex-M4/Cortex-M3
simon 0:1014af42efd9 13 *
simon 0:1014af42efd9 14 * Version 1.0.3 2010/11/29
simon 0:1014af42efd9 15 * Re-organized the CMSIS folders and updated documentation.
simon 0:1014af42efd9 16 *
simon 0:1014af42efd9 17 * Version 1.0.2 2010/11/11
simon 0:1014af42efd9 18 * Documentation updated.
simon 0:1014af42efd9 19 *
simon 0:1014af42efd9 20 * Version 1.0.1 2010/10/05
simon 0:1014af42efd9 21 * Production release and review comments incorporated.
simon 0:1014af42efd9 22 *
simon 0:1014af42efd9 23 * Version 1.0.0 2010/09/20
simon 0:1014af42efd9 24 * Production release and review comments incorporated
simon 0:1014af42efd9 25 *
simon 0:1014af42efd9 26 * Version 0.0.7 2010/06/10
simon 0:1014af42efd9 27 * Misra-C changes done
simon 0:1014af42efd9 28 *
simon 0:1014af42efd9 29 * -------------------------------------------------------------------- */
simon 0:1014af42efd9 30
simon 0:1014af42efd9 31 #include "arm_math.h"
simon 0:1014af42efd9 32
simon 0:1014af42efd9 33 /**
simon 0:1014af42efd9 34 * @ingroup groupFilters
simon 0:1014af42efd9 35 */
simon 0:1014af42efd9 36
simon 0:1014af42efd9 37 /**
simon 0:1014af42efd9 38 * @addtogroup PartialConv
simon 0:1014af42efd9 39 * @{
simon 0:1014af42efd9 40 */
simon 0:1014af42efd9 41
simon 0:1014af42efd9 42 /**
simon 0:1014af42efd9 43 * @brief Partial convolution of Q15 sequences.
simon 0:1014af42efd9 44 * @param[in] *pSrcA points to the first input sequence.
simon 0:1014af42efd9 45 * @param[in] srcALen length of the first input sequence.
simon 0:1014af42efd9 46 * @param[in] *pSrcB points to the second input sequence.
simon 0:1014af42efd9 47 * @param[in] srcBLen length of the second input sequence.
simon 0:1014af42efd9 48 * @param[out] *pDst points to the location where the output result is written.
simon 0:1014af42efd9 49 * @param[in] firstIndex is the first output sample to start with.
simon 0:1014af42efd9 50 * @param[in] numPoints is the number of output points to be computed.
simon 0:1014af42efd9 51 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
simon 0:1014af42efd9 52 *
simon 0:1014af42efd9 53 * Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function.
simon 0:1014af42efd9 54 */
simon 0:1014af42efd9 55
simon 0:1014af42efd9 56
simon 0:1014af42efd9 57 arm_status arm_conv_partial_q15(
simon 0:1014af42efd9 58 q15_t * pSrcA,
simon 0:1014af42efd9 59 uint32_t srcALen,
simon 0:1014af42efd9 60 q15_t * pSrcB,
simon 0:1014af42efd9 61 uint32_t srcBLen,
simon 0:1014af42efd9 62 q15_t * pDst,
simon 0:1014af42efd9 63 uint32_t firstIndex,
simon 0:1014af42efd9 64 uint32_t numPoints)
simon 0:1014af42efd9 65 {
simon 0:1014af42efd9 66 q15_t *pIn1; /* inputA pointer */
simon 0:1014af42efd9 67 q15_t *pIn2; /* inputB pointer */
simon 0:1014af42efd9 68 q15_t *pOut = pDst; /* output pointer */
simon 0:1014af42efd9 69 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
simon 0:1014af42efd9 70 q15_t *px; /* Intermediate inputA pointer */
simon 0:1014af42efd9 71 q15_t *py; /* Intermediate inputB pointer */
simon 0:1014af42efd9 72 q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
simon 0:1014af42efd9 73 q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
simon 0:1014af42efd9 74 uint32_t j, k, count, check, blkCnt;
simon 0:1014af42efd9 75 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
simon 0:1014af42efd9 76 arm_status status; /* status of Partial convolution */
simon 0:1014af42efd9 77 q31_t *pb; /* 32 bit pointer for inputB buffer */
simon 0:1014af42efd9 78
simon 0:1014af42efd9 79 /* Check for range of output samples to be calculated */
simon 0:1014af42efd9 80 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
simon 0:1014af42efd9 81 {
simon 0:1014af42efd9 82 /* Set status as ARM_MATH_ARGUMENT_ERROR */
simon 0:1014af42efd9 83 status = ARM_MATH_ARGUMENT_ERROR;
simon 0:1014af42efd9 84 }
simon 0:1014af42efd9 85 else
simon 0:1014af42efd9 86 {
simon 0:1014af42efd9 87
simon 0:1014af42efd9 88 /* The algorithm implementation is based on the lengths of the inputs. */
simon 0:1014af42efd9 89 /* srcB is always made to slide across srcA. */
simon 0:1014af42efd9 90 /* So srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 91 if(srcALen >= srcBLen)
simon 0:1014af42efd9 92 {
simon 0:1014af42efd9 93 /* Initialization of inputA pointer */
simon 0:1014af42efd9 94 pIn1 = pSrcA;
simon 0:1014af42efd9 95
simon 0:1014af42efd9 96 /* Initialization of inputB pointer */
simon 0:1014af42efd9 97 pIn2 = pSrcB;
simon 0:1014af42efd9 98 }
simon 0:1014af42efd9 99 else
simon 0:1014af42efd9 100 {
simon 0:1014af42efd9 101 /* Initialization of inputA pointer */
simon 0:1014af42efd9 102 pIn1 = pSrcB;
simon 0:1014af42efd9 103
simon 0:1014af42efd9 104 /* Initialization of inputB pointer */
simon 0:1014af42efd9 105 pIn2 = pSrcA;
simon 0:1014af42efd9 106
simon 0:1014af42efd9 107 /* srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 108 j = srcBLen;
simon 0:1014af42efd9 109 srcBLen = srcALen;
simon 0:1014af42efd9 110 srcALen = j;
simon 0:1014af42efd9 111 }
simon 0:1014af42efd9 112
simon 0:1014af42efd9 113 /* Conditions to check which loopCounter holds
simon 0:1014af42efd9 114 * the first and last indices of the output samples to be calculated. */
simon 0:1014af42efd9 115 check = firstIndex + numPoints;
simon 0:1014af42efd9 116 blockSize3 = ((int32_t) check - (int32_t) srcALen);
simon 0:1014af42efd9 117 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
simon 0:1014af42efd9 118 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
simon 0:1014af42efd9 119 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
simon 0:1014af42efd9 120 (int32_t) numPoints) : 0;
simon 0:1014af42efd9 121 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
simon 0:1014af42efd9 122 (int32_t) firstIndex);
simon 0:1014af42efd9 123 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
simon 0:1014af42efd9 124
simon 0:1014af42efd9 125 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
simon 0:1014af42efd9 126 /* The function is internally
simon 0:1014af42efd9 127 * divided into three stages according to the number of multiplications that has to be
simon 0:1014af42efd9 128 * taken place between inputA samples and inputB samples. In the first stage of the
simon 0:1014af42efd9 129 * algorithm, the multiplications increase by one for every iteration.
simon 0:1014af42efd9 130 * In the second stage of the algorithm, srcBLen number of multiplications are done.
simon 0:1014af42efd9 131 * In the third stage of the algorithm, the multiplications decrease by one
simon 0:1014af42efd9 132 * for every iteration. */
simon 0:1014af42efd9 133
simon 0:1014af42efd9 134 /* Set the output pointer to point to the firstIndex
simon 0:1014af42efd9 135 * of the output sample to be calculated. */
simon 0:1014af42efd9 136 pOut = pDst + firstIndex;
simon 0:1014af42efd9 137
simon 0:1014af42efd9 138 /* --------------------------
simon 0:1014af42efd9 139 * Initializations of stage1
simon 0:1014af42efd9 140 * -------------------------*/
simon 0:1014af42efd9 141
simon 0:1014af42efd9 142 /* sum = x[0] * y[0]
simon 0:1014af42efd9 143 * sum = x[0] * y[1] + x[1] * y[0]
simon 0:1014af42efd9 144 * ....
simon 0:1014af42efd9 145 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
simon 0:1014af42efd9 146 */
simon 0:1014af42efd9 147
simon 0:1014af42efd9 148 /* In this stage the MAC operations are increased by 1 for every iteration.
simon 0:1014af42efd9 149 The count variable holds the number of MAC operations performed.
simon 0:1014af42efd9 150 Since the partial convolution starts from firstIndex
simon 0:1014af42efd9 151 Number of Macs to be performed is firstIndex + 1 */
simon 0:1014af42efd9 152 count = 1u + firstIndex;
simon 0:1014af42efd9 153
simon 0:1014af42efd9 154 /* Working pointer of inputA */
simon 0:1014af42efd9 155 px = pIn1;
simon 0:1014af42efd9 156
simon 0:1014af42efd9 157 /* Working pointer of inputB */
simon 0:1014af42efd9 158 pSrc2 = pIn2 + firstIndex;
simon 0:1014af42efd9 159 py = pSrc2;
simon 0:1014af42efd9 160
simon 0:1014af42efd9 161 /* ------------------------
simon 0:1014af42efd9 162 * Stage1 process
simon 0:1014af42efd9 163 * ----------------------*/
simon 0:1014af42efd9 164
simon 0:1014af42efd9 165 /* For loop unrolling by 4, this stage is divided into two. */
simon 0:1014af42efd9 166 /* First part of this stage computes the MAC operations less than 4 */
simon 0:1014af42efd9 167 /* Second part of this stage computes the MAC operations greater than or equal to 4 */
simon 0:1014af42efd9 168
simon 0:1014af42efd9 169 /* The first part of the stage starts here */
simon 0:1014af42efd9 170 while((count < 4u) && (blockSize1 > 0))
simon 0:1014af42efd9 171 {
simon 0:1014af42efd9 172 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 173 sum = 0;
simon 0:1014af42efd9 174
simon 0:1014af42efd9 175 /* Loop over number of MAC operations between
simon 0:1014af42efd9 176 * inputA samples and inputB samples */
simon 0:1014af42efd9 177 k = count;
simon 0:1014af42efd9 178
simon 0:1014af42efd9 179 while(k > 0u)
simon 0:1014af42efd9 180 {
simon 0:1014af42efd9 181 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 182 sum = __SMLALD(*px++, *py--, sum);
simon 0:1014af42efd9 183
simon 0:1014af42efd9 184 /* Decrement the loop counter */
simon 0:1014af42efd9 185 k--;
simon 0:1014af42efd9 186 }
simon 0:1014af42efd9 187
simon 0:1014af42efd9 188 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 189 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon 0:1014af42efd9 190
simon 0:1014af42efd9 191 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 192 py = ++pSrc2;
simon 0:1014af42efd9 193 px = pIn1;
simon 0:1014af42efd9 194
simon 0:1014af42efd9 195 /* Increment the MAC count */
simon 0:1014af42efd9 196 count++;
simon 0:1014af42efd9 197
simon 0:1014af42efd9 198 /* Decrement the loop counter */
simon 0:1014af42efd9 199 blockSize1--;
simon 0:1014af42efd9 200 }
simon 0:1014af42efd9 201
simon 0:1014af42efd9 202 /* The second part of the stage starts here */
simon 0:1014af42efd9 203 /* The internal loop, over count, is unrolled by 4 */
simon 0:1014af42efd9 204 /* To, read the last two inputB samples using SIMD:
simon 0:1014af42efd9 205 * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
simon 0:1014af42efd9 206 py = py - 1;
simon 0:1014af42efd9 207
simon 0:1014af42efd9 208 while(blockSize1 > 0)
simon 0:1014af42efd9 209 {
simon 0:1014af42efd9 210 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 211 sum = 0;
simon 0:1014af42efd9 212
simon 0:1014af42efd9 213 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 214 k = count >> 2u;
simon 0:1014af42efd9 215
simon 0:1014af42efd9 216 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 217 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 218 while(k > 0u)
simon 0:1014af42efd9 219 {
simon 0:1014af42efd9 220 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 221 /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
simon 0:1014af42efd9 222 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
simon 0:1014af42efd9 223 /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
simon 0:1014af42efd9 224 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
simon 0:1014af42efd9 225
simon 0:1014af42efd9 226 /* Decrement the loop counter */
simon 0:1014af42efd9 227 k--;
simon 0:1014af42efd9 228 }
simon 0:1014af42efd9 229
simon 0:1014af42efd9 230 /* For the next MAC operations, the pointer py is used without SIMD
simon 0:1014af42efd9 231 * So, py is incremented by 1 */
simon 0:1014af42efd9 232 py = py + 1u;
simon 0:1014af42efd9 233
simon 0:1014af42efd9 234 /* If the count is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 235 ** No loop unrolling is used. */
simon 0:1014af42efd9 236 k = count % 0x4u;
simon 0:1014af42efd9 237
simon 0:1014af42efd9 238 while(k > 0u)
simon 0:1014af42efd9 239 {
simon 0:1014af42efd9 240 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 241 sum = __SMLALD(*px++, *py--, sum);
simon 0:1014af42efd9 242
simon 0:1014af42efd9 243 /* Decrement the loop counter */
simon 0:1014af42efd9 244 k--;
simon 0:1014af42efd9 245 }
simon 0:1014af42efd9 246
simon 0:1014af42efd9 247 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 248 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon 0:1014af42efd9 249
simon 0:1014af42efd9 250 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 251 py = ++pSrc2 - 1u;
simon 0:1014af42efd9 252 px = pIn1;
simon 0:1014af42efd9 253
simon 0:1014af42efd9 254 /* Increment the MAC count */
simon 0:1014af42efd9 255 count++;
simon 0:1014af42efd9 256
simon 0:1014af42efd9 257 /* Decrement the loop counter */
simon 0:1014af42efd9 258 blockSize1--;
simon 0:1014af42efd9 259 }
simon 0:1014af42efd9 260
simon 0:1014af42efd9 261 /* --------------------------
simon 0:1014af42efd9 262 * Initializations of stage2
simon 0:1014af42efd9 263 * ------------------------*/
simon 0:1014af42efd9 264
simon 0:1014af42efd9 265 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
simon 0:1014af42efd9 266 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
simon 0:1014af42efd9 267 * ....
simon 0:1014af42efd9 268 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
simon 0:1014af42efd9 269 */
simon 0:1014af42efd9 270
simon 0:1014af42efd9 271 /* Working pointer of inputA */
simon 0:1014af42efd9 272 px = pIn1;
simon 0:1014af42efd9 273
simon 0:1014af42efd9 274 /* Working pointer of inputB */
simon 0:1014af42efd9 275 pSrc2 = pIn2 + (srcBLen - 1u);
simon 0:1014af42efd9 276 py = pSrc2;
simon 0:1014af42efd9 277
simon 0:1014af42efd9 278 /* Initialize inputB pointer of type q31 */
simon 0:1014af42efd9 279 pb = (q31_t *) (py - 1u);
simon 0:1014af42efd9 280
simon 0:1014af42efd9 281 /* count is the index by which the pointer pIn1 to be incremented */
simon 0:1014af42efd9 282 count = 1u;
simon 0:1014af42efd9 283
simon 0:1014af42efd9 284
simon 0:1014af42efd9 285 /* --------------------
simon 0:1014af42efd9 286 * Stage2 process
simon 0:1014af42efd9 287 * -------------------*/
simon 0:1014af42efd9 288
simon 0:1014af42efd9 289 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon 0:1014af42efd9 290 * So, to loop unroll over blockSize2,
simon 0:1014af42efd9 291 * srcBLen should be greater than or equal to 4 */
simon 0:1014af42efd9 292 if(srcBLen >= 4u)
simon 0:1014af42efd9 293 {
simon 0:1014af42efd9 294 /* Loop unroll over blockSize2, by 4 */
simon 0:1014af42efd9 295 blkCnt = ((uint32_t) blockSize2 >> 2u);
simon 0:1014af42efd9 296
simon 0:1014af42efd9 297 while(blkCnt > 0u)
simon 0:1014af42efd9 298 {
simon 0:1014af42efd9 299 /* Set all accumulators to zero */
simon 0:1014af42efd9 300 acc0 = 0;
simon 0:1014af42efd9 301 acc1 = 0;
simon 0:1014af42efd9 302 acc2 = 0;
simon 0:1014af42efd9 303 acc3 = 0;
simon 0:1014af42efd9 304
simon 0:1014af42efd9 305
simon 0:1014af42efd9 306 /* read x[0], x[1] samples */
simon 0:1014af42efd9 307 x0 = *(q31_t *) (px++);
simon 0:1014af42efd9 308 /* read x[1], x[2] samples */
simon 0:1014af42efd9 309 x1 = *(q31_t *) (px++);
simon 0:1014af42efd9 310
simon 0:1014af42efd9 311
simon 0:1014af42efd9 312 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 313 k = srcBLen >> 2u;
simon 0:1014af42efd9 314
simon 0:1014af42efd9 315 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 316 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 317 do
simon 0:1014af42efd9 318 {
simon 0:1014af42efd9 319 /* Read the last two inputB samples using SIMD:
simon 0:1014af42efd9 320 * y[srcBLen - 1] and y[srcBLen - 2] */
simon 0:1014af42efd9 321 c0 = *(pb--);
simon 0:1014af42efd9 322
simon 0:1014af42efd9 323 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
simon 0:1014af42efd9 324 acc0 = __SMLALDX(x0, c0, acc0);
simon 0:1014af42efd9 325
simon 0:1014af42efd9 326 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
simon 0:1014af42efd9 327 acc1 = __SMLALDX(x1, c0, acc1);
simon 0:1014af42efd9 328
simon 0:1014af42efd9 329 /* Read x[2], x[3] */
simon 0:1014af42efd9 330 x2 = *(q31_t *) (px++);
simon 0:1014af42efd9 331
simon 0:1014af42efd9 332 /* Read x[3], x[4] */
simon 0:1014af42efd9 333 x3 = *(q31_t *) (px++);
simon 0:1014af42efd9 334
simon 0:1014af42efd9 335 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
simon 0:1014af42efd9 336 acc2 = __SMLALDX(x2, c0, acc2);
simon 0:1014af42efd9 337
simon 0:1014af42efd9 338 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
simon 0:1014af42efd9 339 acc3 = __SMLALDX(x3, c0, acc3);
simon 0:1014af42efd9 340
simon 0:1014af42efd9 341 /* Read y[srcBLen - 3] and y[srcBLen - 4] */
simon 0:1014af42efd9 342 c0 = *(pb--);
simon 0:1014af42efd9 343
simon 0:1014af42efd9 344 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
simon 0:1014af42efd9 345 acc0 = __SMLALDX(x2, c0, acc0);
simon 0:1014af42efd9 346
simon 0:1014af42efd9 347 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
simon 0:1014af42efd9 348 acc1 = __SMLALDX(x3, c0, acc1);
simon 0:1014af42efd9 349
simon 0:1014af42efd9 350 /* Read x[4], x[5] */
simon 0:1014af42efd9 351 x0 = *(q31_t *) (px++);
simon 0:1014af42efd9 352
simon 0:1014af42efd9 353 /* Read x[5], x[6] */
simon 0:1014af42efd9 354 x1 = *(q31_t *) (px++);
simon 0:1014af42efd9 355
simon 0:1014af42efd9 356 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
simon 0:1014af42efd9 357 acc2 = __SMLALDX(x0, c0, acc2);
simon 0:1014af42efd9 358
simon 0:1014af42efd9 359 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
simon 0:1014af42efd9 360 acc3 = __SMLALDX(x1, c0, acc3);
simon 0:1014af42efd9 361
simon 0:1014af42efd9 362 } while(--k);
simon 0:1014af42efd9 363
simon 0:1014af42efd9 364 /* For the next MAC operations, SIMD is not used
simon 0:1014af42efd9 365 * So, the 16 bit pointer if inputB, py is updated */
simon 0:1014af42efd9 366 py = (q15_t *) pb;
simon 0:1014af42efd9 367 py = py + 1;
simon 0:1014af42efd9 368
simon 0:1014af42efd9 369 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 370 ** No loop unrolling is used. */
simon 0:1014af42efd9 371 k = srcBLen % 0x4u;
simon 0:1014af42efd9 372
simon 0:1014af42efd9 373 if(k == 1u)
simon 0:1014af42efd9 374 {
simon 0:1014af42efd9 375 /* Read y[srcBLen - 5] */
simon 0:1014af42efd9 376 c0 = *(py);
simon 0:1014af42efd9 377
simon 0:1014af42efd9 378 /* Read x[7] */
simon 0:1014af42efd9 379 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 380
simon 0:1014af42efd9 381 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 382 acc0 = __SMLALD(x0, c0, acc0);
simon 0:1014af42efd9 383 acc1 = __SMLALD(x1, c0, acc1);
simon 0:1014af42efd9 384 acc2 = __SMLALDX(x1, c0, acc2);
simon 0:1014af42efd9 385 acc3 = __SMLALDX(x3, c0, acc3);
simon 0:1014af42efd9 386 }
simon 0:1014af42efd9 387
simon 0:1014af42efd9 388 if(k == 2u)
simon 0:1014af42efd9 389 {
simon 0:1014af42efd9 390 /* Read y[srcBLen - 5], y[srcBLen - 6] */
simon 0:1014af42efd9 391 c0 = *(pb);
simon 0:1014af42efd9 392
simon 0:1014af42efd9 393 /* Read x[7], x[8] */
simon 0:1014af42efd9 394 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 395
simon 0:1014af42efd9 396 /* Read x[9] */
simon 0:1014af42efd9 397 x2 = *(q31_t *) px++;
simon 0:1014af42efd9 398
simon 0:1014af42efd9 399 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 400 acc0 = __SMLALDX(x0, c0, acc0);
simon 0:1014af42efd9 401 acc1 = __SMLALDX(x1, c0, acc1);
simon 0:1014af42efd9 402 acc2 = __SMLALDX(x3, c0, acc2);
simon 0:1014af42efd9 403 acc3 = __SMLALDX(x2, c0, acc3);
simon 0:1014af42efd9 404 }
simon 0:1014af42efd9 405
simon 0:1014af42efd9 406 if(k == 3u)
simon 0:1014af42efd9 407 {
simon 0:1014af42efd9 408 /* Read y[srcBLen - 5], y[srcBLen - 6] */
simon 0:1014af42efd9 409 c0 = *pb--;
simon 0:1014af42efd9 410
simon 0:1014af42efd9 411 /* Read x[7], x[8] */
simon 0:1014af42efd9 412 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 413
simon 0:1014af42efd9 414 /* Read x[9] */
simon 0:1014af42efd9 415 x2 = *(q31_t *) px++;
simon 0:1014af42efd9 416
simon 0:1014af42efd9 417 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 418 acc0 = __SMLALDX(x0, c0, acc0);
simon 0:1014af42efd9 419 acc1 = __SMLALDX(x1, c0, acc1);
simon 0:1014af42efd9 420 acc2 = __SMLALDX(x3, c0, acc2);
simon 0:1014af42efd9 421 acc3 = __SMLALDX(x2, c0, acc3);
simon 0:1014af42efd9 422
simon 0:1014af42efd9 423 /* Read y[srcBLen - 7] */
simon 0:1014af42efd9 424 c0 = (q15_t) (*pb >> 16);
simon 0:1014af42efd9 425
simon 0:1014af42efd9 426 /* Read x[10] */
simon 0:1014af42efd9 427 x3 = *(q31_t *) px++;
simon 0:1014af42efd9 428
simon 0:1014af42efd9 429 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 430 acc0 = __SMLALDX(x1, c0, acc0);
simon 0:1014af42efd9 431 acc1 = __SMLALD(x2, c0, acc1);
simon 0:1014af42efd9 432 acc2 = __SMLALDX(x2, c0, acc2);
simon 0:1014af42efd9 433 acc3 = __SMLALDX(x3, c0, acc3);
simon 0:1014af42efd9 434 }
simon 0:1014af42efd9 435
simon 0:1014af42efd9 436 /* Store the results in the accumulators in the destination buffer. */
simon 0:1014af42efd9 437 *__SIMD32(pOut)++ =
simon 0:1014af42efd9 438 __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
simon 0:1014af42efd9 439 *__SIMD32(pOut)++ =
simon 0:1014af42efd9 440 __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
simon 0:1014af42efd9 441
simon 0:1014af42efd9 442 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 443 px = pIn1 + (count * 4u);
simon 0:1014af42efd9 444 py = pSrc2;
simon 0:1014af42efd9 445 pb = (q31_t *) (py - 1);
simon 0:1014af42efd9 446
simon 0:1014af42efd9 447 /* Increment the pointer pIn1 index, count by 1 */
simon 0:1014af42efd9 448 count++;
simon 0:1014af42efd9 449
simon 0:1014af42efd9 450 /* Decrement the loop counter */
simon 0:1014af42efd9 451 blkCnt--;
simon 0:1014af42efd9 452 }
simon 0:1014af42efd9 453
simon 0:1014af42efd9 454 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon 0:1014af42efd9 455 ** No loop unrolling is used. */
simon 0:1014af42efd9 456 blkCnt = (uint32_t) blockSize2 % 0x4u;
simon 0:1014af42efd9 457
simon 0:1014af42efd9 458 while(blkCnt > 0u)
simon 0:1014af42efd9 459 {
simon 0:1014af42efd9 460 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 461 sum = 0;
simon 0:1014af42efd9 462
simon 0:1014af42efd9 463 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 464 k = srcBLen >> 2u;
simon 0:1014af42efd9 465
simon 0:1014af42efd9 466 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 467 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 468 while(k > 0u)
simon 0:1014af42efd9 469 {
simon 0:1014af42efd9 470 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 471 sum += (q63_t) ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 472 sum += (q63_t) ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 473 sum += (q63_t) ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 474 sum += (q63_t) ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 475
simon 0:1014af42efd9 476 /* Decrement the loop counter */
simon 0:1014af42efd9 477 k--;
simon 0:1014af42efd9 478 }
simon 0:1014af42efd9 479
simon 0:1014af42efd9 480 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 481 ** No loop unrolling is used. */
simon 0:1014af42efd9 482 k = srcBLen % 0x4u;
simon 0:1014af42efd9 483
simon 0:1014af42efd9 484 while(k > 0u)
simon 0:1014af42efd9 485 {
simon 0:1014af42efd9 486 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 487 sum += (q63_t) ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 488
simon 0:1014af42efd9 489 /* Decrement the loop counter */
simon 0:1014af42efd9 490 k--;
simon 0:1014af42efd9 491 }
simon 0:1014af42efd9 492
simon 0:1014af42efd9 493 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 494 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
simon 0:1014af42efd9 495
simon 0:1014af42efd9 496 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 497 px = pIn1 + count;
simon 0:1014af42efd9 498 py = pSrc2;
simon 0:1014af42efd9 499
simon 0:1014af42efd9 500 /* Increment the pointer pIn1 index, count by 1 */
simon 0:1014af42efd9 501 count++;
simon 0:1014af42efd9 502
simon 0:1014af42efd9 503 /* Decrement the loop counter */
simon 0:1014af42efd9 504 blkCnt--;
simon 0:1014af42efd9 505 }
simon 0:1014af42efd9 506 }
simon 0:1014af42efd9 507 else
simon 0:1014af42efd9 508 {
simon 0:1014af42efd9 509 /* If the srcBLen is not a multiple of 4,
simon 0:1014af42efd9 510 * the blockSize2 loop cannot be unrolled by 4 */
simon 0:1014af42efd9 511 blkCnt = (uint32_t) blockSize2;
simon 0:1014af42efd9 512
simon 0:1014af42efd9 513 while(blkCnt > 0u)
simon 0:1014af42efd9 514 {
simon 0:1014af42efd9 515 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 516 sum = 0;
simon 0:1014af42efd9 517
simon 0:1014af42efd9 518 /* srcBLen number of MACS should be performed */
simon 0:1014af42efd9 519 k = srcBLen;
simon 0:1014af42efd9 520
simon 0:1014af42efd9 521 while(k > 0u)
simon 0:1014af42efd9 522 {
simon 0:1014af42efd9 523 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 524 sum += (q63_t) ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 525
simon 0:1014af42efd9 526 /* Decrement the loop counter */
simon 0:1014af42efd9 527 k--;
simon 0:1014af42efd9 528 }
simon 0:1014af42efd9 529
simon 0:1014af42efd9 530 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 531 *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
simon 0:1014af42efd9 532
simon 0:1014af42efd9 533 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 534 px = pIn1 + count;
simon 0:1014af42efd9 535 py = pSrc2;
simon 0:1014af42efd9 536
simon 0:1014af42efd9 537 /* Increment the MAC count */
simon 0:1014af42efd9 538 count++;
simon 0:1014af42efd9 539
simon 0:1014af42efd9 540 /* Decrement the loop counter */
simon 0:1014af42efd9 541 blkCnt--;
simon 0:1014af42efd9 542 }
simon 0:1014af42efd9 543 }
simon 0:1014af42efd9 544
simon 0:1014af42efd9 545
simon 0:1014af42efd9 546 /* --------------------------
simon 0:1014af42efd9 547 * Initializations of stage3
simon 0:1014af42efd9 548 * -------------------------*/
simon 0:1014af42efd9 549
simon 0:1014af42efd9 550 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
simon 0:1014af42efd9 551 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
simon 0:1014af42efd9 552 * ....
simon 0:1014af42efd9 553 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
simon 0:1014af42efd9 554 * sum += x[srcALen-1] * y[srcBLen-1]
simon 0:1014af42efd9 555 */
simon 0:1014af42efd9 556
simon 0:1014af42efd9 557 /* In this stage the MAC operations are decreased by 1 for every iteration.
simon 0:1014af42efd9 558 The count variable holds the number of MAC operations performed */
simon 0:1014af42efd9 559 count = srcBLen - 1u;
simon 0:1014af42efd9 560
simon 0:1014af42efd9 561 /* Working pointer of inputA */
simon 0:1014af42efd9 562 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon 0:1014af42efd9 563 px = pSrc1;
simon 0:1014af42efd9 564
simon 0:1014af42efd9 565 /* Working pointer of inputB */
simon 0:1014af42efd9 566 pSrc2 = pIn2 + (srcBLen - 1u);
simon 0:1014af42efd9 567 pIn2 = pSrc2 - 1u;
simon 0:1014af42efd9 568 py = pIn2;
simon 0:1014af42efd9 569
simon 0:1014af42efd9 570 /* -------------------
simon 0:1014af42efd9 571 * Stage3 process
simon 0:1014af42efd9 572 * ------------------*/
simon 0:1014af42efd9 573
simon 0:1014af42efd9 574 /* For loop unrolling by 4, this stage is divided into two. */
simon 0:1014af42efd9 575 /* First part of this stage computes the MAC operations greater than 4 */
simon 0:1014af42efd9 576 /* Second part of this stage computes the MAC operations less than or equal to 4 */
simon 0:1014af42efd9 577
simon 0:1014af42efd9 578 /* The first part of the stage starts here */
simon 0:1014af42efd9 579 j = count >> 2u;
simon 0:1014af42efd9 580
simon 0:1014af42efd9 581 while((j > 0u) && (blockSize3 > 0))
simon 0:1014af42efd9 582 {
simon 0:1014af42efd9 583 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 584 sum = 0;
simon 0:1014af42efd9 585
simon 0:1014af42efd9 586 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 587 k = count >> 2u;
simon 0:1014af42efd9 588
simon 0:1014af42efd9 589 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 590 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 591 while(k > 0u)
simon 0:1014af42efd9 592 {
simon 0:1014af42efd9 593 /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
simon 0:1014af42efd9 594 * with y[srcBLen - 1], y[srcBLen - 2] respectively */
simon 0:1014af42efd9 595 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
simon 0:1014af42efd9 596 /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
simon 0:1014af42efd9 597 * with y[srcBLen - 3], y[srcBLen - 4] respectively */
simon 0:1014af42efd9 598 sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
simon 0:1014af42efd9 599
simon 0:1014af42efd9 600 /* Decrement the loop counter */
simon 0:1014af42efd9 601 k--;
simon 0:1014af42efd9 602 }
simon 0:1014af42efd9 603
simon 0:1014af42efd9 604 /* For the next MAC operations, the pointer py is used without SIMD
simon 0:1014af42efd9 605 * So, py is incremented by 1 */
simon 0:1014af42efd9 606 py = py + 1u;
simon 0:1014af42efd9 607
simon 0:1014af42efd9 608 /* If the count is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 609 ** No loop unrolling is used. */
simon 0:1014af42efd9 610 k = count % 0x4u;
simon 0:1014af42efd9 611
simon 0:1014af42efd9 612 while(k > 0u)
simon 0:1014af42efd9 613 {
simon 0:1014af42efd9 614 /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
simon 0:1014af42efd9 615 sum = __SMLALD(*px++, *py--, sum);
simon 0:1014af42efd9 616
simon 0:1014af42efd9 617 /* Decrement the loop counter */
simon 0:1014af42efd9 618 k--;
simon 0:1014af42efd9 619 }
simon 0:1014af42efd9 620
simon 0:1014af42efd9 621 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 622 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon 0:1014af42efd9 623
simon 0:1014af42efd9 624 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 625 px = ++pSrc1;
simon 0:1014af42efd9 626 py = pIn2;
simon 0:1014af42efd9 627
simon 0:1014af42efd9 628 /* Decrement the MAC count */
simon 0:1014af42efd9 629 count--;
simon 0:1014af42efd9 630
simon 0:1014af42efd9 631 /* Decrement the loop counter */
simon 0:1014af42efd9 632 blockSize3--;
simon 0:1014af42efd9 633
simon 0:1014af42efd9 634 j--;
simon 0:1014af42efd9 635 }
simon 0:1014af42efd9 636
simon 0:1014af42efd9 637 /* The second part of the stage starts here */
simon 0:1014af42efd9 638 /* SIMD is not used for the next MAC operations,
simon 0:1014af42efd9 639 * so pointer py is updated to read only one sample at a time */
simon 0:1014af42efd9 640 py = py + 1u;
simon 0:1014af42efd9 641
simon 0:1014af42efd9 642 while(blockSize3 > 0)
simon 0:1014af42efd9 643 {
simon 0:1014af42efd9 644 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 645 sum = 0;
simon 0:1014af42efd9 646
simon 0:1014af42efd9 647 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 648 k = count;
simon 0:1014af42efd9 649
simon 0:1014af42efd9 650 while(k > 0u)
simon 0:1014af42efd9 651 {
simon 0:1014af42efd9 652 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 653 /* sum += x[srcALen-1] * y[srcBLen-1] */
simon 0:1014af42efd9 654 sum = __SMLALD(*px++, *py--, sum);
simon 0:1014af42efd9 655
simon 0:1014af42efd9 656 /* Decrement the loop counter */
simon 0:1014af42efd9 657 k--;
simon 0:1014af42efd9 658 }
simon 0:1014af42efd9 659
simon 0:1014af42efd9 660 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 661 *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon 0:1014af42efd9 662
simon 0:1014af42efd9 663 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 664 px = ++pSrc1;
simon 0:1014af42efd9 665 py = pSrc2;
simon 0:1014af42efd9 666
simon 0:1014af42efd9 667 /* Decrement the MAC count */
simon 0:1014af42efd9 668 count--;
simon 0:1014af42efd9 669
simon 0:1014af42efd9 670 /* Decrement the loop counter */
simon 0:1014af42efd9 671 blockSize3--;
simon 0:1014af42efd9 672 }
simon 0:1014af42efd9 673
simon 0:1014af42efd9 674 /* set status as ARM_MATH_SUCCESS */
simon 0:1014af42efd9 675 status = ARM_MATH_SUCCESS;
simon 0:1014af42efd9 676 }
simon 0:1014af42efd9 677
simon 0:1014af42efd9 678 /* Return to application */
simon 0:1014af42efd9 679 return (status);
simon 0:1014af42efd9 680
simon 0:1014af42efd9 681 }
simon 0:1014af42efd9 682
simon 0:1014af42efd9 683 /**
simon 0:1014af42efd9 684 * @} end of PartialConv group
simon 0:1014af42efd9 685 */