CMSIS DSP library

Dependents:   performance_timer Surfboard_ gps2rtty Capstone ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_fir_f32.c Source File

arm_fir_f32.c

00001 /* ----------------------------------------------------------------------  
00002 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.  
00003 *  
00004 * $Date:        19. March 2015
00005 * $Revision:    V.1.4.5
00006 *  
00007 * Project:      CMSIS DSP Library  
00008 * Title:        arm_fir_f32.c  
00009 *  
00010 * Description:  Floating-point FIR filter processing function.  
00011 *  
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE. 
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**  
00044 * @ingroup groupFilters  
00045 */
00046 
00047 /**  
00048 * @defgroup FIR Finite Impulse Response (FIR) Filters  
00049 *  
00050 * This set of functions implements Finite Impulse Response (FIR) filters  
00051 * for Q7, Q15, Q31, and floating-point data types.  Fast versions of Q15 and Q31 are also provided.  
00052 * The functions operate on blocks of input and output data and each call to the function processes  
00053 * <code>blockSize</code> samples through the filter.  <code>pSrc</code> and  
00054 * <code>pDst</code> point to input and output arrays containing <code>blockSize</code> values.  
00055 *  
00056 * \par Algorithm:  
00057 * The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.  
00058 * Each filter coefficient <code>b[n]</code> is multiplied by a state variable which equals a previous input sample <code>x[n]</code>.  
00059 * <pre>  
00060 *    y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]  
00061 * </pre>  
00062 * \par  
00063 * \image html FIR.gif "Finite Impulse Response filter"  
00064 * \par  
00065 * <code>pCoeffs</code> points to a coefficient array of size <code>numTaps</code>.  
00066 * Coefficients are stored in time reversed order.  
00067 * \par  
00068 * <pre>  
00069 *    {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}  
00070 * </pre>  
00071 * \par  
00072 * <code>pState</code> points to a state array of size <code>numTaps + blockSize - 1</code>.  
00073 * Samples in the state buffer are stored oldest first, where <code>x[n]</code> is the first sample of the current input block.  
00074 * \par  
00075 * <pre>  
00076 *    {x[n-numTaps+1], x[n-numTaps+2], ..., x[n-1], x[n], x[n+1], ..., x[n+blockSize-1]}  
00077 * </pre>  
00078 * \par  
00079 * Note that the length of the state buffer exceeds the length of the coefficient array by <code>blockSize-1</code>.  
00080 * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,  
00081 * to be avoided and yields a significant speed improvement.  
00082 * The state variables are updated after each block of data is processed; the coefficients are untouched.  
00083 * \par Instance Structure  
00084 * The coefficients and state variables for a filter are stored together in an instance data structure.  
00085 * A separate instance structure must be defined for each filter.  
00086 * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.  
00087 * There are separate instance structure declarations for each of the 4 supported data types.  
00088 *  
00089 * \par Initialization Functions  
00090 * There is also an associated initialization function for each data type.  
00091 * The initialization function performs the following operations:  
00092 * - Sets the values of the internal structure fields.  
00093 * - Zeros out the values in the state buffer.  
00094 * To do this manually without calling the init function, assign the following subfields of the instance structure:
00095 * numTaps, pCoeffs, pState. Also set all of the values in pState to zero. 
00096 *  
00097 * \par  
00098 * Use of the initialization function is optional.  
00099 * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.  
00100 * To place an instance structure into a const data section, the instance structure must be manually initialized.  
00101 * Set the values in the state buffer to zeros before static initialization.  
00102 * The code below statically initializes each of the 4 different data type filter instance structures  
00103 * <pre>  
00104 *arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};  
00105 *arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};  
00106 *arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};  
00107 *arm_fir_instance_q7 S =  {numTaps, pState, pCoeffs};  
00108 * </pre>  
00109 *  
00110 * where <code>numTaps</code> is the number of filter coefficients in the filter; <code>pState</code> is the address of the state buffer;  
00111 * <code>pCoeffs</code> is the address of the coefficient buffer.  
00112 *  
00113 * \par Fixed-Point Behavior  
00114 * Care must be taken when using the fixed-point versions of the FIR filter functions.  
00115 * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.  
00116 * Refer to the function specific documentation below for usage guidelines.  
00117 */
00118 
00119 /**  
00120 * @addtogroup FIR  
00121 * @{  
00122 */
00123 
00124 /**  
00125 * @brief Processing function for the floating-point FIR filter.  
00126 * @param[in]  *S points to an instance of the floating-point FIR filter structure.  
00127 * @param[in]  *pSrc points to the block of input data.  
00128 * @param[out] *pDst points to the block of output data.  
00129 * @param[in]  blockSize number of samples to process per call.  
00130 * @return     none.  
00131 *  
00132 */
00133 
00134 #if defined(ARM_MATH_CM7)
00135 
00136 void arm_fir_f32(
00137 const arm_fir_instance_f32 * S,   /* filter instance; numTaps, pState and pCoeffs are read from it */
00138 float32_t * pSrc,                 /* input block; blockSize samples are read */
00139 float32_t * pDst,                 /* output block; blockSize samples are written */
00140 uint32_t blockSize)               /* number of samples processed by this call */
00141 {
00142    float32_t *pState = S->pState;                 /* State pointer; advances through the buffer as outputs are produced */
00143    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
00144    float32_t *pStateCurnt;                        /* Write position for incoming samples in the state buffer */
00145    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
00146    float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators, one per output of an 8-sample group */
00147    float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0;  /* Rotating window of state samples, and the current coefficient */
00148    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00149    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00150 
00151    /* FIR over one block. S->pState holds the (numTaps - 1) samples retained from the previous call; */
00152    /* pStateCurnt points just past them, to the location where the new input data is written */
00153    pStateCurnt = &(S->pState[(numTaps - 1u)]);
00154 
00155    /* Apply loop unrolling and compute 8 output values simultaneously.  
00156     * The variables acc0 ... acc7 hold the outputs being computed. With pState taken  
00157     * at the top of each outer iteration:  
00158     *    acc0 =  pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1]  
00159     *    acc1 =  pCoeffs[0] * pState[1] + pCoeffs[1] * pState[2] + ... + pCoeffs[numTaps-1] * pState[numTaps]  
00160     *    ...  
00161     *    acc7 =  pCoeffs[0] * pState[7] + pCoeffs[1] * pState[8] + ... + pCoeffs[numTaps-1] * pState[numTaps+6]  
00162     */
00163    blkCnt = blockSize >> 3;
00164 
00165    /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.  
00166    ** A second loop below computes the remaining 1 to 7 samples. */
00167    while(blkCnt > 0u)
00168    {
00169       /* Copy the first four of the eight new input samples into the state buffer */
00170       *pStateCurnt++ = *pSrc++;
00171       *pStateCurnt++ = *pSrc++;
00172       *pStateCurnt++ = *pSrc++;
00173       *pStateCurnt++ = *pSrc++;
00174 
00175       /* Set all accumulators to zero */
00176       acc0 = 0.0f;
00177       acc1 = 0.0f;
00178       acc2 = 0.0f;
00179       acc3 = 0.0f;
00180       acc4 = 0.0f;
00181       acc5 = 0.0f;
00182       acc6 = 0.0f;
00183       acc7 = 0.0f;      
00184 
00185       /* Initialize state pointer */
00186       px = pState;
00187 
00188       /* Initialize coeff pointer */
00189       pb = (pCoeffs);       
00190    
00191       /* Copy the remaining four input samples. This is separated from the  
00192        * first four to avoid a call to __aeabi_memmove which would be slower
00193        */
00194       *pStateCurnt++ = *pSrc++;
00195       *pStateCurnt++ = *pSrc++;
00196       *pStateCurnt++ = *pSrc++;
00197       *pStateCurnt++ = *pSrc++;
00198 
00199       /* Read the first seven samples from the state buffer; the eighth is fetched inside the tap loop */
00200       x0 = *px++;
00201       x1 = *px++;
00202       x2 = *px++;
00203       x3 = *px++;
00204       x4 = *px++;
00205       x5 = *px++;
00206       x6 = *px++;
00207 
00208       /* Loop unrolling.  Process 8 taps at a time. */
00209       tapCnt = numTaps >> 3u;
00210       
00211       /* Loop over the number of taps.  Unroll by a factor of 8.  
00212        ** Runs numTaps / 8 times; any leftover taps are handled by the loop after this one. */
00213       while(tapCnt > 0u)
00214       {
00215          /* Read the 1st coefficient of this group of 8 */
00216          c0 = *(pb++);
00217 
00218          /* Read the next state sample, completing the 8-sample window x0..x7 */
00219          x7 = *(px++);
00220 
00221          /* acc0 += current coefficient * window sample 0 */
00222          acc0 += x0 * c0;
00223 
00224          /* acc1 += current coefficient * window sample 1 */
00225          acc1 += x1 * c0;
00226 
00227          /* acc2 += current coefficient * window sample 2 */
00228          acc2 += x2 * c0;
00229 
00230          /* acc3 += current coefficient * window sample 3 */
00231          acc3 += x3 * c0;
00232 
00233          /* acc4 += current coefficient * window sample 4 */
00234          acc4 += x4 * c0;
00235 
00236          /* acc5 += current coefficient * window sample 5 */
00237          acc5 += x5 * c0;
00238 
00239          /* acc6 += current coefficient * window sample 6 */
00240          acc6 += x6 * c0;
00241 
00242          /* acc7 += current coefficient * window sample 7 */
00243          acc7 += x7 * c0;
00244          
00245          /* Read the 2nd coefficient of this group */
00246          c0 = *(pb++);
00247 
00248          /* Read the next state sample into x0, whose old value is no longer needed; the window is now x1..x7,x0 */
00249          x0 = *(px++);
00250 
00251          /* Perform the multiply-accumulates */
00252          acc0 += x1 * c0;
00253          acc1 += x2 * c0;   
00254          acc2 += x3 * c0;   
00255          acc3 += x4 * c0;   
00256          acc4 += x5 * c0;   
00257          acc5 += x6 * c0;   
00258          acc6 += x7 * c0;   
00259          acc7 += x0 * c0;   
00260          
00261          /* Read the 3rd coefficient of this group */
00262          c0 = *(pb++);
00263 
00264          /* Read the next state sample into x1; the window is now x2..x7,x0,x1 */
00265          x1 = *(px++);
00266 
00267          /* Perform the multiply-accumulates */      
00268          acc0 += x2 * c0;
00269          acc1 += x3 * c0;   
00270          acc2 += x4 * c0;   
00271          acc3 += x5 * c0;   
00272          acc4 += x6 * c0;   
00273          acc5 += x7 * c0;   
00274          acc6 += x0 * c0;   
00275          acc7 += x1 * c0;   
00276 
00277          /* Read the 4th coefficient of this group */
00278          c0 = *(pb++);
00279 
00280          /* Read the next state sample into x2; the window is now x3..x7,x0..x2 */
00281          x2 = *(px++);
00282 
00283          /* Perform the multiply-accumulates */      
00284          acc0 += x3 * c0;
00285          acc1 += x4 * c0;   
00286          acc2 += x5 * c0;   
00287          acc3 += x6 * c0;   
00288          acc4 += x7 * c0;   
00289          acc5 += x0 * c0;   
00290          acc6 += x1 * c0;   
00291          acc7 += x2 * c0;   
00292 
00293          /* Read the 5th coefficient of this group */
00294          c0 = *(pb++);
00295 
00296          /* Read the next state sample into x3; the window is now x4..x7,x0..x3 */
00297          x3 = *(px++);
00298          /* Perform the multiply-accumulates */      
00299          acc0 += x4 * c0;
00300          acc1 += x5 * c0;   
00301          acc2 += x6 * c0;   
00302          acc3 += x7 * c0;   
00303          acc4 += x0 * c0;   
00304          acc5 += x1 * c0;   
00305          acc6 += x2 * c0;   
00306          acc7 += x3 * c0;   
00307 
00308          /* Read the 6th coefficient of this group */
00309          c0 = *(pb++);
00310 
00311          /* Read the next state sample into x4; the window is now x5..x7,x0..x4 */
00312          x4 = *(px++);
00313 
00314          /* Perform the multiply-accumulates */      
00315          acc0 += x5 * c0;
00316          acc1 += x6 * c0;   
00317          acc2 += x7 * c0;   
00318          acc3 += x0 * c0;   
00319          acc4 += x1 * c0;   
00320          acc5 += x2 * c0;   
00321          acc6 += x3 * c0;   
00322          acc7 += x4 * c0;   
00323 
00324          /* Read the 7th coefficient of this group */
00325          c0 = *(pb++);
00326 
00327          /* Read the next state sample into x5; the window is now x6,x7,x0..x5 */
00328          x5 = *(px++);
00329 
00330          /* Perform the multiply-accumulates */      
00331          acc0 += x6 * c0;
00332          acc1 += x7 * c0;   
00333          acc2 += x0 * c0;   
00334          acc3 += x1 * c0;   
00335          acc4 += x2 * c0;   
00336          acc5 += x3 * c0;   
00337          acc6 += x4 * c0;   
00338          acc7 += x5 * c0;   
00339 
00340          /* Read the 8th coefficient of this group */
00341          c0 = *(pb++);
00342 
00343          /* Read the next state sample into x6; the window is now x7,x0..x6 */
00344          x6 = *(px++);
00345 
00346          /* Perform the multiply-accumulates; afterwards x0..x6 again hold the seven samples the next pass starts from */      
00347          acc0 += x7 * c0;
00348          acc1 += x0 * c0;   
00349          acc2 += x1 * c0;   
00350          acc3 += x2 * c0;   
00351          acc4 += x3 * c0;   
00352          acc5 += x4 * c0;   
00353          acc6 += x5 * c0;   
00354          acc7 += x6 * c0;   
00355 
00356          tapCnt--;
00357       }
00358 
00359       /* If the filter length is not a multiple of 8, compute the remaining filter taps */
00360       tapCnt = numTaps % 0x8u;
00361 
00362       while(tapCnt > 0u)
00363       {
00364          /* Read the next coefficient */
00365          c0 = *(pb++);
00366 
00367          /* Fetch 1 state variable */
00368          x7 = *(px++);
00369 
00370          /* Perform the multiply-accumulates */      
00371          acc0 += x0 * c0;
00372          acc1 += x1 * c0;   
00373          acc2 += x2 * c0;   
00374          acc3 += x3 * c0;   
00375          acc4 += x4 * c0;   
00376          acc5 += x5 * c0;   
00377          acc6 += x6 * c0;   
00378          acc7 += x7 * c0;   
00379 
00380          /* Shift the sample window down by one so x0..x6 are ready for the next tap */
00381          x0 = x1;
00382          x1 = x2;
00383          x2 = x3;
00384          x3 = x4;
00385          x4 = x5;
00386          x5 = x6;
00387          x6 = x7;
00388 
00389          /* Decrement the loop counter */
00390          tapCnt--;
00391       }
00392 
00393       /* Advance the state pointer by 8 to process the next group of 8 samples */
00394       pState = pState + 8;
00395 
00396       /* Store the results from the 8 accumulators in the destination buffer. */
00397       *pDst++ = acc0;
00398       *pDst++ = acc1;
00399       *pDst++ = acc2;
00400       *pDst++ = acc3;
00401       *pDst++ = acc4;
00402       *pDst++ = acc5;
00403       *pDst++ = acc6;
00404       *pDst++ = acc7;
00405 
00406       blkCnt--;
00407    }
00408 
00409    /* If the blockSize is not a multiple of 8, compute any remaining output samples here.  
00410    ** No loop unrolling is used. */
00411    blkCnt = blockSize % 0x8u;
00412 
00413    while(blkCnt > 0u)
00414    {
00415       /* Copy one sample at a time into state buffer */
00416       *pStateCurnt++ = *pSrc++;
00417 
00418       /* Set the accumulator to zero */
00419       acc0 = 0.0f;
00420 
00421       /* Initialize state pointer */
00422       px = pState;
00423 
00424       /* Initialize Coefficient pointer */
00425       pb = (pCoeffs);
00426 
00427       i = numTaps;   /* NOTE(review): the do-while below runs at least once, so numTaps >= 1 is assumed; confirm callers guarantee it */
00428 
00429       /* Perform the multiply-accumulates, one tap per iteration */
00430       do
00431       {
00432          acc0 += *px++ * *pb++;
00433          i--;
00434 
00435       } while(i > 0u);
00436 
00437       /* The result is stored in the destination buffer. */
00438       *pDst++ = acc0;
00439 
00440       /* Advance state pointer by 1 for the next sample */
00441       pState = pState + 1;
00442 
00443       blkCnt--;
00444    }
00445 
00446    /* Processing is complete.  
00447    ** Now copy the last numTaps - 1 samples to the start of the state buffer.  
00448    ** This prepares the state buffer for the next function call. */
00449 
00450    /* Destination: the start of the state buffer. The source is pState, which has advanced by blockSize samples */
00451    pStateCurnt = S->pState;
00452 
00453    tapCnt = (numTaps - 1u) >> 2u;   /* number of groups of four samples to copy */
00454 
00455    /* Copy data, four samples per iteration */
00456    while(tapCnt > 0u)
00457    {
00458       *pStateCurnt++ = *pState++;
00459       *pStateCurnt++ = *pState++;
00460       *pStateCurnt++ = *pState++;
00461       *pStateCurnt++ = *pState++;
00462 
00463       /* Decrement the loop counter */
00464       tapCnt--;
00465    }
00466 
00467    /* Calculate remaining number of copies */
00468    tapCnt = (numTaps - 1u) % 0x4u;
00469 
00470    /* Copy the remaining float32_t samples one at a time */
00471    while(tapCnt > 0u)
00472    {
00473       *pStateCurnt++ = *pState++;
00474 
00475       /* Decrement the loop counter */
00476       tapCnt--;
00477    }
00478 }
00479 
00480 #elif defined(ARM_MATH_CM0_FAMILY)
00481 
00482 void arm_fir_f32(
00483 const arm_fir_instance_f32 * S,   /* filter instance; numTaps, pState and pCoeffs are read from it */
00484 float32_t * pSrc,                 /* input block; blockSize samples are read */
00485 float32_t * pDst,                 /* output block; blockSize samples are written */
00486 uint32_t blockSize)               /* number of samples processed by this call */
00487 {
00488    float32_t *pState = S->pState;                 /* State pointer; advances by one per output sample */
00489    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
00490    float32_t *pStateCurnt;                        /* Write position for incoming samples in the state buffer */
00491    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
00492    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00493    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00494 
00495    /* Cortex-M0 variant: one output sample per iteration, no loop unrolling */
00496 
00497    float32_t acc;   /* Accumulator for the current output sample */
00498 
00499    /* S->pState holds the (numTaps - 1) samples retained from the previous call; */
00500    /* pStateCurnt points just past them, to the location where the new input data is written */
00501    pStateCurnt = &(S->pState[(numTaps - 1u)]);
00502 
00503    /* Initialize blkCnt with blockSize */
00504    blkCnt = blockSize;
00505 
00506    while(blkCnt > 0u)
00507    {
00508       /* Copy one sample at a time into state buffer */
00509       *pStateCurnt++ = *pSrc++;
00510 
00511       /* Set the accumulator to zero */
00512       acc = 0.0f;
00513 
00514       /* Initialize state pointer */
00515       px = pState;
00516 
00517       /* Initialize Coefficient pointer */
00518       pb = pCoeffs;
00519 
00520       i = numTaps;   /* NOTE(review): the do-while below runs at least once, so numTaps >= 1 is assumed; confirm callers guarantee it */
00521 
00522       /* Perform the multiply-accumulates */
00523       do
00524       {
00525          /* acc = pCoeffs[0] * pState[0] + pCoeffs[1] * pState[1] + ... + pCoeffs[numTaps-1] * pState[numTaps-1] */
00526          acc += *px++ * *pb++;
00527          i--;
00528 
00529       } while(i > 0u);
00530 
00531       /* The result is stored in the destination buffer. */
00532       *pDst++ = acc;
00533 
00534       /* Advance state pointer by 1 for the next sample */
00535       pState = pState + 1;
00536 
00537       blkCnt--;
00538    }
00539 
00540    /* Processing is complete.         
00541    ** Now copy the last numTaps - 1 samples to the start of the state buffer.       
00542    ** This prepares the state buffer for the next function call. */
00543 
00544    /* Destination: the start of the state buffer. The source is pState, which has advanced by blockSize samples */
00545    pStateCurnt = S->pState;
00546 
00547    /* Copy numTaps - 1 values */
00548    tapCnt = numTaps - 1u;
00549 
00550    /* Copy data */
00551    while(tapCnt > 0u)
00552    {
00553       *pStateCurnt++ = *pState++;
00554 
00555       /* Decrement the loop counter */
00556       tapCnt--;
00557    }
00558 
00559 }
00560 
00561 #else
00562 
00563 /* Run the below code for Cortex-M4 and Cortex-M3 */
00564 
00565 void arm_fir_f32(
00566 const arm_fir_instance_f32 * S,
00567 float32_t * pSrc,
00568 float32_t * pDst,
00569 uint32_t blockSize)
00570 {
00571    float32_t *pState = S->pState;                 /* State pointer */
00572    float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
00573    float32_t *pStateCurnt;                        /* Points to the current sample of the state */
00574    float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
00575    float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;     /* Accumulators */
00576    float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0;  /* Temporary variables to hold state and coefficient values */
00577    uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
00578    uint32_t i, tapCnt, blkCnt;                    /* Loop counters */
00579    float32_t p0,p1,p2,p3,p4,p5,p6,p7;             /* Temporary product values */
00580 
00581    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
00582    /* pStateCurnt points to the location where the new input data should be written */
00583    pStateCurnt = &(S->pState[(numTaps - 1u)]);
00584 
00585    /* Apply loop unrolling and compute 8 output values simultaneously.  
00586     * The variables acc0 ... acc7 hold output values that are being computed:  
00587     *  
00588     *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]  
00589     *    acc1 =  b[numTaps-1] * x[n-numTaps] +   b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]  
00590     *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] +   b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]  
00591     *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]  
00592     */
00593    blkCnt = blockSize >> 3;
00594 
00595    /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.  
00596    ** a second loop below computes the remaining 1 to 7 samples. */
00597    while(blkCnt > 0u)
00598    {
00599       /* Copy four new input samples into the state buffer */
00600       *pStateCurnt++ = *pSrc++;
00601       *pStateCurnt++ = *pSrc++;
00602       *pStateCurnt++ = *pSrc++;
00603       *pStateCurnt++ = *pSrc++;
00604 
00605       /* Set all accumulators to zero */
00606       acc0 = 0.0f;
00607       acc1 = 0.0f;
00608       acc2 = 0.0f;
00609       acc3 = 0.0f;
00610       acc4 = 0.0f;
00611       acc5 = 0.0f;
00612       acc6 = 0.0f;
00613       acc7 = 0.0f;      
00614 
00615       /* Initialize state pointer */
00616       px = pState;
00617 
00618       /* Initialize coeff pointer */
00619       pb = (pCoeffs);       
00620    
00621       /* This is separated from the others to avoid 
00622        * a call to __aeabi_memmove which would be slower
00623        */
00624       *pStateCurnt++ = *pSrc++;
00625       *pStateCurnt++ = *pSrc++;
00626       *pStateCurnt++ = *pSrc++;
00627       *pStateCurnt++ = *pSrc++;
00628 
00629       /* Read the first seven samples from the state buffer:  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
00630       x0 = *px++;
00631       x1 = *px++;
00632       x2 = *px++;
00633       x3 = *px++;
00634       x4 = *px++;
00635       x5 = *px++;
00636       x6 = *px++;
00637 
00638       /* Loop unrolling.  Process 8 taps at a time. */
00639       tapCnt = numTaps >> 3u;
00640       
00641       /* Loop over the number of taps.  Unroll by a factor of 8.  
00642        ** Repeat until we've computed numTaps-8 coefficients. */
00643       while(tapCnt > 0u)
00644       {
00645          /* Read the b[numTaps-1] coefficient */
00646          c0 = *(pb++);
00647 
00648          /* Read x[n-numTaps-3] sample */
00649          x7 = *(px++);
00650 
00651          /* acc0 +=  b[numTaps-1] * x[n-numTaps] */
00652          p0 = x0 * c0;
00653 
00654          /* acc1 +=  b[numTaps-1] * x[n-numTaps-1] */
00655          p1 = x1 * c0;
00656 
00657          /* acc2 +=  b[numTaps-1] * x[n-numTaps-2] */
00658          p2 = x2 * c0;
00659 
00660          /* acc3 +=  b[numTaps-1] * x[n-numTaps-3] */
00661          p3 = x3 * c0;
00662 
00663          /* acc4 +=  b[numTaps-1] * x[n-numTaps-4] */
00664          p4 = x4 * c0;
00665 
00666          /* acc1 +=  b[numTaps-1] * x[n-numTaps-5] */
00667          p5 = x5 * c0;
00668 
00669          /* acc2 +=  b[numTaps-1] * x[n-numTaps-6] */
00670          p6 = x6 * c0;
00671 
00672          /* acc3 +=  b[numTaps-1] * x[n-numTaps-7] */
00673          p7 = x7 * c0;
00674          
00675          /* Read the b[numTaps-2] coefficient */
00676          c0 = *(pb++);
00677 
00678          /* Read x[n-numTaps-4] sample */
00679          x0 = *(px++);
00680          
00681          acc0 += p0;
00682          acc1 += p1;
00683          acc2 += p2;
00684          acc3 += p3;
00685          acc4 += p4;
00686          acc5 += p5;
00687          acc6 += p6;
00688          acc7 += p7;
00689 
00690 
00691          /* Perform the multiply-accumulate */
00692          p0 = x1 * c0;
00693          p1 = x2 * c0;   
00694          p2 = x3 * c0;   
00695          p3 = x4 * c0;   
00696          p4 = x5 * c0;   
00697          p5 = x6 * c0;   
00698          p6 = x7 * c0;   
00699          p7 = x0 * c0;   
00700          
00701          /* Read the b[numTaps-3] coefficient */
00702          c0 = *(pb++);
00703 
00704          /* Read x[n-numTaps-5] sample */
00705          x1 = *(px++);
00706          
00707          acc0 += p0;
00708          acc1 += p1;
00709          acc2 += p2;
00710          acc3 += p3;
00711          acc4 += p4;
00712          acc5 += p5;
00713          acc6 += p6;
00714          acc7 += p7;
00715 
00716          /* Perform the multiply-accumulates */      
00717          p0 = x2 * c0;
00718          p1 = x3 * c0;   
00719          p2 = x4 * c0;   
00720          p3 = x5 * c0;   
00721          p4 = x6 * c0;   
00722          p5 = x7 * c0;   
00723          p6 = x0 * c0;   
00724          p7 = x1 * c0;   
00725 
00726          /* Read the b[numTaps-4] coefficient */
00727          c0 = *(pb++);
00728 
00729          /* Read x[n-numTaps-6] sample */
00730          x2 = *(px++);
00731          
00732          acc0 += p0;
00733          acc1 += p1;
00734          acc2 += p2;
00735          acc3 += p3;
00736          acc4 += p4;
00737          acc5 += p5;
00738          acc6 += p6;
00739          acc7 += p7;
00740 
00741          /* Perform the multiply-accumulates */      
00742          p0 = x3 * c0;
00743          p1 = x4 * c0;   
00744          p2 = x5 * c0;   
00745          p3 = x6 * c0;   
00746          p4 = x7 * c0;   
00747          p5 = x0 * c0;   
00748          p6 = x1 * c0;   
00749          p7 = x2 * c0;   
00750 
00751          /* Read the b[numTaps-4] coefficient */
00752          c0 = *(pb++);
00753 
00754          /* Read x[n-numTaps-6] sample */
00755          x3 = *(px++);
00756          
00757          acc0 += p0;
00758          acc1 += p1;
00759          acc2 += p2;
00760          acc3 += p3;
00761          acc4 += p4;
00762          acc5 += p5;
00763          acc6 += p6;
00764          acc7 += p7;
00765 
00766          /* Perform the multiply-accumulates */      
00767          p0 = x4 * c0;
00768          p1 = x5 * c0;   
00769          p2 = x6 * c0;   
00770          p3 = x7 * c0;   
00771          p4 = x0 * c0;   
00772          p5 = x1 * c0;   
00773          p6 = x2 * c0;   
00774          p7 = x3 * c0;   
00775 
00776          /* Read the b[numTaps-4] coefficient */
00777          c0 = *(pb++);
00778 
00779          /* Read x[n-numTaps-6] sample */
00780          x4 = *(px++);
00781          
00782          acc0 += p0;
00783          acc1 += p1;
00784          acc2 += p2;
00785          acc3 += p3;
00786          acc4 += p4;
00787          acc5 += p5;
00788          acc6 += p6;
00789          acc7 += p7;
00790 
00791          /* Perform the multiply-accumulates */      
00792          p0 = x5 * c0;
00793          p1 = x6 * c0;   
00794          p2 = x7 * c0;   
00795          p3 = x0 * c0;   
00796          p4 = x1 * c0;   
00797          p5 = x2 * c0;   
00798          p6 = x3 * c0;   
00799          p7 = x4 * c0;   
00800 
00801          /* Read the b[numTaps-4] coefficient */
00802          c0 = *(pb++);
00803 
00804          /* Read x[n-numTaps-6] sample */
00805          x5 = *(px++);
00806          
00807          acc0 += p0;
00808          acc1 += p1;
00809          acc2 += p2;
00810          acc3 += p3;
00811          acc4 += p4;
00812          acc5 += p5;
00813          acc6 += p6;
00814          acc7 += p7;
00815 
00816          /* Perform the multiply-accumulates */      
00817          p0 = x6 * c0;
00818          p1 = x7 * c0;   
00819          p2 = x0 * c0;   
00820          p3 = x1 * c0;   
00821          p4 = x2 * c0;   
00822          p5 = x3 * c0;   
00823          p6 = x4 * c0;   
00824          p7 = x5 * c0;   
00825 
00826          /* Read the b[numTaps-4] coefficient */
00827          c0 = *(pb++);
00828 
00829          /* Read x[n-numTaps-6] sample */
00830          x6 = *(px++);
00831          
00832          acc0 += p0;
00833          acc1 += p1;
00834          acc2 += p2;
00835          acc3 += p3;
00836          acc4 += p4;
00837          acc5 += p5;
00838          acc6 += p6;
00839          acc7 += p7;
00840 
00841          /* Perform the multiply-accumulates */      
00842          p0 = x7 * c0;
00843          p1 = x0 * c0;   
00844          p2 = x1 * c0;   
00845          p3 = x2 * c0;   
00846          p4 = x3 * c0;   
00847          p5 = x4 * c0;   
00848          p6 = x5 * c0;   
00849          p7 = x6 * c0;   
00850 
00851          tapCnt--;
00852          
00853          acc0 += p0;
00854          acc1 += p1;
00855          acc2 += p2;
00856          acc3 += p3;
00857          acc4 += p4;
00858          acc5 += p5;
00859          acc6 += p6;
00860          acc7 += p7;
00861       }
00862 
00863       /* If the filter length is not a multiple of 8, compute the remaining filter taps */
00864       tapCnt = numTaps % 0x8u;
00865 
00866       while(tapCnt > 0u)
00867       {
00868          /* Read coefficients */
00869          c0 = *(pb++);
00870 
00871          /* Fetch 1 state variable */
00872          x7 = *(px++);
00873 
00874          /* Perform the multiply-accumulates */      
00875          p0 = x0 * c0;
00876          p1 = x1 * c0;   
00877          p2 = x2 * c0;   
00878          p3 = x3 * c0;   
00879          p4 = x4 * c0;   
00880          p5 = x5 * c0;   
00881          p6 = x6 * c0;   
00882          p7 = x7 * c0;   
00883 
00884          /* Reuse the present sample states for next sample */
00885          x0 = x1;
00886          x1 = x2;
00887          x2 = x3;
00888          x3 = x4;
00889          x4 = x5;
00890          x5 = x6;
00891          x6 = x7;
00892          
00893          acc0 += p0;
00894          acc1 += p1;
00895          acc2 += p2;
00896          acc3 += p3;
00897          acc4 += p4;
00898          acc5 += p5;
00899          acc6 += p6;
00900          acc7 += p7;
00901 
00902          /* Decrement the loop counter */
00903          tapCnt--;
00904       }
00905 
00906       /* Advance the state pointer by 8 to process the next group of 8 samples */
00907       pState = pState + 8;
00908 
00909       /* Store the results from the 8 accumulators in the destination buffer. */
00910       *pDst++ = acc0;
00911       *pDst++ = acc1;
00912       *pDst++ = acc2;
00913       *pDst++ = acc3;
00914       *pDst++ = acc4;
00915       *pDst++ = acc5;
00916       *pDst++ = acc6;
00917       *pDst++ = acc7;
00918 
00919       blkCnt--;
00920    }
00921 
00922    /* If the blockSize is not a multiple of 8, compute any remaining output samples here.  
00923    ** No loop unrolling is used. */
00924    blkCnt = blockSize % 0x8u;
00925 
00926    while(blkCnt > 0u)
00927    {
00928       /* Copy one sample at a time into state buffer */
00929       *pStateCurnt++ = *pSrc++;
00930 
00931       /* Set the accumulator to zero */
00932       acc0 = 0.0f;
00933 
00934       /* Initialize state pointer */
00935       px = pState;
00936 
00937       /* Initialize Coefficient pointer */
00938       pb = (pCoeffs);
00939 
00940       i = numTaps;
00941 
00942       /* Perform the multiply-accumulates */
00943       do
00944       {
00945          acc0 += *px++ * *pb++;
00946          i--;
00947 
00948       } while(i > 0u);
00949 
00950       /* The result is stored in the destination buffer. */
00951       *pDst++ = acc0;
00952 
00953       /* Advance state pointer by 1 for the next sample */
00954       pState = pState + 1;
00955 
00956       blkCnt--;
00957    }
00958 
00959    /* Processing is complete.  
00960    ** Now copy the last numTaps - 1 samples to the start of the state buffer.  
00961    ** This prepares the state buffer for the next function call. */
00962 
00963    /* Points to the start of the state buffer */
00964    pStateCurnt = S->pState;
00965 
00966    tapCnt = (numTaps - 1u) >> 2u;
00967 
00968    /* copy data */
00969    while(tapCnt > 0u)
00970    {
00971       *pStateCurnt++ = *pState++;
00972       *pStateCurnt++ = *pState++;
00973       *pStateCurnt++ = *pState++;
00974       *pStateCurnt++ = *pState++;
00975 
00976       /* Decrement the loop counter */
00977       tapCnt--;
00978    }
00979 
00980    /* Calculate remaining number of copies */
00981    tapCnt = (numTaps - 1u) % 0x4u;
00982 
00983    /* Copy the remaining float32_t data */
00984    while(tapCnt > 0u)
00985    {
00986       *pStateCurnt++ = *pState++;
00987 
00988       /* Decrement the loop counter */
00989       tapCnt--;
00990    }
00991 }
00992 
00993 #endif 
00994 
00995 /**  
00996 * @} end of FIR group  
00997 */