STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_fir_sparse_q31.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_sparse_q31.c
9 *
10 * Description: Q31 sparse FIR filter processing function.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * ------------------------------------------------------------------- */
40 #include "arm_math.h"
41 
42 
68  q31_t * pSrc,
69  q31_t * pDst,
70  q31_t * pScratchIn,
71  uint32_t blockSize)
72 {
73 
74  q31_t *pState = S->pState; /* State pointer */
75  q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
76  q31_t *px; /* Scratch buffer pointer */
77  q31_t *py = pState; /* Temporary pointers for state buffer */
78  q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
79  q31_t *pOut; /* Destination pointer */
80  q63_t out; /* Temporary output variable */
81  int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
82  uint32_t delaySize = S->maxDelay + blockSize; /* state length */
83  uint16_t numTaps = S->numTaps; /* Filter order */
84  int32_t readIndex; /* Read index of the state buffer */
85  uint32_t tapCnt, blkCnt; /* loop counters */
86  q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
87  q31_t in;
88 
89 
90  /* blockSize input samples are copied into the state buffer. */
91  /* S->stateIndex gives the position in the state buffer at which writing starts. */
92  arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
93  (int32_t *) pSrc, 1, blockSize);
94 
95  /* Compute the index in the state buffer from which the samples for the first tap are read. */
96  readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
97 
98  /* Wraparound of readIndex */
99  if(readIndex < 0)
100  {
101  readIndex += (int32_t) delaySize;
102  }
103 
104  /* Working pointer for state buffer is updated */
105  py = pState;
106 
107  /* blockSize samples are read from the state buffer */
108  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
109  (int32_t *) pb, (int32_t *) pb, blockSize, 1,
110  blockSize);
111 
112  /* Working pointer for the scratch buffer of state values */
113  px = pb;
114 
115  /* Working pointer for the destination (output) buffer */
116  pOut = pDst;
117 
118 
119 #ifndef ARM_MATH_CM0_FAMILY
120 
121  /* Run the below code for Cortex-M4 and Cortex-M3 */
122 
123  /* Loop over the blockSize. Unroll by a factor of 4.
124  * Compute 4 Multiplications at a time. */
125  blkCnt = blockSize >> 2;
126 
127  while(blkCnt > 0u)
128  {
129  /* Perform Multiplications and store in the destination buffer */
130  *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
131  *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
132  *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
133  *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
134 
135  /* Decrement the loop counter */
136  blkCnt--;
137  }
138 
139  /* If the blockSize is not a multiple of 4,
140  * compute the remaining samples */
141  blkCnt = blockSize % 0x4u;
142 
143  while(blkCnt > 0u)
144  {
145  /* Perform Multiplications and store in the destination buffer */
146  *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
147 
148  /* Decrement the loop counter */
149  blkCnt--;
150  }
151 
152  /* Load the coefficient value and
153  * increment the coefficient buffer for the next set of state values */
154  coeff = *pCoeffs++;
155 
156  /* Compute the index in the state buffer from which the samples for the next tap are read. */
157  readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
158 
159  /* Wraparound of readIndex */
160  if(readIndex < 0)
161  {
162  readIndex += (int32_t) delaySize;
163  }
164 
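165  /* Loop over the middle taps: the first tap was processed above and the last is processed after this loop, hence numTaps - 2. NOTE(review): the unsigned subtraction wraps if numTaps < 2 - confirm callers guarantee numTaps >= 2. */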
166  tapCnt = (uint32_t) numTaps - 2u;
167 
168  while(tapCnt > 0u)
169  {
170  /* Working pointer for state buffer is updated */
171  py = pState;
172 
173  /* blockSize samples are read from the state buffer */
174  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
175  (int32_t *) pb, (int32_t *) pb, blockSize, 1,
176  blockSize);
177 
178  /* Working pointer for the scratch buffer of state values */
179  px = pb;
180 
181  /* Working pointer for the destination (output) buffer */
182  pOut = pDst;
183 
184  /* Loop over the blockSize. Unroll by a factor of 4.
185  * Compute 4 MACS at a time. */
186  blkCnt = blockSize >> 2;
187 
188  while(blkCnt > 0u)
189  {
190  out = *pOut;
191  out += ((q63_t) * px++ * coeff) >> 32;
192  *pOut++ = (q31_t) (out);
193 
194  out = *pOut;
195  out += ((q63_t) * px++ * coeff) >> 32;
196  *pOut++ = (q31_t) (out);
197 
198  out = *pOut;
199  out += ((q63_t) * px++ * coeff) >> 32;
200  *pOut++ = (q31_t) (out);
201 
202  out = *pOut;
203  out += ((q63_t) * px++ * coeff) >> 32;
204  *pOut++ = (q31_t) (out);
205 
206  /* Decrement the loop counter */
207  blkCnt--;
208  }
209 
210  /* If the blockSize is not a multiple of 4,
211  * compute the remaining samples */
212  blkCnt = blockSize % 0x4u;
213 
214  while(blkCnt > 0u)
215  {
216  /* Perform Multiply-Accumulate */
217  out = *pOut;
218  out += ((q63_t) * px++ * coeff) >> 32;
219  *pOut++ = (q31_t) (out);
220 
221  /* Decrement the loop counter */
222  blkCnt--;
223  }
224 
225  /* Load the coefficient value and
226  * increment the coefficient buffer for the next set of state values */
227  coeff = *pCoeffs++;
228 
229  /* Compute the index in the state buffer from which the samples for the next tap are read. */
230  readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
231 
232  /* Wraparound of readIndex */
233  if(readIndex < 0)
234  {
235  readIndex += (int32_t) delaySize;
236  }
237 
238  /* Decrement the tap loop counter */
239  tapCnt--;
240  }
241 
242  /* Compute last tap without the final read of pTapDelay */
243 
244  /* Working pointer for state buffer is updated */
245  py = pState;
246 
247  /* blockSize samples are read from the state buffer */
248  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
249  (int32_t *) pb, (int32_t *) pb, blockSize, 1,
250  blockSize);
251 
252  /* Working pointer for the scratch buffer of state values */
253  px = pb;
254 
255  /* Working pointer for the destination (output) buffer */
256  pOut = pDst;
257 
258  /* Loop over the blockSize. Unroll by a factor of 4.
259  * Compute 4 MACS at a time. */
260  blkCnt = blockSize >> 2;
261 
262  while(blkCnt > 0u)
263  {
264  out = *pOut;
265  out += ((q63_t) * px++ * coeff) >> 32;
266  *pOut++ = (q31_t) (out);
267 
268  out = *pOut;
269  out += ((q63_t) * px++ * coeff) >> 32;
270  *pOut++ = (q31_t) (out);
271 
272  out = *pOut;
273  out += ((q63_t) * px++ * coeff) >> 32;
274  *pOut++ = (q31_t) (out);
275 
276  out = *pOut;
277  out += ((q63_t) * px++ * coeff) >> 32;
278  *pOut++ = (q31_t) (out);
279 
280  /* Decrement the loop counter */
281  blkCnt--;
282  }
283 
284  /* If the blockSize is not a multiple of 4,
285  * compute the remaining samples */
286  blkCnt = blockSize % 0x4u;
287 
288  while(blkCnt > 0u)
289  {
290  /* Perform Multiply-Accumulate */
291  out = *pOut;
292  out += ((q63_t) * px++ * coeff) >> 32;
293  *pOut++ = (q31_t) (out);
294 
295  /* Decrement the loop counter */
296  blkCnt--;
297  }
298 
299  /* Working output pointer is updated */
300  pOut = pDst;
301 
302  /* Output is converted into 1.31 format. */
303  /* Loop over the blockSize. Unroll by a factor of 4.
304  * process 4 output samples at a time. */
305  blkCnt = blockSize >> 2;
306 
307  while(blkCnt > 0u)
308  {
309  in = *pOut << 1;
310  *pOut++ = in;
311  in = *pOut << 1;
312  *pOut++ = in;
313  in = *pOut << 1;
314  *pOut++ = in;
315  in = *pOut << 1;
316  *pOut++ = in;
317 
318  /* Decrement the loop counter */
319  blkCnt--;
320  }
321 
322  /* If the blockSize is not a multiple of 4,
323  * process the remaining output samples */
324  blkCnt = blockSize % 0x4u;
325 
326  while(blkCnt > 0u)
327  {
328  in = *pOut << 1;
329  *pOut++ = in;
330 
331  /* Decrement the loop counter */
332  blkCnt--;
333  }
334 
335 #else
336 
337  /* Run the below code for Cortex-M0 */
338  blkCnt = blockSize;
339 
340  while(blkCnt > 0u)
341  {
342  /* Perform Multiplications and store in the destination buffer */
343  *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
344 
345  /* Decrement the loop counter */
346  blkCnt--;
347  }
348 
349  /* Load the coefficient value and
350  * increment the coefficient buffer for the next set of state values */
351  coeff = *pCoeffs++;
352 
353  /* Compute the index in the state buffer from which the samples for the next tap are read. */
354  readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
355 
356  /* Wraparound of readIndex */
357  if(readIndex < 0)
358  {
359  readIndex += (int32_t) delaySize;
360  }
361 
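362  /* Loop over the middle taps: the first tap was processed above and the last is processed after this loop, hence numTaps - 2. NOTE(review): the unsigned subtraction wraps if numTaps < 2 - confirm callers guarantee numTaps >= 2. */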
363  tapCnt = (uint32_t) numTaps - 2u;
364 
365  while(tapCnt > 0u)
366  {
367  /* Working pointer for state buffer is updated */
368  py = pState;
369 
370  /* blockSize samples are read from the state buffer */
371  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
372  (int32_t *) pb, (int32_t *) pb, blockSize, 1,
373  blockSize);
374 
375  /* Working pointer for the scratch buffer of state values */
376  px = pb;
377 
378  /* Working pointer for the destination (output) buffer */
379  pOut = pDst;
380 
381  blkCnt = blockSize;
382 
383  while(blkCnt > 0u)
384  {
385  /* Perform Multiply-Accumulate */
386  out = *pOut;
387  out += ((q63_t) * px++ * coeff) >> 32;
388  *pOut++ = (q31_t) (out);
389 
390  /* Decrement the loop counter */
391  blkCnt--;
392  }
393 
394  /* Load the coefficient value and
395  * increment the coefficient buffer for the next set of state values */
396  coeff = *pCoeffs++;
397 
398  /* Compute the index in the state buffer from which the samples for the next tap are read. */
399  readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
400 
401  /* Wraparound of readIndex */
402  if(readIndex < 0)
403  {
404  readIndex += (int32_t) delaySize;
405  }
406 
407  /* Decrement the tap loop counter */
408  tapCnt--;
409  }
410 
411  /* Compute last tap without the final read of pTapDelay */
412 
413  /* Working pointer for state buffer is updated */
414  py = pState;
415 
416  /* blockSize samples are read from the state buffer */
417  arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
418  (int32_t *) pb, (int32_t *) pb, blockSize, 1,
419  blockSize);
420 
421  /* Working pointer for the scratch buffer of state values */
422  px = pb;
423 
424  /* Working pointer for the destination (output) buffer */
425  pOut = pDst;
426 
427  blkCnt = blockSize;
428 
429  while(blkCnt > 0u)
430  {
431  /* Perform Multiply-Accumulate */
432  out = *pOut;
433  out += ((q63_t) * px++ * coeff) >> 32;
434  *pOut++ = (q31_t) (out);
435 
436  /* Decrement the loop counter */
437  blkCnt--;
438  }
439 
440  /* Working output pointer is updated */
441  pOut = pDst;
442 
443  /* Output is converted into 1.31 format. */
444  blkCnt = blockSize;
445 
446  while(blkCnt > 0u)
447  {
448  in = *pOut << 1;
449  *pOut++ = in;
450 
451  /* Decrement the loop counter */
452  blkCnt--;
453  }
454 
455 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
456 
457 }
458 
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
Instance structure for the Q31 sparse FIR filter.
Definition: arm_math.h:4443
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_fir_sparse_q31(arm_fir_sparse_instance_q31 *S, q31_t *pSrc, q31_t *pDst, q31_t *pScratchIn, uint32_t blockSize)
Processing function for the Q31 sparse FIR filter.