STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_fir_sparse_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_sparse_q15.c
9 *
10 * Description: Q15 sparse FIR filter processing function.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * ------------------------------------------------------------------- */
40 #include "arm_math.h"
41 
70  q15_t * pSrc,
71  q15_t * pDst,
72  q15_t * pScratchIn,
73  q31_t * pScratchOut,
74  uint32_t blockSize)
75 {
76 
77  q15_t *pState = S->pState; /* State pointer */
78  q15_t *pIn = pSrc; /* Working pointer for input */
79  q15_t *pOut = pDst; /* Working pointer for output */
80  q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
81  q15_t *px; /* Temporary pointers for scratch buffer */
82  q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
83  q15_t *py = pState; /* Temporary pointers for state buffer */
84  int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
85  uint32_t delaySize = S->maxDelay + blockSize; /* state length */
86  uint16_t numTaps = S->numTaps; /* Filter order */
87  int32_t readIndex; /* Read index of the state buffer */
88  uint32_t tapCnt, blkCnt; /* loop counters */
89  q15_t coeff = *pCoeffs++; /* Read the first coefficient value */
90  q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */
91 
92 
93 #ifndef ARM_MATH_CM0_FAMILY
94 
95  /* Run the below code for Cortex-M4 and Cortex-M3 */
96 
97  q31_t in1, in2; /* Temporary variables */
98 
99 
100  /* Copy blockSize input samples into the state buffer. */
101  /* S->stateIndex gives the position in the state buffer where writing starts. */
102  arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
103 
104  /* Provisional value only: tapCnt is reassigned to numTaps - 2 before it is first read. */
105  tapCnt = numTaps;
106 
107  /* Read Index, from where the state buffer should be read, is calculated. */
108  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
109 
110  /* Wraparound of readIndex */
111  if(readIndex < 0)
112  {
113  readIndex += (int32_t) delaySize;
114  }
115 
116  /* Working pointer for state buffer is updated */
117  py = pState;
118 
119  /* blockSize samples are read from the state buffer */
120  arm_circularRead_q15(py, delaySize, &readIndex, 1,
121  pb, pb, blockSize, 1, blockSize);
122 
123  /* Working pointer for the scratch buffer of state values */
124  px = pb;
125 
126  /* Working pointer for scratch buffer of output values */
127  pScratchOut = pScr2;
128 
129  /* Loop over the blockSize. Unroll by a factor of 4.
130  * Compute 4 multiplications at a time. */
131  blkCnt = blockSize >> 2;
132 
133  while(blkCnt > 0u)
134  {
135  /* Perform multiplication and store in the scratch buffer */
136  *pScratchOut++ = ((q31_t) * px++ * coeff);
137  *pScratchOut++ = ((q31_t) * px++ * coeff);
138  *pScratchOut++ = ((q31_t) * px++ * coeff);
139  *pScratchOut++ = ((q31_t) * px++ * coeff);
140 
141  /* Decrement the loop counter */
142  blkCnt--;
143  }
144 
145  /* If the blockSize is not a multiple of 4,
146  * compute the remaining samples */
147  blkCnt = blockSize % 0x4u;
148 
149  while(blkCnt > 0u)
150  {
151  /* Perform multiplication and store in the scratch buffer */
152  *pScratchOut++ = ((q31_t) * px++ * coeff);
153 
154  /* Decrement the loop counter */
155  blkCnt--;
156  }
157 
158  /* Load the coefficient value and
159  * increment the coefficient buffer for the next set of state values */
160  coeff = *pCoeffs++;
161 
162  /* Read Index, from where the state buffer should be read, is calculated. */
163  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
164 
165  /* Wraparound of readIndex */
166  if(readIndex < 0)
167  {
168  readIndex += (int32_t) delaySize;
169  }
170 
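171  /* Middle taps only: the first tap is done above and the last one after this loop. NOTE: for numTaps < 2 this unsigned subtraction wraps to a huge count. */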
172  tapCnt = (uint32_t) numTaps - 2u;
173 
174  while(tapCnt > 0u)
175  {
176  /* Working pointer for state buffer is updated */
177  py = pState;
178 
179  /* blockSize samples are read from the state buffer */
180  arm_circularRead_q15(py, delaySize, &readIndex, 1,
181  pb, pb, blockSize, 1, blockSize);
182 
183  /* Working pointer for the scratch buffer of state values */
184  px = pb;
185 
186  /* Working pointer for scratch buffer of output values */
187  pScratchOut = pScr2;
188 
189  /* Loop over the blockSize. Unroll by a factor of 4.
190  * Compute 4 MACS at a time. */
191  blkCnt = blockSize >> 2;
192 
193  while(blkCnt > 0u)
194  {
195  /* Perform Multiply-Accumulate */
196  *pScratchOut++ += (q31_t) * px++ * coeff;
197  *pScratchOut++ += (q31_t) * px++ * coeff;
198  *pScratchOut++ += (q31_t) * px++ * coeff;
199  *pScratchOut++ += (q31_t) * px++ * coeff;
200 
201  /* Decrement the loop counter */
202  blkCnt--;
203  }
204 
205  /* If the blockSize is not a multiple of 4,
206  * compute the remaining samples */
207  blkCnt = blockSize % 0x4u;
208 
209  while(blkCnt > 0u)
210  {
211  /* Perform Multiply-Accumulate */
212  *pScratchOut++ += (q31_t) * px++ * coeff;
213 
214  /* Decrement the loop counter */
215  blkCnt--;
216  }
217 
218  /* Load the coefficient value and
219  * increment the coefficient buffer for the next set of state values */
220  coeff = *pCoeffs++;
221 
222  /* Read Index, from where the state buffer should be read, is calculated. */
223  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
224 
225  /* Wraparound of readIndex */
226  if(readIndex < 0)
227  {
228  readIndex += (int32_t) delaySize;
229  }
230 
231  /* Decrement the tap loop counter */
232  tapCnt--;
233  }
234 
235  /* Compute last tap without the final read of pTapDelay */
236 
237  /* Working pointer for state buffer is updated */
238  py = pState;
239 
240  /* blockSize samples are read from the state buffer */
241  arm_circularRead_q15(py, delaySize, &readIndex, 1,
242  pb, pb, blockSize, 1, blockSize);
243 
244  /* Working pointer for the scratch buffer of state values */
245  px = pb;
246 
247  /* Working pointer for scratch buffer of output values */
248  pScratchOut = pScr2;
249 
250  /* Loop over the blockSize. Unroll by a factor of 4.
251  * Compute 4 MACS at a time. */
252  blkCnt = blockSize >> 2;
253 
254  while(blkCnt > 0u)
255  {
256  /* Perform Multiply-Accumulate */
257  *pScratchOut++ += (q31_t) * px++ * coeff;
258  *pScratchOut++ += (q31_t) * px++ * coeff;
259  *pScratchOut++ += (q31_t) * px++ * coeff;
260  *pScratchOut++ += (q31_t) * px++ * coeff;
261 
262  /* Decrement the loop counter */
263  blkCnt--;
264  }
265 
266  /* If the blockSize is not a multiple of 4,
267  * compute the remaining samples */
268  blkCnt = blockSize % 0x4u;
269 
270  while(blkCnt > 0u)
271  {
272  /* Perform Multiply-Accumulate */
273  *pScratchOut++ += (q31_t) * px++ * coeff;
274 
275  /* Decrement the loop counter */
276  blkCnt--;
277  }
278 
279  /* The accumulated results are held in the scratch output buffer, read here through pScr2.
280  Shift each one right by 15 to get 1.15 format, saturate to 16 bits and store it in the destination buffer. */
281  /* Loop over the blockSize. */
282  blkCnt = blockSize >> 2;
283 
284  while(blkCnt > 0u)
285  {
286  in1 = *pScr2++;
287  in2 = *pScr2++;
288 
289 #ifndef ARM_MATH_BIG_ENDIAN
290 
291  *__SIMD32(pOut)++ =
292  __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
293  16);
294 
295 #else
296  *__SIMD32(pOut)++ =
297  __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
298  16);
299 
300 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
301 
302  in1 = *pScr2++;
303 
304  in2 = *pScr2++;
305 
306 #ifndef ARM_MATH_BIG_ENDIAN
307 
308  *__SIMD32(pOut)++ =
309  __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
310  16);
311 
312 #else
313 
314  *__SIMD32(pOut)++ =
315  __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
316  16);
317 
318 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
319 
320 
321  blkCnt--;
322 
323  }
324 
325  /* If the blockSize is not a multiple of 4,
326  remaining samples are processed in the below loop */
327  blkCnt = blockSize % 0x4u;
328 
329  while(blkCnt > 0u)
330  {
331  *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
332  blkCnt--;
333  }
334 
335 #else
336 
337  /* Run the below code for Cortex-M0 */
338 
339  /* Copy blockSize input samples into the state buffer. */
340  /* S->stateIndex gives the position in the state buffer where writing starts. */
341  arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
342 
343  /* Provisional value only: tapCnt is reassigned to numTaps - 2 before it is first read. */
344  tapCnt = numTaps;
345 
346  /* Read Index, from where the state buffer should be read, is calculated. */
347  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
348 
349  /* Wraparound of readIndex */
350  if(readIndex < 0)
351  {
352  readIndex += (int32_t) delaySize;
353  }
354 
355  /* Working pointer for state buffer is updated */
356  py = pState;
357 
358  /* blockSize samples are read from the state buffer */
359  arm_circularRead_q15(py, delaySize, &readIndex, 1,
360  pb, pb, blockSize, 1, blockSize);
361 
362  /* Working pointer for the scratch buffer of state values */
363  px = pb;
364 
365  /* Working pointer for scratch buffer of output values */
366  pScratchOut = pScr2;
367 
368  blkCnt = blockSize;
369 
370  while(blkCnt > 0u)
371  {
372  /* Perform multiplication and store in the scratch buffer */
373  *pScratchOut++ = ((q31_t) * px++ * coeff);
374 
375  /* Decrement the loop counter */
376  blkCnt--;
377  }
378 
379  /* Load the coefficient value and
380  * increment the coefficient buffer for the next set of state values */
381  coeff = *pCoeffs++;
382 
383  /* Read Index, from where the state buffer should be read, is calculated. */
384  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
385 
386  /* Wraparound of readIndex */
387  if(readIndex < 0)
388  {
389  readIndex += (int32_t) delaySize;
390  }
391 
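392  /* Middle taps only: the first tap is done above and the last one after this loop. NOTE: for numTaps < 2 this unsigned subtraction wraps to a huge count. */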
393  tapCnt = (uint32_t) numTaps - 2u;
394 
395  while(tapCnt > 0u)
396  {
397  /* Working pointer for state buffer is updated */
398  py = pState;
399 
400  /* blockSize samples are read from the state buffer */
401  arm_circularRead_q15(py, delaySize, &readIndex, 1,
402  pb, pb, blockSize, 1, blockSize);
403 
404  /* Working pointer for the scratch buffer of state values */
405  px = pb;
406 
407  /* Working pointer for scratch buffer of output values */
408  pScratchOut = pScr2;
409 
410  blkCnt = blockSize;
411 
412  while(blkCnt > 0u)
413  {
414  /* Perform Multiply-Accumulate */
415  *pScratchOut++ += (q31_t) * px++ * coeff;
416 
417  /* Decrement the loop counter */
418  blkCnt--;
419  }
420 
421  /* Load the coefficient value and
422  * increment the coefficient buffer for the next set of state values */
423  coeff = *pCoeffs++;
424 
425  /* Read Index, from where the state buffer should be read, is calculated. */
426  readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
427 
428  /* Wraparound of readIndex */
429  if(readIndex < 0)
430  {
431  readIndex += (int32_t) delaySize;
432  }
433 
434  /* Decrement the tap loop counter */
435  tapCnt--;
436  }
437 
438  /* Compute last tap without the final read of pTapDelay */
439 
440  /* Working pointer for state buffer is updated */
441  py = pState;
442 
443  /* blockSize samples are read from the state buffer */
444  arm_circularRead_q15(py, delaySize, &readIndex, 1,
445  pb, pb, blockSize, 1, blockSize);
446 
447  /* Working pointer for the scratch buffer of state values */
448  px = pb;
449 
450  /* Working pointer for scratch buffer of output values */
451  pScratchOut = pScr2;
452 
453  blkCnt = blockSize;
454 
455  while(blkCnt > 0u)
456  {
457  /* Perform Multiply-Accumulate */
458  *pScratchOut++ += (q31_t) * px++ * coeff;
459 
460  /* Decrement the loop counter */
461  blkCnt--;
462  }
463 
464  /* The accumulated results are held in the scratch output buffer, read here through pScr2.
465  Shift each one right by 15 to get 1.15 format, saturate to 16 bits and store it in the destination buffer. */
466  /* Loop over the blockSize. */
467  blkCnt = blockSize;
468 
469  while(blkCnt > 0u)
470  {
471  *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
472  blkCnt--;
473  }
474 
475 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
476 
477 }
478 
Instance structure for the Q15 sparse FIR filter.
Definition: arm_math.h:4456
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_fir_sparse_q15(arm_fir_sparse_instance_q15 *S, q15_t *pSrc, q15_t *pDst, q15_t *pScratchIn, q31_t *pScratchOut, uint32_t blockSize)
Processing function for the Q15 sparse FIR filter.