STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_fir_sparse_q7.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_sparse_q7.c
9 *
10 * Description: Q7 sparse FIR filter processing function.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * ------------------------------------------------------------------- */
40 #include "arm_math.h"
41 
42 
75  q7_t * pSrc,
76  q7_t * pDst,
77  q7_t * pScratchIn,
78  q31_t * pScratchOut,
79  uint32_t blockSize)
80 {
81 
82  q7_t *pState = S->pState; /* State pointer */
83  q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
84  q7_t *px; /* Scratch buffer pointer */
85  q7_t *py = pState; /* Temporary pointers for state buffer */
86  q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
87  q7_t *pOut = pDst; /* Destination pointer */
88  int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
89  uint32_t delaySize = S->maxDelay + blockSize; /* state length */
90  uint16_t numTaps = S->numTaps; /* Filter order */
91  int32_t readIndex; /* Read index of the state buffer */
92  uint32_t tapCnt, blkCnt; /* loop counters */
93  q7_t coeff = *pCoeffs++; /* Read the coefficient value */
94  q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
95  q31_t in;
96 
97 
98 #ifndef ARM_MATH_CM0_FAMILY
99 
100  /* Run the below code for Cortex-M4 and Cortex-M3 */
101 
102  q7_t in1, in2, in3, in4;
103 
104  /* BlockSize of Input samples are copied into the state buffer */
105  /* StateIndex points to the starting position to write in the state buffer */
106  arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
107  blockSize);
108 
109  /* Loop over the number of taps. */
110  tapCnt = numTaps;
111 
112  /* Read Index, from where the state buffer should be read, is calculated. */
113  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
114 
115  /* Wraparound of readIndex */
116  if(readIndex < 0)
117  {
118  readIndex += (int32_t) delaySize;
119  }
120 
121  /* Working pointer for state buffer is updated */
122  py = pState;
123 
124  /* blockSize samples are read from the state buffer */
125  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
126  (int32_t) blockSize, 1, blockSize);
127 
128  /* Working pointer for the scratch buffer of state values */
129  px = pb;
130 
131  /* Working pointer for scratch buffer of output values */
132  pScratchOut = pScr2;
133 
134  /* Loop over the blockSize. Unroll by a factor of 4.
135  * Compute 4 multiplications at a time. */
136  blkCnt = blockSize >> 2;
137 
138  while(blkCnt > 0u)
139  {
140  /* Perform multiplication and store in the scratch buffer */
141  *pScratchOut++ = ((q31_t) * px++ * coeff);
142  *pScratchOut++ = ((q31_t) * px++ * coeff);
143  *pScratchOut++ = ((q31_t) * px++ * coeff);
144  *pScratchOut++ = ((q31_t) * px++ * coeff);
145 
146  /* Decrement the loop counter */
147  blkCnt--;
148  }
149 
150  /* If the blockSize is not a multiple of 4,
151  * compute the remaining samples */
152  blkCnt = blockSize % 0x4u;
153 
154  while(blkCnt > 0u)
155  {
156  /* Perform multiplication and store in the scratch buffer */
157  *pScratchOut++ = ((q31_t) * px++ * coeff);
158 
159  /* Decrement the loop counter */
160  blkCnt--;
161  }
162 
163  /* Load the coefficient value and
164  * increment the coefficient buffer for the next set of state values */
165  coeff = *pCoeffs++;
166 
167  /* Read Index, from where the state buffer should be read, is calculated. */
168  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
169 
170  /* Wraparound of readIndex */
171  if(readIndex < 0)
172  {
173  readIndex += (int32_t) delaySize;
174  }
175 
176  /* Loop over the number of taps. */
177  tapCnt = (uint32_t) numTaps - 2u;
178 
179  while(tapCnt > 0u)
180  {
181  /* Working pointer for state buffer is updated */
182  py = pState;
183 
184  /* blockSize samples are read from the state buffer */
185  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
186  (int32_t) blockSize, 1, blockSize);
187 
188  /* Working pointer for the scratch buffer of state values */
189  px = pb;
190 
191  /* Working pointer for scratch buffer of output values */
192  pScratchOut = pScr2;
193 
194  /* Loop over the blockSize. Unroll by a factor of 4.
195  * Compute 4 MACS at a time. */
196  blkCnt = blockSize >> 2;
197 
198  while(blkCnt > 0u)
199  {
200  /* Perform Multiply-Accumulate */
201  in = *pScratchOut + ((q31_t) * px++ * coeff);
202  *pScratchOut++ = in;
203  in = *pScratchOut + ((q31_t) * px++ * coeff);
204  *pScratchOut++ = in;
205  in = *pScratchOut + ((q31_t) * px++ * coeff);
206  *pScratchOut++ = in;
207  in = *pScratchOut + ((q31_t) * px++ * coeff);
208  *pScratchOut++ = in;
209 
210  /* Decrement the loop counter */
211  blkCnt--;
212  }
213 
214  /* If the blockSize is not a multiple of 4,
215  * compute the remaining samples */
216  blkCnt = blockSize % 0x4u;
217 
218  while(blkCnt > 0u)
219  {
220  /* Perform Multiply-Accumulate */
221  in = *pScratchOut + ((q31_t) * px++ * coeff);
222  *pScratchOut++ = in;
223 
224  /* Decrement the loop counter */
225  blkCnt--;
226  }
227 
228  /* Load the coefficient value and
229  * increment the coefficient buffer for the next set of state values */
230  coeff = *pCoeffs++;
231 
232  /* Read Index, from where the state buffer should be read, is calculated. */
233  readIndex = ((int32_t) S->stateIndex -
234  (int32_t) blockSize) - *pTapDelay++;
235 
236  /* Wraparound of readIndex */
237  if(readIndex < 0)
238  {
239  readIndex += (int32_t) delaySize;
240  }
241 
242  /* Decrement the tap loop counter */
243  tapCnt--;
244  }
245 
246  /* Compute last tap without the final read of pTapDelay */
247 
248  /* Working pointer for state buffer is updated */
249  py = pState;
250 
251  /* blockSize samples are read from the state buffer */
252  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
253  (int32_t) blockSize, 1, blockSize);
254 
255  /* Working pointer for the scratch buffer of state values */
256  px = pb;
257 
258  /* Working pointer for scratch buffer of output values */
259  pScratchOut = pScr2;
260 
261  /* Loop over the blockSize. Unroll by a factor of 4.
262  * Compute 4 MACS at a time. */
263  blkCnt = blockSize >> 2;
264 
265  while(blkCnt > 0u)
266  {
267  /* Perform Multiply-Accumulate */
268  in = *pScratchOut + ((q31_t) * px++ * coeff);
269  *pScratchOut++ = in;
270  in = *pScratchOut + ((q31_t) * px++ * coeff);
271  *pScratchOut++ = in;
272  in = *pScratchOut + ((q31_t) * px++ * coeff);
273  *pScratchOut++ = in;
274  in = *pScratchOut + ((q31_t) * px++ * coeff);
275  *pScratchOut++ = in;
276 
277  /* Decrement the loop counter */
278  blkCnt--;
279  }
280 
281  /* If the blockSize is not a multiple of 4,
282  * compute the remaining samples */
283  blkCnt = blockSize % 0x4u;
284 
285  while(blkCnt > 0u)
286  {
287  /* Perform Multiply-Accumulate */
288  in = *pScratchOut + ((q31_t) * px++ * coeff);
289  *pScratchOut++ = in;
290 
291  /* Decrement the loop counter */
292  blkCnt--;
293  }
294 
295  /* All the output values are in pScratchOut buffer.
296  Convert them into 1.7 format, saturate and store in the destination buffer. */
297  /* Loop over the blockSize. */
298  blkCnt = blockSize >> 2;
299 
300  while(blkCnt > 0u)
301  {
302  in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
303  in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
304  in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
305  in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
306 
307  *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
308 
309  /* Decrement the blockSize loop counter */
310  blkCnt--;
311  }
312 
313  /* If the blockSize is not a multiple of 4,
314  remaining samples are processed in the below loop */
315  blkCnt = blockSize % 0x4u;
316 
317  while(blkCnt > 0u)
318  {
319  *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
320 
321  /* Decrement the blockSize loop counter */
322  blkCnt--;
323  }
324 
325 #else
326 
327  /* Run the below code for Cortex-M0 */
328 
329  /* BlockSize of Input samples are copied into the state buffer */
330  /* StateIndex points to the starting position to write in the state buffer */
331  arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
332  blockSize);
333 
334  /* Loop over the number of taps. */
335  tapCnt = numTaps;
336 
337  /* Read Index, from where the state buffer should be read, is calculated. */
338  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
339 
340  /* Wraparound of readIndex */
341  if(readIndex < 0)
342  {
343  readIndex += (int32_t) delaySize;
344  }
345 
346  /* Working pointer for state buffer is updated */
347  py = pState;
348 
349  /* blockSize samples are read from the state buffer */
350  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
351  (int32_t) blockSize, 1, blockSize);
352 
353  /* Working pointer for the scratch buffer of state values */
354  px = pb;
355 
356  /* Working pointer for scratch buffer of output values */
357  pScratchOut = pScr2;
358 
359  /* Loop over the blockSize */
360  blkCnt = blockSize;
361 
362  while(blkCnt > 0u)
363  {
364  /* Perform multiplication and store in the scratch buffer */
365  *pScratchOut++ = ((q31_t) * px++ * coeff);
366 
367  /* Decrement the loop counter */
368  blkCnt--;
369  }
370 
371  /* Load the coefficient value and
372  * increment the coefficient buffer for the next set of state values */
373  coeff = *pCoeffs++;
374 
375  /* Read Index, from where the state buffer should be read, is calculated. */
376  readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
377 
378  /* Wraparound of readIndex */
379  if(readIndex < 0)
380  {
381  readIndex += (int32_t) delaySize;
382  }
383 
384  /* Loop over the number of taps. */
385  tapCnt = (uint32_t) numTaps - 2u;
386 
387  while(tapCnt > 0u)
388  {
389  /* Working pointer for state buffer is updated */
390  py = pState;
391 
392  /* blockSize samples are read from the state buffer */
393  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
394  (int32_t) blockSize, 1, blockSize);
395 
396  /* Working pointer for the scratch buffer of state values */
397  px = pb;
398 
399  /* Working pointer for scratch buffer of output values */
400  pScratchOut = pScr2;
401 
402  /* Loop over the blockSize */
403  blkCnt = blockSize;
404 
405  while(blkCnt > 0u)
406  {
407  /* Perform Multiply-Accumulate */
408  in = *pScratchOut + ((q31_t) * px++ * coeff);
409  *pScratchOut++ = in;
410 
411  /* Decrement the loop counter */
412  blkCnt--;
413  }
414 
415  /* Load the coefficient value and
416  * increment the coefficient buffer for the next set of state values */
417  coeff = *pCoeffs++;
418 
419  /* Read Index, from where the state buffer should be read, is calculated. */
420  readIndex =
421  ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
422 
423  /* Wraparound of readIndex */
424  if(readIndex < 0)
425  {
426  readIndex += (int32_t) delaySize;
427  }
428 
429  /* Decrement the tap loop counter */
430  tapCnt--;
431  }
432 
433  /* Compute last tap without the final read of pTapDelay */
434 
435  /* Working pointer for state buffer is updated */
436  py = pState;
437 
438  /* blockSize samples are read from the state buffer */
439  arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
440  (int32_t) blockSize, 1, blockSize);
441 
442  /* Working pointer for the scratch buffer of state values */
443  px = pb;
444 
445  /* Working pointer for scratch buffer of output values */
446  pScratchOut = pScr2;
447 
448  /* Loop over the blockSize */
449  blkCnt = blockSize;
450 
451  while(blkCnt > 0u)
452  {
453  /* Perform Multiply-Accumulate */
454  in = *pScratchOut + ((q31_t) * px++ * coeff);
455  *pScratchOut++ = in;
456 
457  /* Decrement the loop counter */
458  blkCnt--;
459  }
460 
461  /* All the output values are in pScratchOut buffer.
462  Convert them into 1.7 format, saturate and store in the destination buffer. */
463  /* Loop over the blockSize. */
464  blkCnt = blockSize;
465 
466  while(blkCnt > 0u)
467  {
468  *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
469 
470  /* Decrement the blockSize loop counter */
471  blkCnt--;
472  }
473 
474 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
475 
476 }
477 
int8_t q7_t
8-bit fractional data type in 1.7 format.
Definition: arm_math.h:387
#define __PACKq7(v0, v1, v2, v3)
definition to pack four 8 bit values.
Definition: arm_math.h:467
Instance structure for the Q7 sparse FIR filter.
Definition: arm_math.h:4469
void arm_fir_sparse_q7(arm_fir_sparse_instance_q7 *S, q7_t *pSrc, q7_t *pDst, q7_t *pScratchIn, q31_t *pScratchOut, uint32_t blockSize)
Processing function for the Q7 sparse FIR filter.
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397