STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_fir_decimate_fast_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_fir_decimate_fast_q15.c
9 *
10 * Description: Fast Q15 FIR Decimator.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
78 #ifndef UNALIGNED_SUPPORT_DISABLE
79 
/*
 * Body of the fast Q15 FIR decimator, variant built when UNALIGNED_SUPPORT_DISABLE
 * is NOT defined: taps are processed two at a time through packed 32-bit
 * accesses (__SIMD32) and the __SMLAD intrinsic.
 *
 * NOTE(review): listing lines 80-81 (return type, function name and first
 * parameter S) are missing from this extract. The reference index at the end
 * of this page gives the signature as
 *   void arm_fir_decimate_fast_q15(const arm_fir_decimate_instance_q15 *S,
 *                                  q15_t *pSrc, q15_t *pDst, uint32_t blockSize)
 *
 * S         - instance; this body reads S->pState, S->pCoeffs, S->numTaps, S->M.
 * pSrc      - input samples; (blockSize / S->M) * S->M of them are consumed.
 * pDst      - output samples; blockSize / S->M values are written.
 * blockSize - number of input samples offered for this call.
 *
 * Outputs are produced in pairs (acc0/acc1); if the output count is odd, one
 * final output is produced by a single-accumulator loop. On exit the last
 * numTaps - 1 state samples are moved to the start of S->pState.
 *
 * NOTE(review): S->M is used as a divisor and as a do/while trip count without
 * a zero check in this body -- presumably guaranteed by the caller; confirm.
 * NOTE(review): accumulation is 32-bit and no saturation is visible in this
 * body before the final __SSAT; confirm the caller's scaling rules out overflow.
 */
82  q15_t * pSrc,
83  q15_t * pDst,
84  uint32_t blockSize)
85 {
86  q15_t *pState = S->pState; /* Read position in the state buffer; advanced after every output */
87  q15_t *pCoeffs = S->pCoeffs; /* Start of the coefficient array */
88  q15_t *pStateCurnt; /* Write position in the state buffer for incoming samples */
89  q15_t *px; /* State read pointer used by the single-output loop */
90  q15_t *pb; /* Coefficient read pointer */
91  q31_t x0, x1, c0, c1; /* Temporaries holding (packed or single) state and coefficient values */
92  q31_t sum0; /* Accumulator for the single-output loop */
93  q31_t acc0, acc1;
94  q15_t *px0, *px1;
95  uint32_t blkCntN3;
96  uint32_t numTaps = S->numTaps; /* Number of taps */
97  uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters; outBlockSize = number of output samples */
98 
99 
100  /* S->pState buffer contains previous frame (numTaps - 1) samples */
101  /* pStateCurnt points to the location where the new input data should be written */
102  pStateCurnt = S->pState + (numTaps - 1u);
103 
104 
105  /* blkCnt = number of output pairs; blkCntN3 = leftover single outputs (0 or 1) */
106  blkCnt = outBlockSize / 2;
107  blkCntN3 = outBlockSize - (2 * blkCnt);
108 
109 
110  while(blkCnt > 0u)
111  {
112  /* Copy 2 * M new input samples into the state buffer (enough for two outputs) */
113  i = 2 * S->M;
114 
115  do
116  {
117  *pStateCurnt++ = *pSrc++;
118 
119  } while(--i);
120 
121  /* Clear both accumulators */
122  acc0 = 0;
123  acc1 = 0;
124 
125  /* px0 reads the state window of the first output, px1 the window M samples later */
126  px0 = pState;
127 
128  px1 = pState + S->M;
129 
130 
131  /* Initialize coeff pointer */
132  pb = pCoeffs;
133 
134  /* Loop unrolling. Process 4 taps at a time. */
135  tapCnt = numTaps >> 2;
136 
137  /* Loop over the taps in groups of 4 (numTaps >> 2 iterations).
138  ** The remaining numTaps % 4 taps are handled by the loop that follows. */
139  while(tapCnt > 0u)
140  {
141  /* Load the next two coefficients as one packed 32-bit word */
142  c0 = *__SIMD32(pb)++;
143 
144  /* Load the next two state samples for each of the two outputs */
145  x0 = *__SIMD32(px0)++;
146 
147  x1 = *__SIMD32(px1)++;
148 
149  /* Perform the multiply-accumulate */
150  acc0 = __SMLAD(x0, c0, acc0);
151 
152  acc1 = __SMLAD(x1, c0, acc1);
153 
154  /* Load the following two coefficients as one packed 32-bit word */
155  c0 = *__SIMD32(pb)++;
156 
157  /* Load the following two state samples for each of the two outputs */
158  x0 = *__SIMD32(px0)++;
159 
160  x1 = *__SIMD32(px1)++;
161 
162  /* Perform the multiply-accumulate */
163  acc0 = __SMLAD(x0, c0, acc0);
164 
165  acc1 = __SMLAD(x1, c0, acc1);
166 
167  /* Decrement the loop counter */
168  tapCnt--;
169  }
170 
   /* NOTE(review): in the loop below x0, x1 and c0 each receive a single q15
    * value converted to q31_t (not a packed pair), yet __SMLAD is still used.
    * If __SMLAD also multiplies the upper halfwords, two negative operands
    * would presumably add an extra +1 -- confirm against the intrinsic's
    * definition. */
171  /* If the filter length is not a multiple of 4, compute the remaining filter taps */
172  tapCnt = numTaps % 0x4u;
173 
174  while(tapCnt > 0u)
175  {
176  /* Read one coefficient */
177  c0 = *pb++;
178 
179  /* Fetch one state sample for each of the two outputs */
180  x0 = *px0++;
181 
182  x1 = *px1++;
183 
184  /* Perform the multiply-accumulate */
185  acc0 = __SMLAD(x0, c0, acc0);
186  acc1 = __SMLAD(x1, c0, acc1);
187 
188  /* Decrement the loop counter */
189  tapCnt--;
190  }
191 
192  /* Advance the state pointer by twice the decimation factor,
193  * because two outputs were produced in this iteration */
194  pState = pState + S->M * 2;
195 
196  /* The accumulators hold sums of q15 x q15 products (2.30 format), */
197  /* so shift right by 15 and saturate to 16 bits to store a 1.15 result */
198  *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
199  *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
200 
201  /* Decrement the loop counter */
202  blkCnt--;
203  }
204 
205 
206 
207  while(blkCntN3 > 0u)
208  {
209  /* Copy M new input samples into the state buffer (enough for one output) */
210  i = S->M;
211 
212  do
213  {
214  *pStateCurnt++ = *pSrc++;
215 
216  } while(--i);
217 
218  /* Clear the accumulator */
219  sum0 = 0;
220 
221  /* Initialize state pointer */
222  px = pState;
223 
224  /* Initialize coeff pointer */
225  pb = pCoeffs;
226 
227  /* Loop unrolling. Process 4 taps at a time. */
228  tapCnt = numTaps >> 2;
229 
230  /* Loop over the taps in groups of 4 (numTaps >> 2 iterations).
231  ** The remaining numTaps % 4 taps are handled by the loop that follows. */
232  while(tapCnt > 0u)
233  {
234  /* Load the next two coefficients as one packed 32-bit word */
235  c0 = *__SIMD32(pb)++;
236 
237  /* Load the next two state samples as one packed 32-bit word */
238  x0 = *__SIMD32(px)++;
239 
240  /* Load the following two coefficients as one packed 32-bit word */
241  c1 = *__SIMD32(pb)++;
242 
243  /* Perform the multiply-accumulate */
244  sum0 = __SMLAD(x0, c0, sum0);
245 
246  /* Load the following two state samples as one packed 32-bit word */
247  x0 = *__SIMD32(px)++;
248 
249  /* Perform the multiply-accumulate */
250  sum0 = __SMLAD(x0, c1, sum0);
251 
252  /* Decrement the loop counter */
253  tapCnt--;
254  }
255 
256  /* Remaining numTaps % 4 taps, one at a time (same __SMLAD-on-scalars caveat as the paired loop) */
257  tapCnt = numTaps % 0x4u;
258 
259  while(tapCnt > 0u)
260  {
261  /* Read one coefficient */
262  c0 = *pb++;
263 
264  /* Fetch 1 state variable */
265  x0 = *px++;
266 
267  /* Perform the multiply-accumulate */
268  sum0 = __SMLAD(x0, c0, sum0);
269 
270  /* Decrement the loop counter */
271  tapCnt--;
272  }
273 
274  /* Advance the state pointer by the decimation factor
275  * so that it points at the state window of the next output */
276  pState = pState + S->M;
277 
278  /* The accumulator holds a sum of q15 x q15 products (2.30 format), */
279  /* so shift right by 15 and saturate to 16 bits to store a 1.15 result */
280  *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
281 
282  /* Decrement the loop counter */
283  blkCntN3--;
284  }
285 
286  /* Processing is complete.
287  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
288  ** This prepares the state buffer for the next function call. */
289 
290  /* Points to the start of the state buffer */
291  pStateCurnt = S->pState;
292 
293  i = (numTaps - 1u) >> 2u;
294 
295  /* Copy four q15 samples per iteration as two packed 32-bit words */
296  while(i > 0u)
297  {
298  *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
299  *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
300 
301  /* Decrement the loop counter */
302  i--;
303  }
304 
305  i = (numTaps - 1u) % 0x04u;
306 
307  /* Copy the remaining (numTaps - 1) % 4 samples one at a time */
308  while(i > 0u)
309  {
310  *pStateCurnt++ = *pState++;
311 
312  /* Decrement the loop counter */
313  i--;
314  }
315 }
316 
317 #else
318 
319 
/*
 * Body of the fast Q15 FIR decimator, variant built when UNALIGNED_SUPPORT_DISABLE
 * IS defined: every state sample and coefficient is read with an individual
 * 16-bit access and multiplied with plain C arithmetic (no __SIMD32 / __SMLAD).
 *
 * NOTE(review): listing lines 320-321 (return type, function name and first
 * parameter S) are missing from this extract. The reference index at the end
 * of this page gives the signature as
 *   void arm_fir_decimate_fast_q15(const arm_fir_decimate_instance_q15 *S,
 *                                  q15_t *pSrc, q15_t *pDst, uint32_t blockSize)
 *
 * S         - instance; this body reads S->pState, S->pCoeffs, S->numTaps, S->M.
 * pSrc      - input samples; (blockSize / S->M) * S->M of them are consumed.
 * pDst      - output samples; blockSize / S->M values are written.
 * blockSize - number of input samples offered for this call.
 *
 * Outputs are produced in pairs (acc0/acc1); if the output count is odd, one
 * final output is produced by a single-accumulator loop. On exit the last
 * numTaps - 1 state samples are moved to the start of S->pState.
 *
 * NOTE(review): S->M is used as a divisor and as a do/while trip count without
 * a zero check in this body -- presumably guaranteed by the caller; confirm.
 * NOTE(review): products are summed into 32-bit q31_t accumulators with no
 * intermediate saturation; confirm the caller's scaling rules out overflow.
 */
322  q15_t * pSrc,
323  q15_t * pDst,
324  uint32_t blockSize)
325 {
326  q15_t *pState = S->pState; /* Read position in the state buffer; advanced after every output */
327  q15_t *pCoeffs = S->pCoeffs; /* Start of the coefficient array */
328  q15_t *pStateCurnt; /* Write position in the state buffer for incoming samples */
329  q15_t *px; /* State read pointer used by the single-output loop */
330  q15_t *pb; /* Coefficient read pointer */
331  q15_t x0, x1, c0; /* One state sample per output and one coefficient */
332  q31_t sum0; /* Accumulator for the single-output loop */
333  q31_t acc0, acc1;
334  q15_t *px0, *px1;
335  uint32_t blkCntN3;
336  uint32_t numTaps = S->numTaps; /* Number of taps */
337  uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters; outBlockSize = number of output samples */
338 
339 
340  /* S->pState buffer contains previous frame (numTaps - 1) samples */
341  /* pStateCurnt points to the location where the new input data should be written */
342  pStateCurnt = S->pState + (numTaps - 1u);
343 
344 
345  /* blkCnt = number of output pairs; blkCntN3 = leftover single outputs (0 or 1) */
346  blkCnt = outBlockSize / 2;
347  blkCntN3 = outBlockSize - (2 * blkCnt);
348 
349  while(blkCnt > 0u)
350  {
351  /* Copy 2 * M new input samples into the state buffer (enough for two outputs) */
352  i = 2 * S->M;
353 
354  do
355  {
356  *pStateCurnt++ = *pSrc++;
357 
358  } while(--i);
359 
360  /* Clear both accumulators */
361  acc0 = 0;
362  acc1 = 0;
363 
364  /* px0 reads the state window of the first output, px1 the window M samples later */
365  px0 = pState;
366 
367  px1 = pState + S->M;
368 
369 
370  /* Initialize coeff pointer */
371  pb = pCoeffs;
372 
373  /* Loop unrolling. Process 4 taps at a time. */
374  tapCnt = numTaps >> 2;
375 
376  /* Loop over the taps in groups of 4 (numTaps >> 2 iterations).
377  ** The remaining numTaps % 4 taps are handled by the loop that follows. */
378  while(tapCnt > 0u)
379  {
380  /* Read the first coefficient of this group */
381  c0 = *pb++;
382 
383  /* Read the matching state sample for output 0 and for output 1 */
384  x0 = *px0++;
385  x1 = *px1++;
386 
387  /* Perform the multiply-accumulate */
388  acc0 += x0 * c0;
389  acc1 += x1 * c0;
390 
391  /* Read the second coefficient of this group */
392  c0 = *pb++;
393 
394  /* Read the matching state sample for output 0 and for output 1 */
395  x0 = *px0++;
396  x1 = *px1++;
397 
398  /* Perform the multiply-accumulate */
399  acc0 += x0 * c0;
400  acc1 += x1 * c0;
401 
402  /* Read the third coefficient of this group */
403  c0 = *pb++;
404 
405  /* Read the matching state sample for output 0 and for output 1 */
406  x0 = *px0++;
407  x1 = *px1++;
408 
409  /* Perform the multiply-accumulate */
410  acc0 += x0 * c0;
411  acc1 += x1 * c0;
412 
413  /* Read the fourth coefficient of this group */
414  c0 = *pb++;
415 
416  /* Read the matching state sample for output 0 and for output 1 */
417  x0 = *px0++;
418  x1 = *px1++;
419 
420  /* Perform the multiply-accumulate */
421  acc0 += x0 * c0;
422  acc1 += x1 * c0;
423 
424  /* Decrement the loop counter */
425  tapCnt--;
426  }
427 
428  /* If the filter length is not a multiple of 4, compute the remaining filter taps */
429  tapCnt = numTaps % 0x4u;
430 
431  while(tapCnt > 0u)
432  {
433  /* Read one coefficient */
434  c0 = *pb++;
435 
436  /* Fetch one state sample for each of the two outputs */
437  x0 = *px0++;
438  x1 = *px1++;
439 
440  /* Perform the multiply-accumulate */
441  acc0 += x0 * c0;
442  acc1 += x1 * c0;
443 
444  /* Decrement the loop counter */
445  tapCnt--;
446  }
447 
448  /* Advance the state pointer by twice the decimation factor,
449  * because two outputs were produced in this iteration */
450  pState = pState + S->M * 2;
451 
452  /* The accumulators hold sums of q15 x q15 products (2.30 format), */
453  /* so shift right by 15 and saturate to 16 bits to store a 1.15 result */
454 
455  *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
456  *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
457 
458 
459  /* Decrement the loop counter */
460  blkCnt--;
461  }
462 
463  while(blkCntN3 > 0u)
464  {
465  /* Copy M new input samples into the state buffer (enough for one output) */
466  i = S->M;
467 
468  do
469  {
470  *pStateCurnt++ = *pSrc++;
471 
472  } while(--i);
473 
474  /* Clear the accumulator */
475  sum0 = 0;
476 
477  /* Initialize state pointer */
478  px = pState;
479 
480  /* Initialize coeff pointer */
481  pb = pCoeffs;
482 
483  /* Loop unrolling. Process 4 taps at a time. */
484  tapCnt = numTaps >> 2;
485 
486  /* Loop over the taps in groups of 4 (numTaps >> 2 iterations).
487  ** The remaining numTaps % 4 taps are handled by the loop that follows. */
488  while(tapCnt > 0u)
489  {
490  /* Read the first coefficient of this group */
491  c0 = *pb++;
492 
493  /* Read the matching state sample */
494  x0 = *px++;
495 
496  /* Perform the multiply-accumulate */
497  sum0 += x0 * c0;
498 
499  /* Read the second coefficient of this group */
500  c0 = *pb++;
501 
502  /* Read the matching state sample */
503  x0 = *px++;
504 
505  /* Perform the multiply-accumulate */
506  sum0 += x0 * c0;
507 
508  /* Read the third coefficient of this group */
509  c0 = *pb++;
510 
511  /* Read the matching state sample */
512  x0 = *px++;
513 
514  /* Perform the multiply-accumulate */
515  sum0 += x0 * c0;
516 
517  /* Read the fourth coefficient of this group */
518  c0 = *pb++;
519 
520  /* Read the matching state sample */
521  x0 = *px++;
522 
523  /* Perform the multiply-accumulate */
524  sum0 += x0 * c0;
525 
526  /* Decrement the loop counter */
527  tapCnt--;
528  }
529 
530  /* If the filter length is not a multiple of 4, compute the remaining filter taps */
531  tapCnt = numTaps % 0x4u;
532 
533  while(tapCnt > 0u)
534  {
535  /* Read one coefficient */
536  c0 = *pb++;
537 
538  /* Fetch 1 state variable */
539  x0 = *px++;
540 
541  /* Perform the multiply-accumulate */
542  sum0 += x0 * c0;
543 
544  /* Decrement the loop counter */
545  tapCnt--;
546  }
547 
548  /* Advance the state pointer by the decimation factor
549  * so that it points at the state window of the next output */
550  pState = pState + S->M;
551 
552  /* The accumulator holds a sum of q15 x q15 products (2.30 format), */
553  /* so shift right by 15 and saturate to 16 bits to store a 1.15 result */
554  *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
555 
556  /* Decrement the loop counter */
557  blkCntN3--;
558  }
559 
560  /* Processing is complete.
561  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
562  ** This prepares the state buffer for the next function call. */
563 
564  /* Points to the start of the state buffer */
565  pStateCurnt = S->pState;
566 
567  i = (numTaps - 1u) >> 2u;
568 
569  /* Copy four q15 samples per iteration */
570  while(i > 0u)
571  {
572  *pStateCurnt++ = *pState++;
573  *pStateCurnt++ = *pState++;
574  *pStateCurnt++ = *pState++;
575  *pStateCurnt++ = *pState++;
576 
577  /* Decrement the loop counter */
578  i--;
579  }
580 
581  i = (numTaps - 1u) % 0x04u;
582 
583  /* Copy the remaining (numTaps - 1) % 4 samples one at a time */
584  while(i > 0u)
585  {
586  *pStateCurnt++ = *pState++;
587 
588  /* Decrement the loop counter */
589  i--;
590  }
591 }
592 
593 
594 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
595 
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
Instance structure for the Q15 FIR decimator.
Definition: arm_math.h:3269
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_fir_decimate_fast_q15(const arm_fir_decimate_instance_q15 *S, q15_t *pSrc, q15_t *pDst, uint32_t blockSize)
Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.