STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_fast_opt_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_fast_opt_q15.c
9 *
10 * Description: Fast Q15 Convolution.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
84  q15_t * pSrcA,
85  uint32_t srcALen,
86  q15_t * pSrcB,
87  uint32_t srcBLen,
88  q15_t * pDst,
89  q15_t * pScratch1,
90  q15_t * pScratch2)
91 {
92  q31_t acc0, acc1, acc2, acc3; /* Accumulators */
93  q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
94  q31_t y1, y2; /* State variables */
95  q15_t *pOut = pDst; /* output pointer */
96  q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
97  q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch2 */
98  q15_t *pIn1; /* inputA pointer */
99  q15_t *pIn2; /* inputB pointer */
100  q15_t *px; /* Intermediate inputA pointer */
101  q15_t *py; /* Intermediate inputB pointer */
102  uint32_t j, k, blkCnt; /* loop counter */
103  uint32_t tapCnt; /* loop count */
104 #ifdef UNALIGNED_SUPPORT_DISABLE
105 
106  q15_t a, b;
107 
108 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
109 
110  /* The algorithm implementation is based on the lengths of the inputs. */
111  /* srcB is always made to slide across srcA. */
112  /* So srcBLen is always considered as shorter or equal to srcALen */
113  if(srcALen >= srcBLen)
114  {
115  /* Initialization of inputA pointer */
116  pIn1 = pSrcA;
117 
118  /* Initialization of inputB pointer */
119  pIn2 = pSrcB;
120  }
121  else
122  {
123  /* Initialization of inputA pointer */
124  pIn1 = pSrcB;
125 
126  /* Initialization of inputB pointer */
127  pIn2 = pSrcA;
128 
129  /* srcBLen is always considered as shorter or equal to srcALen */
130  j = srcBLen;
131  srcBLen = srcALen;
132  srcALen = j;
133  }
134 
135  /* Point at element (srcBLen - 1) of scratch2 so the shorter sequence can be written back-to-front */
136  pScr2 = pScratch2 + srcBLen - 1;
137 
138  /* points to smaller length sequence */
139  px = pIn2;
140 
141  /* Apply loop unrolling and do 4 Copies simultaneously. */
142  k = srcBLen >> 2u;
143 
144  /* First part of the processing with loop unrolling copies 4 data points at a time.
145  ** a second loop below copies for the remaining 1 to 3 samples. */
146 
147  /* Copy smaller length input sequence in reverse order into second scratch buffer */
148  while(k > 0u)
149  {
150  /* copy second buffer in reversal manner */
151  *pScr2-- = *px++;
152  *pScr2-- = *px++;
153  *pScr2-- = *px++;
154  *pScr2-- = *px++;
155 
156  /* Decrement the loop counter */
157  k--;
158  }
159 
160  /* If the count is not a multiple of 4, copy remaining samples here.
161  ** No loop unrolling is used. */
162  k = srcBLen % 0x4u;
163 
164  while(k > 0u)
165  {
166  /* copy second buffer in reversal manner for remaining samples */
167  *pScr2-- = *px++;
168 
169  /* Decrement the loop counter */
170  k--;
171  }
172 
173  /* Initialize temporary scratch pointer */
174  pScr1 = pScratch1;
175 
176  /* Assuming scratch1 buffer is aligned by 32-bit */
177  /* Fill (srcBLen - 1u) zeros in scratch1 buffer */
178  arm_fill_q15(0, pScr1, (srcBLen - 1u));
179 
180  /* Update temporary scratch pointer */
181  pScr1 += (srcBLen - 1u);
182 
183  /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
184 
185 #ifndef UNALIGNED_SUPPORT_DISABLE
186 
187  /* Copy (srcALen) samples in scratch buffer */
188  arm_copy_q15(pIn1, pScr1, srcALen);
189 
190  /* Update pointers */
191  pScr1 += srcALen;
192 
193 #else
194 
195  /* Apply loop unrolling and do 4 Copies simultaneously. */
196  k = srcALen >> 2u;
197 
198  /* First part of the processing with loop unrolling copies 4 data points at a time.
199  ** a second loop below copies for the remaining 1 to 3 samples. */
200  while(k > 0u)
201  {
202  /* copy four samples of the longer sequence, in order, into scratch1 */
203  *pScr1++ = *pIn1++;
204  *pScr1++ = *pIn1++;
205  *pScr1++ = *pIn1++;
206  *pScr1++ = *pIn1++;
207 
208  /* Decrement the loop counter */
209  k--;
210  }
211 
212  /* If the count is not a multiple of 4, copy remaining samples here.
213  ** No loop unrolling is used. */
214  k = srcALen % 0x4u;
215 
216  while(k > 0u)
217  {
218  /* copy the remaining samples of the longer sequence, in order, into scratch1 */
219  *pScr1++ = *pIn1++;
220 
221  /* Decrement the loop counter */
222  k--;
223  }
224 
225 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
226 
227 
228 #ifndef UNALIGNED_SUPPORT_DISABLE
229 
230  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
231  arm_fill_q15(0, pScr1, (srcBLen - 1u));
232 
233  /* Update pointer */
234  pScr1 += (srcBLen - 1u);
235 
236 #else
237 
238  /* Apply loop unrolling and do 4 Copies simultaneously. */
239  k = (srcBLen - 1u) >> 2u;
240 
241  /* First part of the processing with loop unrolling copies 4 data points at a time.
242  ** a second loop below copies for the remaining 1 to 3 samples. */
243  while(k > 0u)
244  {
245  /* append four zeros to scratch1 */
246  *pScr1++ = 0;
247  *pScr1++ = 0;
248  *pScr1++ = 0;
249  *pScr1++ = 0;
250 
251  /* Decrement the loop counter */
252  k--;
253  }
254 
255  /* If the count is not a multiple of 4, copy remaining samples here.
256  ** No loop unrolling is used. */
257  k = (srcBLen - 1u) % 0x4u;
258 
259  while(k > 0u)
260  {
261  /* append the remaining zeros to scratch1 */
262  *pScr1++ = 0;
263 
264  /* Decrement the loop counter */
265  k--;
266  }
267 
268 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
269 
270  /* Temporary pointer for scratch2 */
271  py = pScratch2;
272 
273 
274  /* Initialization of pIn2 pointer */
275  pIn2 = py;
276 
277  /* First part of the processing with loop unrolling process 4 data points at a time.
278  ** a second loop below process for the remaining 1 to 3 samples. */
279 
280  /* Actual convolution process starts here */
281  blkCnt = (srcALen + srcBLen - 1u) >> 2;
282 
283  while(blkCnt > 0)
284  {
285  /* Initialize temporary scratch pointer as scratch1 */
286  pScr1 = pScratch1;
287 
288  /* Clear accumulators */
289  acc0 = 0;
290  acc1 = 0;
291  acc2 = 0;
292  acc3 = 0;
293 
294  /* Read two samples from scratch1 buffer */
295  x1 = *__SIMD32(pScr1)++;
296 
297  /* Read next two samples from scratch1 buffer */
298  x2 = *__SIMD32(pScr1)++;
299 
300  tapCnt = (srcBLen) >> 2u;
301 
302  while(tapCnt > 0u)
303  {
304 
305 #ifndef UNALIGNED_SUPPORT_DISABLE
306 
307  /* Read four samples from smaller buffer */
308  y1 = _SIMD32_OFFSET(pIn2);
309  y2 = _SIMD32_OFFSET(pIn2 + 2u);
310 
311  /* multiply and accumulate */
312  acc0 = __SMLAD(x1, y1, acc0);
313  acc2 = __SMLAD(x2, y1, acc2);
314 
315  /* pack input data */
316 #ifndef ARM_MATH_BIG_ENDIAN
317  x3 = __PKHBT(x2, x1, 0);
318 #else
319  x3 = __PKHBT(x1, x2, 0);
320 #endif
321 
322  /* multiply and accumulate */
323  acc1 = __SMLADX(x3, y1, acc1);
324 
325  /* Read next two samples from scratch1 buffer */
326  x1 = _SIMD32_OFFSET(pScr1);
327 
328  /* multiply and accumulate */
329  acc0 = __SMLAD(x2, y2, acc0);
330  acc2 = __SMLAD(x1, y2, acc2);
331 
332  /* pack input data */
333 #ifndef ARM_MATH_BIG_ENDIAN
334  x3 = __PKHBT(x1, x2, 0);
335 #else
336  x3 = __PKHBT(x2, x1, 0);
337 #endif
338 
339  acc3 = __SMLADX(x3, y1, acc3);
340  acc1 = __SMLADX(x3, y2, acc1);
341 
342  x2 = _SIMD32_OFFSET(pScr1 + 2u);
343 
344 #ifndef ARM_MATH_BIG_ENDIAN
345  x3 = __PKHBT(x2, x1, 0);
346 #else
347  x3 = __PKHBT(x1, x2, 0);
348 #endif
349 
350  acc3 = __SMLADX(x3, y2, acc3);
351 
352 #else
353 
354  /* Read four samples from smaller buffer */
355  a = *pIn2;
356  b = *(pIn2 + 1);
357 
358 #ifndef ARM_MATH_BIG_ENDIAN
359  y1 = __PKHBT(a, b, 16);
360 #else
361  y1 = __PKHBT(b, a, 16);
362 #endif
363 
364  a = *(pIn2 + 2);
365  b = *(pIn2 + 3);
366 #ifndef ARM_MATH_BIG_ENDIAN
367  y2 = __PKHBT(a, b, 16);
368 #else
369  y2 = __PKHBT(b, a, 16);
370 #endif
371 
372  acc0 = __SMLAD(x1, y1, acc0);
373 
374  acc2 = __SMLAD(x2, y1, acc2);
375 
376 #ifndef ARM_MATH_BIG_ENDIAN
377  x3 = __PKHBT(x2, x1, 0);
378 #else
379  x3 = __PKHBT(x1, x2, 0);
380 #endif
381 
382  acc1 = __SMLADX(x3, y1, acc1);
383 
384  a = *pScr1;
385  b = *(pScr1 + 1);
386 
387 #ifndef ARM_MATH_BIG_ENDIAN
388  x1 = __PKHBT(a, b, 16);
389 #else
390  x1 = __PKHBT(b, a, 16);
391 #endif
392 
393  acc0 = __SMLAD(x2, y2, acc0);
394 
395  acc2 = __SMLAD(x1, y2, acc2);
396 
397 #ifndef ARM_MATH_BIG_ENDIAN
398  x3 = __PKHBT(x1, x2, 0);
399 #else
400  x3 = __PKHBT(x2, x1, 0);
401 #endif
402 
403  acc3 = __SMLADX(x3, y1, acc3);
404 
405  acc1 = __SMLADX(x3, y2, acc1);
406 
407  a = *(pScr1 + 2);
408  b = *(pScr1 + 3);
409 
410 #ifndef ARM_MATH_BIG_ENDIAN
411  x2 = __PKHBT(a, b, 16);
412 #else
413  x2 = __PKHBT(b, a, 16);
414 #endif
415 
416 #ifndef ARM_MATH_BIG_ENDIAN
417  x3 = __PKHBT(x2, x1, 0);
418 #else
419  x3 = __PKHBT(x1, x2, 0);
420 #endif
421 
422  acc3 = __SMLADX(x3, y2, acc3);
423 
424 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
425 
426  /* update scratch pointers */
427  pIn2 += 4u;
428  pScr1 += 4u;
429 
430 
431  /* Decrement the loop counter */
432  tapCnt--;
433  }
434 
435  /* Update scratch pointer for remaining samples of smaller length sequence */
436  pScr1 -= 4u;
437 
438  /* process the remaining (srcBLen & 3) taps of the shorter sequence one at a time */
439  tapCnt = (srcBLen) & 3u;
440 
441  while(tapCnt > 0u)
442  {
443 
444  /* accumulate the results */
445  acc0 += (*pScr1++ * *pIn2);
446  acc1 += (*pScr1++ * *pIn2);
447  acc2 += (*pScr1++ * *pIn2);
448  acc3 += (*pScr1++ * *pIn2++);
449 
450  pScr1 -= 3u;
451 
452  /* Decrement the loop counter */
453  tapCnt--;
454  }
455 
456  blkCnt--;
457 
458 
459  /* Store the results in the accumulators in the destination buffer. */
460 
461 #ifndef ARM_MATH_BIG_ENDIAN
462 
463  *__SIMD32(pOut)++ =
464  __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
465 
466  *__SIMD32(pOut)++ =
467  __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
468 
469 
470 #else
471 
472  *__SIMD32(pOut)++ =
473  __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
474 
475  *__SIMD32(pOut)++ =
476  __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
477 
478 
479 
480 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
481 
482  /* Initialization of inputB pointer */
483  pIn2 = py;
484 
485  pScratch1 += 4u;
486 
487  }
488 
489 
490  blkCnt = (srcALen + srcBLen - 1u) & 0x3;
491 
492  /* Calculate convolution for remaining samples of Bigger length sequence */
493  while(blkCnt > 0)
494  {
495  /* Initialize temporary scratch pointer as scratch1 */
496  pScr1 = pScratch1;
497 
498  /* Clear accumulator */
499  acc0 = 0;
500 
501  tapCnt = (srcBLen) >> 1u;
502 
503  while(tapCnt > 0u)
504  {
505 
506  acc0 += (*pScr1++ * *pIn2++);
507  acc0 += (*pScr1++ * *pIn2++);
508 
509  /* Decrement the loop counter */
510  tapCnt--;
511  }
512 
513  tapCnt = (srcBLen) & 1u;
514 
515  /* process the one leftover tap when srcBLen is odd */
516  while(tapCnt > 0u)
517  {
518 
519  /* accumulate the results */
520  acc0 += (*pScr1++ * *pIn2++);
521 
522  /* Decrement the loop counter */
523  tapCnt--;
524  }
525 
526  blkCnt--;
527 
528  /* The result is in 2.30 format. Convert to 1.15 with saturation.
529  ** Then store the output in the destination buffer. */
530  *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
531 
532  /* Initialization of inputB pointer */
533  pIn2 = py;
534 
535  pScratch1 += 1u;
536 
537  }
538 
539 }
540 
void arm_copy_q15(q15_t *pSrc, q15_t *pDst, uint32_t blockSize)
Copies the elements of a Q15 vector.
Definition: arm_copy_q15.c:60
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
void arm_conv_fast_opt_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_fill_q15(q15_t value, q15_t *pDst, uint32_t blockSize)
Fills a constant value into a Q15 vector.
Definition: arm_fill_q15.c:61