arm_conv_opt_q15.c
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_opt_q15.c
9 *
10 * Description: Convolution of Q15 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
85 void arm_conv_opt_q15(
86  q15_t * pSrcA,
87  uint32_t srcALen,
88  q15_t * pSrcB,
89  uint32_t srcBLen,
90  q15_t * pDst,
91  q15_t * pScratch1,
92  q15_t * pScratch2)
93 {
94  q63_t acc0, acc1, acc2, acc3; /* Accumulator */
95  q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
96  q31_t y1, y2; /* State variables */
97  q15_t *pOut = pDst; /* output pointer */
98  q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
99  q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch2 */
100  q15_t *pIn1; /* inputA pointer */
101  q15_t *pIn2; /* inputB pointer */
102  q15_t *px; /* Intermediate inputA pointer */
103  q15_t *py; /* Intermediate inputB pointer */
104  uint32_t j, k, blkCnt; /* loop counter */
105  uint32_t tapCnt; /* loop count */
106 #ifdef UNALIGNED_SUPPORT_DISABLE
107 
108  q15_t a, b;
109 
110 #endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
111 
112  /* The algorithm implementation is based on the lengths of the inputs. */
113  /* srcB is always made to slide across srcA. */
114  /* So srcBLen is always treated as the shorter (or equal) length */
115  if(srcALen >= srcBLen)
116  {
117  /* Initialization of inputA pointer */
118  pIn1 = pSrcA;
119 
120  /* Initialization of inputB pointer */
121  pIn2 = pSrcB;
122 
123  }
124  else
125  {
126  /* Initialization of inputA pointer */
127  pIn1 = pSrcB;
128 
129  /* Initialization of inputB pointer */
130  pIn2 = pSrcA;
131 
132  /* Swap the lengths so that srcBLen always holds the shorter length */
133  j = srcBLen;
134  srcBLen = srcALen;
135  srcALen = j;
136  }
137 
138  /* Pointer to the end of the scratch2 buffer */
139  pScr2 = pScratch2 + srcBLen - 1;
140 
141  /* points to smaller length sequence */
142  px = pIn2;
143 
144  /* Apply loop unrolling and copy 4 samples at a time. */
145  k = srcBLen >> 2u;
146 
147  /* First part of the processing with loop unrolling copies 4 data points at a time.
148  ** a second loop below copies for the remaining 1 to 3 samples. */
149  /* Copy smaller length input sequence in reverse order into second scratch buffer */
150  while(k > 0u)
151  {
152  /* Copy the second (shorter) sequence into scratch2 in reverse order */
153  *pScr2-- = *px++;
154  *pScr2-- = *px++;
155  *pScr2-- = *px++;
156  *pScr2-- = *px++;
157 
158  /* Decrement the loop counter */
159  k--;
160  }
161 
162  /* If the count is not a multiple of 4, copy remaining samples here.
163  ** No loop unrolling is used. */
164  k = srcBLen % 0x4u;
165 
166  while(k > 0u)
167  {
168  /* Copy the remaining samples in reverse order */
169  *pScr2-- = *px++;
170 
171  /* Decrement the loop counter */
172  k--;
173  }
174 
175  /* Initialize temporary scratch pointer */
176  pScr1 = pScratch1;
177 
178  /* Assuming the scratch1 buffer is 32-bit aligned */
179  /* Fill (srcBLen - 1u) zeros in scratch buffer */
180  arm_fill_q15(0, pScr1, (srcBLen - 1u));
181 
182  /* Update temporary scratch pointer */
183  pScr1 += (srcBLen - 1u);
184 
185  /* Copy the bigger length sequence (srcALen samples) into the scratch1 buffer, after the (srcBLen - 1) leading zeros */
186 
187 #ifndef UNALIGNED_SUPPORT_DISABLE
188 
189  /* Copy (srcALen) samples in scratch buffer */
190  arm_copy_q15(pIn1, pScr1, srcALen);
191 
192  /* Update pointers */
193  pScr1 += srcALen;
194 
195 #else
196 
197  /* Apply loop unrolling and copy 4 samples at a time. */
198  k = srcALen >> 2u;
199 
200  /* First part of the processing with loop unrolling copies 4 data points at a time.
201  ** a second loop below copies for the remaining 1 to 3 samples. */
202  while(k > 0u)
203  {
204  /* Copy the bigger length sequence into scratch1 */
205  *pScr1++ = *pIn1++;
206  *pScr1++ = *pIn1++;
207  *pScr1++ = *pIn1++;
208  *pScr1++ = *pIn1++;
209 
210  /* Decrement the loop counter */
211  k--;
212  }
213 
214  /* If the count is not a multiple of 4, copy remaining samples here.
215  ** No loop unrolling is used. */
216  k = srcALen % 0x4u;
217 
218  while(k > 0u)
219  {
220  /* Copy the remaining samples */
221  *pScr1++ = *pIn1++;
222 
223  /* Decrement the loop counter */
224  k--;
225  }
226 
227 #endif
228 
229 
230 #ifndef UNALIGNED_SUPPORT_DISABLE
231 
232  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
233  arm_fill_q15(0, pScr1, (srcBLen - 1u));
234 
235  /* Update pointer */
236  pScr1 += (srcBLen - 1u);
237 
238 #else
239 
240  /* Apply loop unrolling and fill 4 zeros at a time. */
241  k = (srcBLen - 1u) >> 2u;
242 
243  /* First part of the processing with loop unrolling copies 4 data points at a time.
244  ** a second loop below copies for the remaining 1 to 3 samples. */
245  while(k > 0u)
246  {
247  /* Fill zeros at the end of the scratch1 buffer */
248  *pScr1++ = 0;
249  *pScr1++ = 0;
250  *pScr1++ = 0;
251  *pScr1++ = 0;
252 
253  /* Decrement the loop counter */
254  k--;
255  }
256 
257  /* If the count is not a multiple of 4, copy remaining samples here.
258  ** No loop unrolling is used. */
259  k = (srcBLen - 1u) % 0x4u;
260 
261  while(k > 0u)
262  {
263  /* Fill the remaining zeros */
264  *pScr1++ = 0;
265 
266  /* Decrement the loop counter */
267  k--;
268  }
269 
270 #endif
271 
272  /* Temporary pointer for scratch2 */
273  py = pScratch2;
274 
275 
276  /* Initialization of pIn2 pointer */
277  pIn2 = py;
278 
279  /* First part of the processing with loop unrolling processes 4 output samples at a time.
280  ** A second loop below processes the remaining 1 to 3 output samples. */
281 
282  /* Actual convolution starts here: the output length is (srcALen + srcBLen - 1) and each pass of the loop below produces 4 output samples */
283  blkCnt = (srcALen + srcBLen - 1u) >> 2;
284 
285  while(blkCnt > 0)
286  {
287  /* Initialize temporary scratch pointer to scratch1 */
288  pScr1 = pScratch1;
289 
290  /* Clear accumulators */
291  acc0 = 0;
292  acc1 = 0;
293  acc2 = 0;
294  acc3 = 0;
295 
296  /* Read two samples from scratch1 buffer */
297  x1 = *__SIMD32(pScr1)++;
298 
299  /* Read next two samples from scratch1 buffer */
300  x2 = *__SIMD32(pScr1)++;
301 
302  tapCnt = (srcBLen) >> 2u;
303 
304  while(tapCnt > 0u)
305  {
306 
307 #ifndef UNALIGNED_SUPPORT_DISABLE
308 
309  /* Read four samples from smaller buffer */
310  y1 = _SIMD32_OFFSET(pIn2);
311  y2 = _SIMD32_OFFSET(pIn2 + 2u);
312 
313  /* Multiply and accumulate */
314  acc0 = __SMLALD(x1, y1, acc0);
315  acc2 = __SMLALD(x2, y1, acc2);
316 
317  /* pack input data */
318 #ifndef ARM_MATH_BIG_ENDIAN
319  x3 = __PKHBT(x2, x1, 0);
320 #else
321  x3 = __PKHBT(x1, x2, 0);
322 #endif
323 
324  /* Multiply and accumulate */
325  acc1 = __SMLALDX(x3, y1, acc1);
326 
327  /* Read next two samples from scratch1 buffer */
328  x1 = _SIMD32_OFFSET(pScr1);
329 
330  /* Multiply and accumulate */
331  acc0 = __SMLALD(x2, y2, acc0);
332  acc2 = __SMLALD(x1, y2, acc2);
333 
334  /* pack input data */
335 #ifndef ARM_MATH_BIG_ENDIAN
336  x3 = __PKHBT(x1, x2, 0);
337 #else
338  x3 = __PKHBT(x2, x1, 0);
339 #endif
340 
341  acc3 = __SMLALDX(x3, y1, acc3);
342  acc1 = __SMLALDX(x3, y2, acc1);
343 
344  x2 = _SIMD32_OFFSET(pScr1 + 2u);
345 
346 #ifndef ARM_MATH_BIG_ENDIAN
347  x3 = __PKHBT(x2, x1, 0);
348 #else
349  x3 = __PKHBT(x1, x2, 0);
350 #endif
351 
352  acc3 = __SMLALDX(x3, y2, acc3);
353 
354 #else
355 
356  /* Read four samples from smaller buffer */
357  a = *pIn2;
358  b = *(pIn2 + 1);
359 
360 #ifndef ARM_MATH_BIG_ENDIAN
361  y1 = __PKHBT(a, b, 16);
362 #else
363  y1 = __PKHBT(b, a, 16);
364 #endif
365 
366  a = *(pIn2 + 2);
367  b = *(pIn2 + 3);
368 #ifndef ARM_MATH_BIG_ENDIAN
369  y2 = __PKHBT(a, b, 16);
370 #else
371  y2 = __PKHBT(b, a, 16);
372 #endif
373 
374  acc0 = __SMLALD(x1, y1, acc0);
375 
376  acc2 = __SMLALD(x2, y1, acc2);
377 
378 #ifndef ARM_MATH_BIG_ENDIAN
379  x3 = __PKHBT(x2, x1, 0);
380 #else
381  x3 = __PKHBT(x1, x2, 0);
382 #endif
383 
384  acc1 = __SMLALDX(x3, y1, acc1);
385 
386  a = *pScr1;
387  b = *(pScr1 + 1);
388 
389 #ifndef ARM_MATH_BIG_ENDIAN
390  x1 = __PKHBT(a, b, 16);
391 #else
392  x1 = __PKHBT(b, a, 16);
393 #endif
394 
395  acc0 = __SMLALD(x2, y2, acc0);
396 
397  acc2 = __SMLALD(x1, y2, acc2);
398 
399 #ifndef ARM_MATH_BIG_ENDIAN
400  x3 = __PKHBT(x1, x2, 0);
401 #else
402  x3 = __PKHBT(x2, x1, 0);
403 #endif
404 
405  acc3 = __SMLALDX(x3, y1, acc3);
406 
407  acc1 = __SMLALDX(x3, y2, acc1);
408 
409  a = *(pScr1 + 2);
410  b = *(pScr1 + 3);
411 
412 #ifndef ARM_MATH_BIG_ENDIAN
413  x2 = __PKHBT(a, b, 16);
414 #else
415  x2 = __PKHBT(b, a, 16);
416 #endif
417 
418 #ifndef ARM_MATH_BIG_ENDIAN
419  x3 = __PKHBT(x2, x1, 0);
420 #else
421  x3 = __PKHBT(x1, x2, 0);
422 #endif
423 
424  acc3 = __SMLALDX(x3, y2, acc3);
425 
426 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
427 
428  pIn2 += 4u;
429  pScr1 += 4u;
430 
431 
432  /* Decrement the loop counter */
433  tapCnt--;
434  }
435 
436  /* Update scratch pointer for remaining samples of smaller length sequence */
437  pScr1 -= 4u;
438 
439  /* Apply the same computation for the remaining taps of the smaller length sequence */
440  tapCnt = (srcBLen) & 3u;
441 
442  while(tapCnt > 0u)
443  {
444 
445  /* Accumulate the results */
446  acc0 += (*pScr1++ * *pIn2);
447  acc1 += (*pScr1++ * *pIn2);
448  acc2 += (*pScr1++ * *pIn2);
449  acc3 += (*pScr1++ * *pIn2++);
450 
451  pScr1 -= 3u;
452 
453  /* Decrement the loop counter */
454  tapCnt--;
455  }
456 
457  blkCnt--;
458 
459 
460  /* Store the results from the accumulators into the destination buffer. */
461 
462 #ifndef ARM_MATH_BIG_ENDIAN
463 
464  *__SIMD32(pOut)++ =
465  __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
466 
467  *__SIMD32(pOut)++ =
468  __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
469 
470 #else
471 
472  *__SIMD32(pOut)++ =
473  __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
474 
475  *__SIMD32(pOut)++ =
476  __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
477 
478 
479 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
480 
481  /* Initialization of inputB pointer */
482  pIn2 = py;
483 
484  pScratch1 += 4u;
485 
486  }
487 
488 
489  blkCnt = (srcALen + srcBLen - 1u) & 0x3;
490 
491  /* Calculate the convolution for the remaining (1 to 3) output samples */
492  while(blkCnt > 0)
493  {
494  /* Initialize temporary scratch pointer to scratch1 */
495  pScr1 = pScratch1;
496 
497  /* Clear accumulator */
498  acc0 = 0;
499 
500  tapCnt = (srcBLen) >> 1u;
501 
502  while(tapCnt > 0u)
503  {
504 
505  /* Read next two samples from scratch1 buffer */
506  acc0 += (*pScr1++ * *pIn2++);
507  acc0 += (*pScr1++ * *pIn2++);
508 
509  /* Decrement the loop counter */
510  tapCnt--;
511  }
512 
513  tapCnt = (srcBLen) & 1u;
514 
515  /* Apply the same computation for the remaining tap of the smaller length sequence */
516  while(tapCnt > 0u)
517  {
518 
519  /* Accumulate the results */
520  acc0 += (*pScr1++ * *pIn2++);
521 
522  /* Decrement the loop counter */
523  tapCnt--;
524  }
525 
526  blkCnt--;
527 
528  /* The result is in 2.30 format. Convert to 1.15 with saturation.
529  ** Then store the output in the destination buffer. */
530  *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
531 
532 
533  /* Initialization of inputB pointer */
534  pIn2 = py;
535 
536  pScratch1 += 1u;
537 
538  }
539 
540 }
541 
542 
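For reference, a minimal calling sketch follows. The lengths SRCA_LEN/SRCB_LEN, the buffer names, and the wrapper function run_convolution() are illustrative only and are not part of this file. The scratch sizes follow from the buffer layout built by the function above: scratch1 holds max(srcALen, srcBLen) + 2 * min(srcALen, srcBLen) - 2 samples, scratch2 holds min(srcALen, srcBLen) samples, and the full convolution output has srcALen + srcBLen - 1 samples.

#include "arm_math.h"

#define SRCA_LEN 64u /* illustrative length of the longer input */
#define SRCB_LEN 29u /* illustrative length of the shorter input */

static q15_t sigA[SRCA_LEN]; /* input A (filled elsewhere) */
static q15_t sigB[SRCB_LEN]; /* input B (filled elsewhere) */
static q15_t result[SRCA_LEN + SRCB_LEN - 1u]; /* full convolution output */
static q15_t scratch1[SRCA_LEN + 2u * SRCB_LEN - 2u]; /* max + 2 * min - 2 samples */
static q15_t scratch2[SRCB_LEN]; /* min(srcALen, srcBLen) samples */

void run_convolution(void)
{
  arm_conv_opt_q15(sigA, SRCA_LEN, sigB, SRCB_LEN, result, scratch1, scratch2);
}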
q63_t (int64_t): 64-bit fractional data type in 1.63 format. Definition: arm_math.h:402
arm_copy_q15(q15_t *pSrc, q15_t *pDst, uint32_t blockSize): Copies the elements of a Q15 vector. Definition: arm_copy_q15.c:60
arm_conv_opt_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, q15_t *pScratch1, q15_t *pScratch2): Convolution of Q15 sequences.
q15_t (int16_t): 16-bit fractional data type in 1.15 format. Definition: arm_math.h:392
__SIMD32(addr): Definition to read/write two 16-bit values. Definition: arm_math.h:445
_SIMD32_OFFSET(addr): Definition: arm_math.h:447
q31_t (int32_t): 32-bit fractional data type in 1.31 format. Definition: arm_math.h:397
arm_fill_q15(q15_t value, q15_t *pDst, uint32_t blockSize): Fills a constant value into a Q15 vector. Definition: arm_fill_q15.c:61
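As a cross-check, the sketch below is a straightforward, non-optimized Q15 convolution using the same scaling and saturation as arm_conv_opt_q15(): each Q15 x Q15 product is a Q30 value, products are summed in a 64-bit accumulator, and the sum is shifted right by 15 and saturated to 1.15. The helper name conv_q15_reference is illustrative and not part of CMSIS-DSP; it is only meant for validating the optimized routine against the plain definition y[n] = sum over k of a[k] * b[n - k].

#include "arm_math.h"

/* Reference convolution: y[n] = sum_k a[k] * b[n - k], n = 0 .. lenA + lenB - 2 */
static void conv_q15_reference(const q15_t *a, uint32_t lenA,
                               const q15_t *b, uint32_t lenB,
                               q15_t *y)
{
  for (uint32_t n = 0u; n < (lenA + lenB - 1u); n++)
  {
    q63_t acc = 0;

    for (uint32_t k = 0u; k < lenA; k++)
    {
      /* Only accumulate where both indices are in range */
      if ((n >= k) && ((n - k) < lenB))
      {
        acc += (q31_t) a[k] * b[n - k]; /* Q15 * Q15 -> Q30 */
      }
    }

    /* Convert the accumulated Q30 sum to Q15 with saturation */
    y[n] = (q15_t) __SSAT((q31_t) (acc >> 15), 16);
  }
}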