STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_q31.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_q31.c
9 *
10 * Description: Convolution of Q31 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
79  q31_t * pSrcA,
80  uint32_t srcALen,
81  q31_t * pSrcB,
82  uint32_t srcBLen,
83  q31_t * pDst)
84 {
85 
86 
87 #ifndef ARM_MATH_CM0_FAMILY
88 
89  /* Run the below code for Cortex-M4 and Cortex-M3 */
90 
91  q31_t *pIn1; /* inputA pointer */
92  q31_t *pIn2; /* inputB pointer */
93  q31_t *pOut = pDst; /* output pointer */
94  q31_t *px; /* Intermediate inputA pointer */
95  q31_t *py; /* Intermediate inputB pointer */
96  q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
97  q63_t sum; /* Accumulator */
98  q63_t acc0, acc1, acc2; /* Accumulator */
99  q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
100  uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
101 
102  /* The algorithm implementation is based on the lengths of the inputs. */
103  /* srcB is always made to slide across srcA. */
104  /* So srcBLen is always considered as shorter or equal to srcALen */
105  if(srcALen >= srcBLen)
106  {
107  /* Initialization of inputA pointer */
108  pIn1 = pSrcA;
109 
110  /* Initialization of inputB pointer */
111  pIn2 = pSrcB;
112  }
113  else
114  {
115  /* Initialization of inputA pointer */
116  pIn1 = (q31_t *) pSrcB;
117 
118  /* Initialization of inputB pointer */
119  pIn2 = (q31_t *) pSrcA;
120 
121  /* srcBLen is always considered as shorter or equal to srcALen */
122  j = srcBLen;
123  srcBLen = srcALen;
124  srcALen = j;
125  }
126 
127  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
128  /* The function is internally
129  * divided into three stages according to the number of multiplications that has to be
130  * taken place between inputA samples and inputB samples. In the first stage of the
131  * algorithm, the multiplications increase by one for every iteration.
132  * In the second stage of the algorithm, srcBLen number of multiplications are done.
133  * In the third stage of the algorithm, the multiplications decrease by one
134  * for every iteration. */
135 
136  /* The algorithm is implemented in three stages.
137  The loop counters of each stage is initiated here. */
138  blockSize1 = srcBLen - 1u;
139  blockSize2 = srcALen - (srcBLen - 1u);
140  blockSize3 = blockSize1;
141 
142  /* --------------------------
143  * Initializations of stage1
144  * -------------------------*/
145 
146  /* sum = x[0] * y[0]
147  * sum = x[0] * y[1] + x[1] * y[0]
148  * ....
149  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
150  */
151 
152  /* In this stage the MAC operations are increased by 1 for every iteration.
153  The count variable holds the number of MAC operations performed */
154  count = 1u;
155 
156  /* Working pointer of inputA */
157  px = pIn1;
158 
159  /* Working pointer of inputB */
160  py = pIn2;
161 
162 
163  /* ------------------------
164  * Stage1 process
165  * ----------------------*/
166 
167  /* The first stage starts here */
168  while(blockSize1 > 0u)
169  {
170  /* Accumulator is made zero for every iteration */
171  sum = 0;
172 
173  /* Apply loop unrolling and compute 4 MACs simultaneously. */
174  k = count >> 2u;
175 
176  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
177  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
178  while(k > 0u)
179  {
180  /* x[0] * y[srcBLen - 1] */
181  sum += (q63_t) * px++ * (*py--);
182  /* x[1] * y[srcBLen - 2] */
183  sum += (q63_t) * px++ * (*py--);
184  /* x[2] * y[srcBLen - 3] */
185  sum += (q63_t) * px++ * (*py--);
186  /* x[3] * y[srcBLen - 4] */
187  sum += (q63_t) * px++ * (*py--);
188 
189  /* Decrement the loop counter */
190  k--;
191  }
192 
193  /* If the count is not a multiple of 4, compute any remaining MACs here.
194  ** No loop unrolling is used. */
195  k = count % 0x4u;
196 
197  while(k > 0u)
198  {
199  /* Perform the multiply-accumulate */
200  sum += (q63_t) * px++ * (*py--);
201 
202  /* Decrement the loop counter */
203  k--;
204  }
205 
206  /* Store the result in the accumulator in the destination buffer. */
207  *pOut++ = (q31_t) (sum >> 31);
208 
209  /* Update the inputA and inputB pointers for next MAC calculation */
210  py = pIn2 + count;
211  px = pIn1;
212 
213  /* Increment the MAC count */
214  count++;
215 
216  /* Decrement the loop counter */
217  blockSize1--;
218  }
219 
220  /* --------------------------
221  * Initializations of stage2
222  * ------------------------*/
223 
224  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
225  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
226  * ....
227  * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
228  */
229 
230  /* Working pointer of inputA */
231  px = pIn1;
232 
233  /* Working pointer of inputB */
234  pSrc2 = pIn2 + (srcBLen - 1u);
235  py = pSrc2;
236 
237  /* count is index by which the pointer pIn1 to be incremented */
238  count = 0u;
239 
240  /* -------------------
241  * Stage2 process
242  * ------------------*/
243 
244  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
245  * So, to loop unroll over blockSize2,
246  * srcBLen should be greater than or equal to 4 */
247  if(srcBLen >= 4u)
248  {
249  /* Loop unroll by 3 */
250  blkCnt = blockSize2 / 3;
251 
252  while(blkCnt > 0u)
253  {
254  /* Set all accumulators to zero */
255  acc0 = 0;
256  acc1 = 0;
257  acc2 = 0;
258 
259  /* read x[0], x[1], x[2] samples */
260  x0 = *(px++);
261  x1 = *(px++);
262 
263  /* Apply loop unrolling and compute 3 MACs simultaneously. */
264  k = srcBLen / 3;
265 
266  /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
267  ** a second loop below computes MACs for the remaining 1 to 2 samples. */
268  do
269  {
270  /* Read y[srcBLen - 1] sample */
271  c0 = *(py);
272 
273  /* Read x[3] sample */
274  x2 = *(px);
275 
276  /* Perform the multiply-accumulates */
277  /* acc0 += x[0] * y[srcBLen - 1] */
278  acc0 += ((q63_t) x0 * c0);
279  /* acc1 += x[1] * y[srcBLen - 1] */
280  acc1 += ((q63_t) x1 * c0);
281  /* acc2 += x[2] * y[srcBLen - 1] */
282  acc2 += ((q63_t) x2 * c0);
283 
284  /* Read y[srcBLen - 2] sample */
285  c0 = *(py - 1u);
286 
287  /* Read x[4] sample */
288  x0 = *(px + 1u);
289 
290  /* Perform the multiply-accumulate */
291  /* acc0 += x[1] * y[srcBLen - 2] */
292  acc0 += ((q63_t) x1 * c0);
293  /* acc1 += x[2] * y[srcBLen - 2] */
294  acc1 += ((q63_t) x2 * c0);
295  /* acc2 += x[3] * y[srcBLen - 2] */
296  acc2 += ((q63_t) x0 * c0);
297 
298  /* Read y[srcBLen - 3] sample */
299  c0 = *(py - 2u);
300 
301  /* Read x[5] sample */
302  x1 = *(px + 2u);
303 
304  /* Perform the multiply-accumulates */
305  /* acc0 += x[2] * y[srcBLen - 3] */
306  acc0 += ((q63_t) x2 * c0);
307  /* acc1 += x[3] * y[srcBLen - 2] */
308  acc1 += ((q63_t) x0 * c0);
309  /* acc2 += x[4] * y[srcBLen - 2] */
310  acc2 += ((q63_t) x1 * c0);
311 
312  /* update scratch pointers */
313  px += 3u;
314  py -= 3u;
315 
316  } while(--k);
317 
318  /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
319  ** No loop unrolling is used. */
320  k = srcBLen - (3 * (srcBLen / 3));
321 
322  while(k > 0u)
323  {
324  /* Read y[srcBLen - 5] sample */
325  c0 = *(py--);
326 
327  /* Read x[7] sample */
328  x2 = *(px++);
329 
330  /* Perform the multiply-accumulates */
331  /* acc0 += x[4] * y[srcBLen - 5] */
332  acc0 += ((q63_t) x0 * c0);
333  /* acc1 += x[5] * y[srcBLen - 5] */
334  acc1 += ((q63_t) x1 * c0);
335  /* acc2 += x[6] * y[srcBLen - 5] */
336  acc2 += ((q63_t) x2 * c0);
337 
338  /* Reuse the present samples for the next MAC */
339  x0 = x1;
340  x1 = x2;
341 
342  /* Decrement the loop counter */
343  k--;
344  }
345 
346  /* Store the results in the accumulators in the destination buffer. */
347  *pOut++ = (q31_t) (acc0 >> 31);
348  *pOut++ = (q31_t) (acc1 >> 31);
349  *pOut++ = (q31_t) (acc2 >> 31);
350 
351  /* Increment the pointer pIn1 index, count by 3 */
352  count += 3u;
353 
354  /* Update the inputA and inputB pointers for next MAC calculation */
355  px = pIn1 + count;
356  py = pSrc2;
357 
358  /* Decrement the loop counter */
359  blkCnt--;
360  }
361 
362  /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
363  ** No loop unrolling is used. */
364  blkCnt = blockSize2 - 3 * (blockSize2 / 3);
365 
366  while(blkCnt > 0u)
367  {
368  /* Accumulator is made zero for every iteration */
369  sum = 0;
370 
371  /* Apply loop unrolling and compute 4 MACs simultaneously. */
372  k = srcBLen >> 2u;
373 
374  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
375  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
376  while(k > 0u)
377  {
378  /* Perform the multiply-accumulates */
379  sum += (q63_t) * px++ * (*py--);
380  sum += (q63_t) * px++ * (*py--);
381  sum += (q63_t) * px++ * (*py--);
382  sum += (q63_t) * px++ * (*py--);
383 
384  /* Decrement the loop counter */
385  k--;
386  }
387 
388  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
389  ** No loop unrolling is used. */
390  k = srcBLen % 0x4u;
391 
392  while(k > 0u)
393  {
394  /* Perform the multiply-accumulate */
395  sum += (q63_t) * px++ * (*py--);
396 
397  /* Decrement the loop counter */
398  k--;
399  }
400 
401  /* Store the result in the accumulator in the destination buffer. */
402  *pOut++ = (q31_t) (sum >> 31);
403 
404  /* Increment the MAC count */
405  count++;
406 
407  /* Update the inputA and inputB pointers for next MAC calculation */
408  px = pIn1 + count;
409  py = pSrc2;
410 
411  /* Decrement the loop counter */
412  blkCnt--;
413  }
414  }
415  else
416  {
417  /* If the srcBLen is not a multiple of 4,
418  * the blockSize2 loop cannot be unrolled by 4 */
419  blkCnt = blockSize2;
420 
421  while(blkCnt > 0u)
422  {
423  /* Accumulator is made zero for every iteration */
424  sum = 0;
425 
426  /* srcBLen number of MACS should be performed */
427  k = srcBLen;
428 
429  while(k > 0u)
430  {
431  /* Perform the multiply-accumulate */
432  sum += (q63_t) * px++ * (*py--);
433 
434  /* Decrement the loop counter */
435  k--;
436  }
437 
438  /* Store the result in the accumulator in the destination buffer. */
439  *pOut++ = (q31_t) (sum >> 31);
440 
441  /* Increment the MAC count */
442  count++;
443 
444  /* Update the inputA and inputB pointers for next MAC calculation */
445  px = pIn1 + count;
446  py = pSrc2;
447 
448  /* Decrement the loop counter */
449  blkCnt--;
450  }
451  }
452 
453 
454  /* --------------------------
455  * Initializations of stage3
456  * -------------------------*/
457 
458  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
459  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
460  * ....
461  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
462  * sum += x[srcALen-1] * y[srcBLen-1]
463  */
464 
465  /* In this stage the MAC operations are decreased by 1 for every iteration.
466  The blockSize3 variable holds the number of MAC operations performed */
467 
468  /* Working pointer of inputA */
469  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
470  px = pSrc1;
471 
472  /* Working pointer of inputB */
473  pSrc2 = pIn2 + (srcBLen - 1u);
474  py = pSrc2;
475 
476  /* -------------------
477  * Stage3 process
478  * ------------------*/
479 
480  while(blockSize3 > 0u)
481  {
482  /* Accumulator is made zero for every iteration */
483  sum = 0;
484 
485  /* Apply loop unrolling and compute 4 MACs simultaneously. */
486  k = blockSize3 >> 2u;
487 
488  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
489  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
490  while(k > 0u)
491  {
492  /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
493  sum += (q63_t) * px++ * (*py--);
494  /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
495  sum += (q63_t) * px++ * (*py--);
496  /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
497  sum += (q63_t) * px++ * (*py--);
498  /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
499  sum += (q63_t) * px++ * (*py--);
500 
501  /* Decrement the loop counter */
502  k--;
503  }
504 
505  /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
506  ** No loop unrolling is used. */
507  k = blockSize3 % 0x4u;
508 
509  while(k > 0u)
510  {
511  /* Perform the multiply-accumulate */
512  sum += (q63_t) * px++ * (*py--);
513 
514  /* Decrement the loop counter */
515  k--;
516  }
517 
518  /* Store the result in the accumulator in the destination buffer. */
519  *pOut++ = (q31_t) (sum >> 31);
520 
521  /* Update the inputA and inputB pointers for next MAC calculation */
522  px = ++pSrc1;
523  py = pSrc2;
524 
525  /* Decrement the loop counter */
526  blockSize3--;
527  }
528 
529 #else
530 
531  /* Run the below code for Cortex-M0 */
532 
533  q31_t *pIn1 = pSrcA; /* input pointer */
534  q31_t *pIn2 = pSrcB; /* coefficient pointer */
535  q63_t sum; /* Accumulator */
536  uint32_t i, j; /* loop counter */
537 
538  /* Loop to calculate output of convolution for output length number of times */
539  for (i = 0; i < (srcALen + srcBLen - 1); i++)
540  {
541  /* Initialize sum with zero to carry on MAC operations */
542  sum = 0;
543 
544  /* Loop to perform MAC operations according to convolution equation */
545  for (j = 0; j <= i; j++)
546  {
547  /* Check the array limitations */
548  if(((i - j) < srcBLen) && (j < srcALen))
549  {
550  /* z[i] += x[i-j] * y[j] */
551  sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
552  }
553  }
554 
555  /* Store the output in the destination buffer */
556  pDst[i] = (q31_t) (sum >> 31u);
557  }
558 
559 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
560 
561 }
562 
void arm_conv_q31(q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
Convolution of Q31 sequences.
Definition: arm_conv_q31.c:78
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397