STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_q7.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_q7.c
9 *
10 * Description: Convolution of Q7 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
77  q7_t * pSrcA,
78  uint32_t srcALen,
79  q7_t * pSrcB,
80  uint32_t srcBLen,
81  q7_t * pDst)
82 {
83 
84 
85 #ifndef ARM_MATH_CM0_FAMILY
86 
87  /* Run the below code for Cortex-M4 and Cortex-M3 */
88 
89  q7_t *pIn1; /* inputA pointer */
90  q7_t *pIn2; /* inputB pointer */
91  q7_t *pOut = pDst; /* output pointer */
92  q7_t *px; /* Intermediate inputA pointer */
93  q7_t *py; /* Intermediate inputB pointer */
94  q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
95  q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
96  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
97  q31_t input1, input2; /* Temporary input variables */
98  q15_t in1, in2; /* Temporary input variables */
99  uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
100 
101  /* The algorithm implementation is based on the lengths of the inputs. */
102  /* srcB is always made to slide across srcA. */
103  /* So srcBLen is always considered as shorter or equal to srcALen */
104  if(srcALen >= srcBLen)
105  {
106  /* Initialization of inputA pointer */
107  pIn1 = pSrcA;
108 
109  /* Initialization of inputB pointer */
110  pIn2 = pSrcB;
111  }
112  else
113  {
114  /* Initialization of inputA pointer */
115  pIn1 = pSrcB;
116 
117  /* Initialization of inputB pointer */
118  pIn2 = pSrcA;
119 
120  /* srcBLen is always considered as shorter or equal to srcALen */
121  j = srcBLen;
122  srcBLen = srcALen;
123  srcALen = j;
124  }
125 
126  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
127  /* The function is internally
128  * divided into three stages according to the number of multiplications that have to
129  * take place between inputA samples and inputB samples. In the first stage of the
130  * algorithm, the multiplications increase by one for every iteration.
131  * In the second stage of the algorithm, srcBLen number of multiplications are done.
132  * In the third stage of the algorithm, the multiplications decrease by one
133  * for every iteration. */
134 
135  /* The algorithm is implemented in three stages.
136  The loop counters of each stage are initialized here. */
137  blockSize1 = srcBLen - 1u;
138  blockSize2 = (srcALen - srcBLen) + 1u;
139  blockSize3 = blockSize1;
140 
141  /* --------------------------
142  * Initializations of stage1
143  * -------------------------*/
144 
145  /* sum = x[0] * y[0]
146  * sum = x[0] * y[1] + x[1] * y[0]
147  * ....
148  * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
149  */
150 
151  /* In this stage the MAC operations are increased by 1 for every iteration.
152  The count variable holds the number of MAC operations performed */
153  count = 1u;
154 
155  /* Working pointer of inputA */
156  px = pIn1;
157 
158  /* Working pointer of inputB */
159  py = pIn2;
160 
161 
162  /* ------------------------
163  * Stage1 process
164  * ----------------------*/
165 
166  /* The first stage starts here */
167  while(blockSize1 > 0u)
168  {
169  /* Accumulator is made zero for every iteration */
170  sum = 0;
171 
172  /* Apply loop unrolling and compute 4 MACs simultaneously. */
173  k = count >> 2u;
174 
175  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
176  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
177  while(k > 0u)
178  {
179  /* x[0] , x[1] */
180  in1 = (q15_t) * px++;
181  in2 = (q15_t) * px++;
182  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
183 
184  /* y[count - 1] , y[count - 2] */
185  in1 = (q15_t) * py--;
186  in2 = (q15_t) * py--;
187  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
188 
189  /* x[0] * y[count - 1] */
190  /* x[1] * y[count - 2] */
191  sum = __SMLAD(input1, input2, sum);
192 
193  /* x[2] , x[3] */
194  in1 = (q15_t) * px++;
195  in2 = (q15_t) * px++;
196  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
197 
198  /* y[count - 3] , y[count - 4] */
199  in1 = (q15_t) * py--;
200  in2 = (q15_t) * py--;
201  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
202 
203  /* x[2] * y[count - 3] */
204  /* x[3] * y[count - 4] */
205  sum = __SMLAD(input1, input2, sum);
206 
207  /* Decrement the loop counter */
208  k--;
209  }
210 
211  /* If the count is not a multiple of 4, compute any remaining MACs here.
212  ** No loop unrolling is used. */
213  k = count % 0x4u;
214 
215  while(k > 0u)
216  {
217  /* Perform the multiply-accumulates */
218  sum += ((q15_t) * px++ * *py--);
219 
220  /* Decrement the loop counter */
221  k--;
222  }
223 
224  /* Store the result in the accumulator in the destination buffer. */
225  *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
226 
227  /* Update the inputA and inputB pointers for next MAC calculation */
228  py = pIn2 + count;
229  px = pIn1;
230 
231  /* Increment the MAC count */
232  count++;
233 
234  /* Decrement the loop counter */
235  blockSize1--;
236  }
237 
238  /* --------------------------
239  * Initializations of stage2
240  * ------------------------*/
241 
242  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
243  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
244  * ....
245  * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
246  */
247 
248  /* Working pointer of inputA */
249  px = pIn1;
250 
251  /* Working pointer of inputB */
252  pSrc2 = pIn2 + (srcBLen - 1u);
253  py = pSrc2;
254 
255  /* count is the index by which the pointer pIn1 is to be incremented */
256  count = 0u;
257 
258  /* -------------------
259  * Stage2 process
260  * ------------------*/
261 
262  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
263  * So, to loop unroll over blockSize2,
264  * srcBLen should be greater than or equal to 4 */
265  if(srcBLen >= 4u)
266  {
267  /* Loop unroll over blockSize2, by 4 */
268  blkCnt = blockSize2 >> 2u;
269 
270  while(blkCnt > 0u)
271  {
272  /* Set all accumulators to zero */
273  acc0 = 0;
274  acc1 = 0;
275  acc2 = 0;
276  acc3 = 0;
277 
278  /* read x[0], x[1], x[2] samples */
279  x0 = *(px++);
280  x1 = *(px++);
281  x2 = *(px++);
282 
283  /* Apply loop unrolling and compute 4 MACs simultaneously. */
284  k = srcBLen >> 2u;
285 
286  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
287  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
288  do
289  {
290  /* Read y[srcBLen - 1] sample */
291  c0 = *(py--);
292  /* Read y[srcBLen - 2] sample */
293  c1 = *(py--);
294 
295  /* Read x[3] sample */
296  x3 = *(px++);
297 
298  /* x[0] and x[1] are packed */
299  in1 = (q15_t) x0;
300  in2 = (q15_t) x1;
301 
302  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
303 
304  /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
305  in1 = (q15_t) c0;
306  in2 = (q15_t) c1;
307 
308  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
309 
310  /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
311  acc0 = __SMLAD(input1, input2, acc0);
312 
313  /* x[1] and x[2] are packed */
314  in1 = (q15_t) x1;
315  in2 = (q15_t) x2;
316 
317  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
318 
319  /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
320  acc1 = __SMLAD(input1, input2, acc1);
321 
322  /* x[2] and x[3] are packed */
323  in1 = (q15_t) x2;
324  in2 = (q15_t) x3;
325 
326  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
327 
328  /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
329  acc2 = __SMLAD(input1, input2, acc2);
330 
331  /* Read x[4] sample */
332  x0 = *(px++);
333 
334  /* x[3] and x[4] are packed */
335  in1 = (q15_t) x3;
336  in2 = (q15_t) x0;
337 
338  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
339 
340  /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
341  acc3 = __SMLAD(input1, input2, acc3);
342 
343  /* Read y[srcBLen - 3] sample */
344  c0 = *(py--);
345  /* Read y[srcBLen - 4] sample */
346  c1 = *(py--);
347 
348  /* Read x[5] sample */
349  x1 = *(px++);
350 
351  /* x[2] and x[3] are packed */
352  in1 = (q15_t) x2;
353  in2 = (q15_t) x3;
354 
355  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
356 
357  /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
358  in1 = (q15_t) c0;
359  in2 = (q15_t) c1;
360 
361  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
362 
363  /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
364  acc0 = __SMLAD(input1, input2, acc0);
365 
366  /* x[3] and x[4] are packed */
367  in1 = (q15_t) x3;
368  in2 = (q15_t) x0;
369 
370  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
371 
372  /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
373  acc1 = __SMLAD(input1, input2, acc1);
374 
375  /* x[4] and x[5] are packed */
376  in1 = (q15_t) x0;
377  in2 = (q15_t) x1;
378 
379  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
380 
381  /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
382  acc2 = __SMLAD(input1, input2, acc2);
383 
384  /* Read x[6] sample */
385  x2 = *(px++);
386 
387  /* x[5] and x[6] are packed */
388  in1 = (q15_t) x1;
389  in2 = (q15_t) x2;
390 
391  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
392 
393  /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
394  acc3 = __SMLAD(input1, input2, acc3);
395 
396  } while(--k);
397 
398  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
399  ** No loop unrolling is used. */
400  k = srcBLen % 0x4u;
401 
402  while(k > 0u)
403  {
404  /* Read y[srcBLen - 5] sample */
405  c0 = *(py--);
406 
407  /* Read x[7] sample */
408  x3 = *(px++);
409 
410  /* Perform the multiply-accumulates */
411  /* acc0 += x[4] * y[srcBLen - 5] */
412  acc0 += ((q15_t) x0 * c0);
413  /* acc1 += x[5] * y[srcBLen - 5] */
414  acc1 += ((q15_t) x1 * c0);
415  /* acc2 += x[6] * y[srcBLen - 5] */
416  acc2 += ((q15_t) x2 * c0);
417  /* acc3 += x[7] * y[srcBLen - 5] */
418  acc3 += ((q15_t) x3 * c0);
419 
420  /* Reuse the present samples for the next MAC */
421  x0 = x1;
422  x1 = x2;
423  x2 = x3;
424 
425  /* Decrement the loop counter */
426  k--;
427  }
428 
429 
430  /* Store the result in the accumulator in the destination buffer. */
431  *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
432  *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
433  *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
434  *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
435 
436  /* Increment the pointer pIn1 index, count by 4 */
437  count += 4u;
438 
439  /* Update the inputA and inputB pointers for next MAC calculation */
440  px = pIn1 + count;
441  py = pSrc2;
442 
443  /* Decrement the loop counter */
444  blkCnt--;
445  }
446 
447  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
448  ** No loop unrolling is used. */
449  blkCnt = blockSize2 % 0x4u;
450 
451  while(blkCnt > 0u)
452  {
453  /* Accumulator is made zero for every iteration */
454  sum = 0;
455 
456  /* Apply loop unrolling and compute 4 MACs simultaneously. */
457  k = srcBLen >> 2u;
458 
459  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
460  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
461  while(k > 0u)
462  {
463 
464  /* Reading two inputs of SrcA buffer and packing */
465  in1 = (q15_t) * px++;
466  in2 = (q15_t) * px++;
467  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
468 
469  /* Reading two inputs of SrcB buffer and packing */
470  in1 = (q15_t) * py--;
471  in2 = (q15_t) * py--;
472  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
473 
474  /* Perform the multiply-accumulates */
475  sum = __SMLAD(input1, input2, sum);
476 
477  /* Reading two inputs of SrcA buffer and packing */
478  in1 = (q15_t) * px++;
479  in2 = (q15_t) * px++;
480  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
481 
482  /* Reading two inputs of SrcB buffer and packing */
483  in1 = (q15_t) * py--;
484  in2 = (q15_t) * py--;
485  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
486 
487  /* Perform the multiply-accumulates */
488  sum = __SMLAD(input1, input2, sum);
489 
490  /* Decrement the loop counter */
491  k--;
492  }
493 
494  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
495  ** No loop unrolling is used. */
496  k = srcBLen % 0x4u;
497 
498  while(k > 0u)
499  {
500  /* Perform the multiply-accumulates */
501  sum += ((q15_t) * px++ * *py--);
502 
503  /* Decrement the loop counter */
504  k--;
505  }
506 
507  /* Store the result in the accumulator in the destination buffer. */
508  *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
509 
510  /* Increment the pointer pIn1 index, count by 1 */
511  count++;
512 
513  /* Update the inputA and inputB pointers for next MAC calculation */
514  px = pIn1 + count;
515  py = pSrc2;
516 
517  /* Decrement the loop counter */
518  blkCnt--;
519  }
520  }
521  else
522  {
523  /* If srcBLen is less than 4,
524  * the blockSize2 loop is not unrolled by 4 */
525  blkCnt = blockSize2;
526 
527  while(blkCnt > 0u)
528  {
529  /* Accumulator is made zero for every iteration */
530  sum = 0;
531 
532  /* srcBLen number of MACS should be performed */
533  k = srcBLen;
534 
535  while(k > 0u)
536  {
537  /* Perform the multiply-accumulate */
538  sum += ((q15_t) * px++ * *py--);
539 
540  /* Decrement the loop counter */
541  k--;
542  }
543 
544  /* Store the result in the accumulator in the destination buffer. */
545  *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
546 
547  /* Increment the MAC count */
548  count++;
549 
550  /* Update the inputA and inputB pointers for next MAC calculation */
551  px = pIn1 + count;
552  py = pSrc2;
553 
554  /* Decrement the loop counter */
555  blkCnt--;
556  }
557  }
558 
559 
560  /* --------------------------
561  * Initializations of stage3
562  * -------------------------*/
563 
564  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
565  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
566  * ....
567  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
568  * sum += x[srcALen-1] * y[srcBLen-1]
569  */
570 
571  /* In this stage the MAC operations are decreased by 1 for every iteration.
572  The blockSize3 variable holds the number of MAC operations performed */
573 
574  /* Working pointer of inputA */
575  pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
576  px = pSrc1;
577 
578  /* Working pointer of inputB */
579  pSrc2 = pIn2 + (srcBLen - 1u);
580  py = pSrc2;
581 
582  /* -------------------
583  * Stage3 process
584  * ------------------*/
585 
586  while(blockSize3 > 0u)
587  {
588  /* Accumulator is made zero for every iteration */
589  sum = 0;
590 
591  /* Apply loop unrolling and compute 4 MACs simultaneously. */
592  k = blockSize3 >> 2u;
593 
594  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
595  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
596  while(k > 0u)
597  {
598  /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
599  in1 = (q15_t) * px++;
600  in2 = (q15_t) * px++;
601  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
602 
603  /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
604  in1 = (q15_t) * py--;
605  in2 = (q15_t) * py--;
606  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
607 
608  /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
609  /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
610  sum = __SMLAD(input1, input2, sum);
611 
612  /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
613  in1 = (q15_t) * px++;
614  in2 = (q15_t) * px++;
615  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
616 
617  /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
618  in1 = (q15_t) * py--;
619  in2 = (q15_t) * py--;
620  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
621 
622  /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
623  /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
624  sum = __SMLAD(input1, input2, sum);
625 
626  /* Decrement the loop counter */
627  k--;
628  }
629 
630  /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
631  ** No loop unrolling is used. */
632  k = blockSize3 % 0x4u;
633 
634  while(k > 0u)
635  {
636  /* Perform the multiply-accumulates */
637  sum += ((q15_t) * px++ * *py--);
638 
639  /* Decrement the loop counter */
640  k--;
641  }
642 
643  /* Store the result in the accumulator in the destination buffer. */
644  *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
645 
646  /* Update the inputA and inputB pointers for next MAC calculation */
647  px = ++pSrc1;
648  py = pSrc2;
649 
650  /* Decrement the loop counter */
651  blockSize3--;
652  }
653 
654 #else
655 
656  /* Run the below code for Cortex-M0 */
657 
658  q7_t *pIn1 = pSrcA; /* input pointer */
659  q7_t *pIn2 = pSrcB; /* coefficient pointer */
660  q31_t sum; /* Accumulator */
661  uint32_t i, j; /* loop counter */
662 
663  /* Loop to calculate output of convolution for output length number of times */
664  for (i = 0; i < (srcALen + srcBLen - 1); i++)
665  {
666  /* Initialize sum with zero to carry on MAC operations */
667  sum = 0;
668 
669  /* Loop to perform MAC operations according to convolution equation */
670  for (j = 0; j <= i; j++)
671  {
672  /* Check the array limitations */
673  if(((i - j) < srcBLen) && (j < srcALen))
674  {
675  /* z[i] += x[j] * y[i-j] */
676  sum += (q15_t) pIn1[j] * (pIn2[i - j]);
677  }
678  }
679 
680  /* Store the output in the destination buffer */
681  pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
682  }
683 
684 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
685 
686 }
687 
int8_t q7_t
8-bit fractional data type in 1.7 format.
Definition: arm_math.h:387
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_conv_q7(q7_t *pSrcA, uint32_t srcALen, q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst)
Convolution of Q7 sequences.
Definition: arm_conv_q7.c:76