STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_partial_q7.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_q7.c
9 *
10 * Description: Partial convolution of Q7 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
69  q7_t * pSrcA,
70  uint32_t srcALen,
71  q7_t * pSrcB,
72  uint32_t srcBLen,
73  q7_t * pDst,
74  uint32_t firstIndex,
75  uint32_t numPoints)
76 {
77 
78 
79 #ifndef ARM_MATH_CM0_FAMILY
80 
81  /* Run the below code for Cortex-M4 and Cortex-M3 */
82 
83  q7_t *pIn1; /* inputA pointer */
84  q7_t *pIn2; /* inputB pointer */
85  q7_t *pOut = pDst; /* output pointer */
86  q7_t *px; /* Intermediate inputA pointer */
87  q7_t *py; /* Intermediate inputB pointer */
88  q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
89  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
90  q31_t input1, input2;
91  q15_t in1, in2;
92  q7_t x0, x1, x2, x3, c0, c1;
93  uint32_t j, k, count, check, blkCnt;
94  int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
95  arm_status status;
96 
97 
98  /* Check for range of output samples to be calculated */
99  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
100  {
101  /* Set status as ARM_MATH_ARGUMENT_ERROR */
102  status = ARM_MATH_ARGUMENT_ERROR;
103  }
104  else
105  {
106 
107  /* The algorithm implementation is based on the lengths of the inputs. */
108  /* srcB is always made to slide across srcA. */
109  /* So srcBLen is always considered as shorter or equal to srcALen */
110  if(srcALen >= srcBLen)
111  {
112  /* Initialization of inputA pointer */
113  pIn1 = pSrcA;
114 
115  /* Initialization of inputB pointer */
116  pIn2 = pSrcB;
117  }
118  else
119  {
120  /* Initialization of inputA pointer */
121  pIn1 = pSrcB;
122 
123  /* Initialization of inputB pointer */
124  pIn2 = pSrcA;
125 
126  /* srcBLen is always considered as shorter or equal to srcALen */
127  j = srcBLen;
128  srcBLen = srcALen;
129  srcALen = j;
130  }
131 
132  /* Conditions to check which loopCounter holds
133  * the first and last indices of the output samples to be calculated. */
134  check = firstIndex + numPoints;
135  blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
136  blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
137  blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
138  blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
139  (int32_t) numPoints) : 0;
140  blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
141  (int32_t) firstIndex);
142  blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
143 
144  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
145  /* The function is internally
146  * divided into three stages according to the number of multiplications that have to
147  * take place between inputA samples and inputB samples. In the first stage of the
148  * algorithm, the multiplications increase by one for every iteration.
149  * In the second stage of the algorithm, srcBLen number of multiplications are done.
150  * In the third stage of the algorithm, the multiplications decrease by one
151  * for every iteration. */
152 
153  /* Set the output pointer to point to the firstIndex
154  * of the output sample to be calculated. */
155  pOut = pDst + firstIndex;
156 
157  /* --------------------------
158  * Initializations of stage1
159  * -------------------------*/
160 
161  /* sum = x[0] * y[0]
162  * sum = x[0] * y[1] + x[1] * y[0]
163  * ....
164  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
165  */
166 
167  /* In this stage the MAC operations are increased by 1 for every iteration.
168  The count variable holds the number of MAC operations performed.
169  Since the partial convolution starts from firstIndex
170  Number of Macs to be performed is firstIndex + 1 */
171  count = 1u + firstIndex;
172 
173  /* Working pointer of inputA */
174  px = pIn1;
175 
176  /* Working pointer of inputB */
177  pSrc2 = pIn2 + firstIndex;
178  py = pSrc2;
179 
180  /* ------------------------
181  * Stage1 process
182  * ----------------------*/
183 
184  /* The first stage starts here */
185  while(blockSize1 > 0)
186  {
187  /* Accumulator is made zero for every iteration */
188  sum = 0;
189 
190  /* Apply loop unrolling and compute 4 MACs simultaneously. */
191  k = count >> 2u;
192 
193  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
194  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
195  while(k > 0u)
196  {
197  /* x[0] , x[1] */
198  in1 = (q15_t) * px++;
199  in2 = (q15_t) * px++;
200  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
201 
202  /* y[srcBLen - 1] , y[srcBLen - 2] */
203  in1 = (q15_t) * py--;
204  in2 = (q15_t) * py--;
205  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
206 
207  /* x[0] * y[srcBLen - 1] */
208  /* x[1] * y[srcBLen - 2] */
209  sum = __SMLAD(input1, input2, sum);
210 
211  /* x[2] , x[3] */
212  in1 = (q15_t) * px++;
213  in2 = (q15_t) * px++;
214  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
215 
216  /* y[srcBLen - 3] , y[srcBLen - 4] */
217  in1 = (q15_t) * py--;
218  in2 = (q15_t) * py--;
219  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
220 
221  /* x[2] * y[srcBLen - 3] */
222  /* x[3] * y[srcBLen - 4] */
223  sum = __SMLAD(input1, input2, sum);
224 
225  /* Decrement the loop counter */
226  k--;
227  }
228 
229  /* If the count is not a multiple of 4, compute any remaining MACs here.
230  ** No loop unrolling is used. */
231  k = count % 0x4u;
232 
233  while(k > 0u)
234  {
235  /* Perform the multiply-accumulates */
236  sum += ((q31_t) * px++ * *py--);
237 
238  /* Decrement the loop counter */
239  k--;
240  }
241 
242  /* Store the result in the accumulator in the destination buffer. */
243  *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
244 
245  /* Update the inputA and inputB pointers for next MAC calculation */
246  py = ++pSrc2;
247  px = pIn1;
248 
249  /* Increment the MAC count */
250  count++;
251 
252  /* Decrement the loop counter */
253  blockSize1--;
254  }
255 
256  /* --------------------------
257  * Initializations of stage2
258  * ------------------------*/
259 
260  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
261  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
262  * ....
263  * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
264  */
265 
266  /* Working pointer of inputA */
267  if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
268  {
269  px = pIn1 + firstIndex - srcBLen + 1;
270  }
271  else
272  {
273  px = pIn1;
274  }
275 
276  /* Working pointer of inputB */
277  pSrc2 = pIn2 + (srcBLen - 1u);
278  py = pSrc2;
279 
280  /* count is the index by which the pointer pIn1 is to be incremented */
281  count = 0u;
282 
283  /* -------------------
284  * Stage2 process
285  * ------------------*/
286 
287  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
288  * So, to loop unroll over blockSize2,
289  * srcBLen should be greater than or equal to 4 */
290  if(srcBLen >= 4u)
291  {
292  /* Loop unroll over blockSize2, by 4 */
293  blkCnt = ((uint32_t) blockSize2 >> 2u);
294 
295  while(blkCnt > 0u)
296  {
297  /* Set all accumulators to zero */
298  acc0 = 0;
299  acc1 = 0;
300  acc2 = 0;
301  acc3 = 0;
302 
303  /* read x[0], x[1], x[2] samples */
304  x0 = *(px++);
305  x1 = *(px++);
306  x2 = *(px++);
307 
308  /* Apply loop unrolling and compute 4 MACs simultaneously. */
309  k = srcBLen >> 2u;
310 
311  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
312  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
313  do
314  {
315  /* Read y[srcBLen - 1] sample */
316  c0 = *(py--);
317  /* Read y[srcBLen - 2] sample */
318  c1 = *(py--);
319 
320  /* Read x[3] sample */
321  x3 = *(px++);
322 
323  /* x[0] and x[1] are packed */
324  in1 = (q15_t) x0;
325  in2 = (q15_t) x1;
326 
327  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
328 
329  /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
330  in1 = (q15_t) c0;
331  in2 = (q15_t) c1;
332 
333  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
334 
335  /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
336  acc0 = __SMLAD(input1, input2, acc0);
337 
338  /* x[1] and x[2] are packed */
339  in1 = (q15_t) x1;
340  in2 = (q15_t) x2;
341 
342  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
343 
344  /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
345  acc1 = __SMLAD(input1, input2, acc1);
346 
347  /* x[2] and x[3] are packed */
348  in1 = (q15_t) x2;
349  in2 = (q15_t) x3;
350 
351  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
352 
353  /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
354  acc2 = __SMLAD(input1, input2, acc2);
355 
356  /* Read x[4] sample */
357  x0 = *(px++);
358 
359  /* x[3] and x[4] are packed */
360  in1 = (q15_t) x3;
361  in2 = (q15_t) x0;
362 
363  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
364 
365  /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
366  acc3 = __SMLAD(input1, input2, acc3);
367 
368  /* Read y[srcBLen - 3] sample */
369  c0 = *(py--);
370  /* Read y[srcBLen - 4] sample */
371  c1 = *(py--);
372 
373  /* Read x[5] sample */
374  x1 = *(px++);
375 
376  /* x[2] and x[3] are packed */
377  in1 = (q15_t) x2;
378  in2 = (q15_t) x3;
379 
380  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
381 
382  /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
383  in1 = (q15_t) c0;
384  in2 = (q15_t) c1;
385 
386  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
387 
388  /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
389  acc0 = __SMLAD(input1, input2, acc0);
390 
391  /* x[3] and x[4] are packed */
392  in1 = (q15_t) x3;
393  in2 = (q15_t) x0;
394 
395  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
396 
397  /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
398  acc1 = __SMLAD(input1, input2, acc1);
399 
400  /* x[4] and x[5] are packed */
401  in1 = (q15_t) x0;
402  in2 = (q15_t) x1;
403 
404  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
405 
406  /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
407  acc2 = __SMLAD(input1, input2, acc2);
408 
409  /* Read x[6] sample */
410  x2 = *(px++);
411 
412  /* x[5] and x[6] are packed */
413  in1 = (q15_t) x1;
414  in2 = (q15_t) x2;
415 
416  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
417 
418  /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
419  acc3 = __SMLAD(input1, input2, acc3);
420 
421  } while(--k);
422 
423  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
424  ** No loop unrolling is used. */
425  k = srcBLen % 0x4u;
426 
427  while(k > 0u)
428  {
429  /* Read y[srcBLen - 5] sample */
430  c0 = *(py--);
431 
432  /* Read x[7] sample */
433  x3 = *(px++);
434 
435  /* Perform the multiply-accumulates */
436  /* acc0 += x[4] * y[srcBLen - 5] */
437  acc0 += ((q31_t) x0 * c0);
438  /* acc1 += x[5] * y[srcBLen - 5] */
439  acc1 += ((q31_t) x1 * c0);
440  /* acc2 += x[6] * y[srcBLen - 5] */
441  acc2 += ((q31_t) x2 * c0);
442  /* acc3 += x[7] * y[srcBLen - 5] */
443  acc3 += ((q31_t) x3 * c0);
444 
445  /* Reuse the present samples for the next MAC */
446  x0 = x1;
447  x1 = x2;
448  x2 = x3;
449 
450  /* Decrement the loop counter */
451  k--;
452  }
453 
454  /* Store the result in the accumulator in the destination buffer. */
455  *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
456  *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
457  *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
458  *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
459 
460  /* Increment the pointer pIn1 index, count by 4 */
461  count += 4u;
462 
463  /* Update the inputA and inputB pointers for next MAC calculation */
464  px = pIn1 + count;
465  py = pSrc2;
466 
467 
468  /* Decrement the loop counter */
469  blkCnt--;
470  }
471 
472  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
473  ** No loop unrolling is used. */
474  blkCnt = (uint32_t) blockSize2 % 0x4u;
475 
476  while(blkCnt > 0u)
477  {
478  /* Accumulator is made zero for every iteration */
479  sum = 0;
480 
481  /* Apply loop unrolling and compute 4 MACs simultaneously. */
482  k = srcBLen >> 2u;
483 
484  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
485  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
486  while(k > 0u)
487  {
488 
489  /* Reading two inputs of SrcA buffer and packing */
490  in1 = (q15_t) * px++;
491  in2 = (q15_t) * px++;
492  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
493 
494  /* Reading two inputs of SrcB buffer and packing */
495  in1 = (q15_t) * py--;
496  in2 = (q15_t) * py--;
497  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
498 
499  /* Perform the multiply-accumulates */
500  sum = __SMLAD(input1, input2, sum);
501 
502  /* Reading two inputs of SrcA buffer and packing */
503  in1 = (q15_t) * px++;
504  in2 = (q15_t) * px++;
505  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
506 
507  /* Reading two inputs of SrcB buffer and packing */
508  in1 = (q15_t) * py--;
509  in2 = (q15_t) * py--;
510  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
511 
512  /* Perform the multiply-accumulates */
513  sum = __SMLAD(input1, input2, sum);
514 
515  /* Decrement the loop counter */
516  k--;
517  }
518 
519  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
520  ** No loop unrolling is used. */
521  k = srcBLen % 0x4u;
522 
523  while(k > 0u)
524  {
525  /* Perform the multiply-accumulates */
526  sum += ((q31_t) * px++ * *py--);
527 
528  /* Decrement the loop counter */
529  k--;
530  }
531 
532  /* Store the result in the accumulator in the destination buffer. */
533  *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
534 
535  /* Increment the pointer pIn1 index, count by 1 */
536  count++;
537 
538  /* Update the inputA and inputB pointers for next MAC calculation */
539  px = pIn1 + count;
540  py = pSrc2;
541 
542  /* Decrement the loop counter */
543  blkCnt--;
544  }
545  }
546  else
547  {
548  /* If srcBLen is less than 4,
549  * the blockSize2 loop cannot be unrolled by 4 */
550  blkCnt = (uint32_t) blockSize2;
551 
552  while(blkCnt > 0u)
553  {
554  /* Accumulator is made zero for every iteration */
555  sum = 0;
556 
557  /* srcBLen number of MACS should be performed */
558  k = srcBLen;
559 
560  while(k > 0u)
561  {
562  /* Perform the multiply-accumulate */
563  sum += ((q31_t) * px++ * *py--);
564 
565  /* Decrement the loop counter */
566  k--;
567  }
568 
569  /* Store the result in the accumulator in the destination buffer. */
570  *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
571 
572  /* Increment the pointer pIn1 index, count by 1 */
573  count++;
574 
575  /* Update the inputA and inputB pointers for next MAC calculation */
576  px = pIn1 + count;
577  py = pSrc2;
578 
579  /* Decrement the loop counter */
580  blkCnt--;
581  }
582  }
583 
584 
585  /* --------------------------
586  * Initializations of stage3
587  * -------------------------*/
588 
589  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
590  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
591  * ....
592  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
593  * sum += x[srcALen-1] * y[srcBLen-1]
594  */
595 
596  /* In this stage the MAC operations are decreased by 1 for every iteration.
597  The count variable holds the number of MAC operations performed */
598  count = srcBLen - 1u;
599 
600  /* Working pointer of inputA */
601  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
602  px = pSrc1;
603 
604  /* Working pointer of inputB */
605  pSrc2 = pIn2 + (srcBLen - 1u);
606  py = pSrc2;
607 
608  /* -------------------
609  * Stage3 process
610  * ------------------*/
611 
612  while(blockSize3 > 0)
613  {
614  /* Accumulator is made zero for every iteration */
615  sum = 0;
616 
617  /* Apply loop unrolling and compute 4 MACs simultaneously. */
618  k = count >> 2u;
619 
620  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
621  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
622  while(k > 0u)
623  {
624  /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
625  in1 = (q15_t) * px++;
626  in2 = (q15_t) * px++;
627  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
628 
629  /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
630  in1 = (q15_t) * py--;
631  in2 = (q15_t) * py--;
632  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
633 
634  /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
635  /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
636  sum = __SMLAD(input1, input2, sum);
637 
638  /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
639  in1 = (q15_t) * px++;
640  in2 = (q15_t) * px++;
641  input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
642 
643  /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
644  in1 = (q15_t) * py--;
645  in2 = (q15_t) * py--;
646  input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
647 
648  /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
649  /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
650  sum = __SMLAD(input1, input2, sum);
651 
652  /* Decrement the loop counter */
653  k--;
654  }
655 
656  /* If the count is not a multiple of 4, compute any remaining MACs here.
657  ** No loop unrolling is used. */
658  k = count % 0x4u;
659 
660  while(k > 0u)
661  {
662  /* Perform the multiply-accumulates */
663  /* sum += x[srcALen-1] * y[srcBLen-1] */
664  sum += ((q31_t) * px++ * *py--);
665 
666  /* Decrement the loop counter */
667  k--;
668  }
669 
670  /* Store the result in the accumulator in the destination buffer. */
671  *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
672 
673  /* Update the inputA and inputB pointers for next MAC calculation */
674  px = ++pSrc1;
675  py = pSrc2;
676 
677  /* Decrement the MAC count */
678  count--;
679 
680  /* Decrement the loop counter */
681  blockSize3--;
682 
683  }
684 
685  /* set status as ARM_MATH_SUCCESS */
686  status = ARM_MATH_SUCCESS;
687  }
688 
689  /* Return to application */
690  return (status);
691 
692 #else
693 
694  /* Run the below code for Cortex-M0 */
695 
696  q7_t *pIn1 = pSrcA; /* inputA pointer */
697  q7_t *pIn2 = pSrcB; /* inputB pointer */
698  q31_t sum; /* Accumulator */
699  uint32_t i, j; /* loop counters */
700  arm_status status; /* status of Partial convolution */
701 
702  /* Check for range of output samples to be calculated */
703  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
704  {
705  /* Set status as ARM_MATH_ARGUMENT_ERROR */
706  status = ARM_MATH_ARGUMENT_ERROR;
707  }
708  else
709  {
710  /* Loop to calculate convolution for output length number of values */
711  for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
712  {
713  /* Initialize sum with zero to carry on MAC operations */
714  sum = 0;
715 
716  /* Loop to perform MAC operations according to convolution equation */
717  for (j = 0; j <= i; j++)
718  {
719  /* Check the array limitations */
720  if(((i - j) < srcBLen) && (j < srcALen))
721  {
722  /* z[i] += x[j] * y[i-j] */
723  sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
724  }
725  }
726 
727  /* Store the output in the destination buffer */
728  pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
729  }
730  /* set status as ARM_MATH_SUCCESS as there are no argument errors */
731  status = ARM_MATH_SUCCESS;
732  }
733  return (status);
734 
735 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
736 
737 }
738 
int8_t q7_t
8-bit fractional data type in 1.7 format.
Definition: arm_math.h:387
arm_status arm_conv_partial_q7(q7_t *pSrcA, uint32_t srcALen, q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst, uint32_t firstIndex, uint32_t numPoints)
Partial convolution of Q7 sequences.
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
arm_status
Error status returned by some functions in the library.
Definition: arm_math.h:373