STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_opt_q7.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_opt_q7.c
9 *
10 * Description: Convolution of Q7 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
80  q7_t * pSrcA,
81  uint32_t srcALen,
82  q7_t * pSrcB,
83  uint32_t srcBLen,
84  q7_t * pDst,
85  q15_t * pScratch1,
86  q15_t * pScratch2)
87 {
88 
89  q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
90  q15_t x4; /* Temporary input variable */
91  q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
92  uint32_t j, k, blkCnt, tapCnt; /* loop counter */
93  q7_t *px; /* Temporary input1 pointer */
94  q15_t *py; /* Temporary input2 pointer */
95  q31_t acc0, acc1, acc2, acc3; /* Accumulator */
96  q31_t x1, x2, x3, y1; /* Temporary input variables */
97  q7_t *pOut = pDst; /* output pointer */
98  q7_t out0, out1, out2, out3; /* temporary variables */
99 
100  /* The algorithm implementation is based on the lengths of the inputs. */
101  /* srcB is always made to slide across srcA. */
102  /* So srcBLen is always considered as shorter or equal to srcALen */
103  if(srcALen >= srcBLen)
104  {
105  /* Initialization of inputA pointer */
106  pIn1 = pSrcA;
107 
108  /* Initialization of inputB pointer */
109  pIn2 = pSrcB;
110  }
111  else
112  {
113  /* Initialization of inputA pointer */
114  pIn1 = pSrcB;
115 
116  /* Initialization of inputB pointer */
117  pIn2 = pSrcA;
118 
119  /* srcBLen is always considered as shorter or equal to srcALen */
120  j = srcBLen;
121  srcBLen = srcALen;
122  srcALen = j;
123  }
124 
125  /* pScr2 starts at the beginning of the scratch2 buffer */
126  pScr2 = pScratch2;
127 
128  /* px points to the last sample of the smaller length sequence */
129  px = pIn2 + srcBLen - 1;
130 
131  /* Apply loop unrolling and do 4 Copies simultaneously. */
132  k = srcBLen >> 2u;
133 
134  /* First part of the processing with loop unrolling copies 4 data points at a time.
135  ** a second loop below copies for the remaining 1 to 3 samples. */
136  while(k > 0u)
137  {
138  /* copy second buffer in reversal manner */
139  x4 = (q15_t) * px--;
140  *pScr2++ = x4;
141  x4 = (q15_t) * px--;
142  *pScr2++ = x4;
143  x4 = (q15_t) * px--;
144  *pScr2++ = x4;
145  x4 = (q15_t) * px--;
146  *pScr2++ = x4;
147 
148  /* Decrement the loop counter */
149  k--;
150  }
151 
152  /* If the count is not a multiple of 4, copy remaining samples here.
153  ** No loop unrolling is used. */
154  k = srcBLen % 0x4u;
155 
156  while(k > 0u)
157  {
158  /* copy second buffer in reversal manner for remaining samples */
159  x4 = (q15_t) * px--;
160  *pScr2++ = x4;
161 
162  /* Decrement the loop counter */
163  k--;
164  }
165 
166  /* Initialize temporary scratch pointer */
167  pScr1 = pScratch1;
168 
169  /* Fill (srcBLen - 1u) zeros in scratch buffer */
170  arm_fill_q15(0, pScr1, (srcBLen - 1u));
171 
172  /* Update temporary scratch pointer */
173  pScr1 += (srcBLen - 1u);
174 
175  /* Copy (srcALen) samples in scratch buffer */
176  /* Apply loop unrolling and do 4 Copies simultaneously. */
177  k = srcALen >> 2u;
178 
179  /* First part of the processing with loop unrolling copies 4 data points at a time.
180  ** a second loop below copies for the remaining 1 to 3 samples. */
181  while(k > 0u)
182  {
183  /* copy the longer sequence in forward order, widening each sample to q15 */
184  x4 = (q15_t) * pIn1++;
185  *pScr1++ = x4;
186  x4 = (q15_t) * pIn1++;
187  *pScr1++ = x4;
188  x4 = (q15_t) * pIn1++;
189  *pScr1++ = x4;
190  x4 = (q15_t) * pIn1++;
191  *pScr1++ = x4;
192 
193  /* Decrement the loop counter */
194  k--;
195  }
196 
197  /* If the count is not a multiple of 4, copy remaining samples here.
198  ** No loop unrolling is used. */
199  k = srcALen % 0x4u;
200 
201  while(k > 0u)
202  {
203  /* copy the remaining samples of the longer sequence in forward order */
204  x4 = (q15_t) * pIn1++;
205  *pScr1++ = x4;
206 
207  /* Decrement the loop counter */
208  k--;
209  }
210 
211 #ifndef UNALIGNED_SUPPORT_DISABLE
212 
213  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
214  arm_fill_q15(0, pScr1, (srcBLen - 1u));
215 
216  /* Update pointer */
217  pScr1 += (srcBLen - 1u);
218 
219 #else
220 
221  /* Apply loop unrolling and do 4 Copies simultaneously. */
222  k = (srcBLen - 1u) >> 2u;
223 
224  /* First part of the processing with loop unrolling copies 4 data points at a time.
225  ** a second loop below copies for the remaining 1 to 3 samples. */
226  while(k > 0u)
227  {
228  /* write four zeros at the end of the scratch buffer */
229  *pScr1++ = 0;
230  *pScr1++ = 0;
231  *pScr1++ = 0;
232  *pScr1++ = 0;
233 
234  /* Decrement the loop counter */
235  k--;
236  }
237 
238  /* If the count is not a multiple of 4, copy remaining samples here.
239  ** No loop unrolling is used. */
240  k = (srcBLen - 1u) % 0x4u;
241 
242  while(k > 0u)
243  {
244  /* write the remaining zeros at the end of the scratch buffer */
245  *pScr1++ = 0;
246 
247  /* Decrement the loop counter */
248  k--;
249  }
250 
251 #endif
252 
253  /* Temporary pointer for scratch2 */
254  py = pScratch2;
255 
256  /* Initialization of pIn2 pointer */
257  pIn2 = (q7_t *) py;
258 
259  pScr2 = py;
260 
261  /* Actual convolution process starts here */
262  blkCnt = (srcALen + srcBLen - 1u) >> 2;
263 
264  while(blkCnt > 0)
265  {
266  /* Initialize temporary scratch pointer as scratch1 */
267  pScr1 = pScratch1;
268 
269  /* Clear Accumulators */
270  acc0 = 0;
271  acc1 = 0;
272  acc2 = 0;
273  acc3 = 0;
274 
275  /* Read two samples from scratch1 buffer */
276  x1 = *__SIMD32(pScr1)++;
277 
278  /* Read next two samples from scratch1 buffer */
279  x2 = *__SIMD32(pScr1)++;
280 
281  tapCnt = (srcBLen) >> 2u;
282 
283  while(tapCnt > 0u)
284  {
285 
286  /* Read two samples (one 32-bit word) from smaller buffer */
287  y1 = _SIMD32_OFFSET(pScr2);
288 
289  /* multiply and accumulate */
290  acc0 = __SMLAD(x1, y1, acc0);
291  acc2 = __SMLAD(x2, y1, acc2);
292 
293  /* pack input data */
294 #ifndef ARM_MATH_BIG_ENDIAN
295  x3 = __PKHBT(x2, x1, 0);
296 #else
297  x3 = __PKHBT(x1, x2, 0);
298 #endif
299 
300  /* multiply and accumulate */
301  acc1 = __SMLADX(x3, y1, acc1);
302 
303  /* Read next two samples from scratch1 buffer */
304  x1 = *__SIMD32(pScr1)++;
305 
306  /* pack input data */
307 #ifndef ARM_MATH_BIG_ENDIAN
308  x3 = __PKHBT(x1, x2, 0);
309 #else
310  x3 = __PKHBT(x2, x1, 0);
311 #endif
312 
313  acc3 = __SMLADX(x3, y1, acc3);
314 
315  /* Read the next two samples (one 32-bit word) from smaller buffer */
316  y1 = _SIMD32_OFFSET(pScr2 + 2u);
317 
318  acc0 = __SMLAD(x2, y1, acc0);
319 
320  acc2 = __SMLAD(x1, y1, acc2);
321 
322  acc1 = __SMLADX(x3, y1, acc1);
323 
324  x2 = *__SIMD32(pScr1)++;
325 
326 #ifndef ARM_MATH_BIG_ENDIAN
327  x3 = __PKHBT(x2, x1, 0);
328 #else
329  x3 = __PKHBT(x1, x2, 0);
330 #endif
331 
332  acc3 = __SMLADX(x3, y1, acc3);
333 
334  pScr2 += 4u;
335 
336 
337  /* Decrement the loop counter */
338  tapCnt--;
339  }
340 
341 
342 
343  /* Update scratch pointer for remaining samples of smaller length sequence */
344  pScr1 -= 4u;
345 
346 
347  /* apply same above for remaining samples of smaller length sequence */
348  tapCnt = (srcBLen) & 3u;
349 
350  while(tapCnt > 0u)
351  {
352 
353  /* accumulate the results */
354  acc0 += (*pScr1++ * *pScr2);
355  acc1 += (*pScr1++ * *pScr2);
356  acc2 += (*pScr1++ * *pScr2);
357  acc3 += (*pScr1++ * *pScr2++);
358 
359  pScr1 -= 3u;
360 
361  /* Decrement the loop counter */
362  tapCnt--;
363  }
364 
365  blkCnt--;
366 
367  /* Store the result in the accumulator in the destination buffer. */
368  out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
369  out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
370  out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
371  out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
372 
373  *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
374 
375  /* Rewind pScr2 to the start of scratch2 for the next block of outputs */
376  pScr2 = py;
377 
378  pScratch1 += 4u;
379 
380  }
381 
382 
383  blkCnt = (srcALen + srcBLen - 1u) & 0x3;
384 
385  /* Calculate the remaining (srcALen + srcBLen - 1) % 4 output samples one at a time */
386  while(blkCnt > 0)
387  {
388  /* Initialize temporary scratch pointer as scratch1 */
389  pScr1 = pScratch1;
390 
391  /* Clear Accumulator */
392  acc0 = 0;
393 
394  tapCnt = (srcBLen) >> 1u;
395 
396  while(tapCnt > 0u)
397  {
398  acc0 += (*pScr1++ * *pScr2++);
399  acc0 += (*pScr1++ * *pScr2++);
400 
401  /* Decrement the loop counter */
402  tapCnt--;
403  }
404 
405  tapCnt = (srcBLen) & 1u;
406 
407  /* apply same above for remaining samples of smaller length sequence */
408  while(tapCnt > 0u)
409  {
410 
411  /* accumulate the results */
412  acc0 += (*pScr1++ * *pScr2++);
413 
414  /* Decrement the loop counter */
415  tapCnt--;
416  }
417 
418  blkCnt--;
419 
420  /* Store the result in the accumulator in the destination buffer. */
421  *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
422 
423  /* Rewind pScr2 to the start of scratch2 for the next output sample */
424  pScr2 = py;
425 
426  pScratch1 += 1u;
427 
428  }
429 
430 }
431 
432 
int8_t q7_t
8-bit fractional data type in 1.7 format.
Definition: arm_math.h:387
void arm_conv_opt_q7(q7_t *pSrcA, uint32_t srcALen, q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
Convolution of Q7 sequences.
#define __PACKq7(v0, v1, v2, v3)
definition to pack four 8 bit values.
Definition: arm_math.h:467
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_fill_q15(q15_t value, q15_t *pDst, uint32_t blockSize)
Fills a constant value into a Q15 vector.
Definition: arm_fill_q15.c:61