STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_correlate_opt_q7.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_opt_q7.c
9 *
10 * Description: Correlation of Q7 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
/*
 * Parameter list and body of arm_correlate_opt_q7 ("Correlation of Q7
 * sequences"). The line carrying the return type and the function name is
 * not present in this listing.
 *
 * What the body does:
 *  - Treats the longer input as the sequence that is slid over. When
 *    srcBLen > srcALen the two inputs and their lengths are swapped and the
 *    results are written backwards, starting at pDst[srcALen + srcBLen - 2]
 *    (inc = -1).
 *  - Copies the shorter input, widened to q15_t, into pScratch2.
 *  - Builds pScratch1 as (srcBLen - 1) zeros, the longer input widened to
 *    q15_t, then another (srcBLen - 1) zeros.
 *  - Produces (srcALen + srcBLen - 1) outputs, four per iteration and then
 *    one at a time; each is stored as __SSAT(acc >> 7, 8).
 *
 * NOTE(review): when srcALen > srcBLen the first (srcALen - srcBLen)
 * elements of pDst are skipped and never written by the code shown here;
 * presumably the caller has to clear pDst beforehand - confirm.
 * NOTE(review): the four-at-a-time loop reads pScratch1 ahead of the taps it
 * actually uses; the required scratch buffer sizes are not visible in this
 * listing - confirm against the API documentation.
 */
84  q7_t * pSrcA, /* first input sequence (inputA) */
85  uint32_t srcALen, /* number of samples in pSrcA */
86  q7_t * pSrcB, /* second input sequence (inputB) */
87  uint32_t srcBLen, /* number of samples in pSrcB */
88  q7_t * pDst, /* output buffer */
89  q15_t * pScratch1, /* q15_t scratch: zero-padded copy of the longer input */
90  q15_t * pScratch2) /* q15_t scratch: copy of the shorter input */
91 {
92  q7_t *pOut = pDst; /* output pointer */
93  q15_t *pScr1 = pScratch1; /* Working pointer into scratch buffer 1 */
94  q15_t *pScr2 = pScratch2; /* Working pointer into scratch buffer 2 */
95  q7_t *pIn1; /* Longer input (inputA after an optional swap) */
96  q7_t *pIn2; /* Shorter input (inputB after an optional swap) */
97  q15_t *py; /* Start of the shorter sequence in scratch buffer 2 */
98  q31_t acc0, acc1, acc2, acc3; /* Accumulators, one per output sample of a block */
99  uint32_t j, k = 0u, blkCnt; /* Loop counters / temporaries */
100  int32_t inc = 1; /* Output pointer increment (+1 or -1) */
101  uint32_t outBlockSize; /* Number of output samples */
102  q15_t x4; /* Temporary holding one widened input sample */
103  uint32_t tapCnt; /* Loop counter over the taps of the shorter sequence */
104  q31_t x1, x2, x3, y1; /* Temporary 32-bit words used by the SIMD loop */
105 
106  /* The algorithm implementation is based on the lengths of the inputs. */
107  /* srcB is always made to slide across srcA. */
108  /* So srcBLen is always considered as shorter than or equal to srcALen. */
109  /* But CORR(x, y) is the reverse of CORR(y, x). */
110  /* So, when srcBLen > srcALen, the output pointer is made to point to the end of the output buffer */
111  /* and the destination pointer modifier, inc, is set to -1. */
112  /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs the same length. */
113  /* But to improve the performance,
114  * we include zeroes in the output instead of zero padding either of the inputs. */
115  /* If srcALen > srcBLen,
116  * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer. */
117  /* If srcALen < srcBLen,
118  * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer. */
119  if(srcALen >= srcBLen)
120  {
121  /* Initialization of inputA pointer */
122  pIn1 = (pSrcA);
123 
124  /* Initialization of inputB pointer */
125  pIn2 = (pSrcB);
126 
127  /* Number of output samples is calculated */
128  outBlockSize = (2u * srcALen) - 1u;
129 
130  /* When srcALen > srcBLen, srcB would need zero padding
131  * to make the lengths equal.
132  * Instead, the first (outBlockSize - (srcALen + srcBLen - 1)) = (srcALen - srcBLen)
133  * output samples are skipped; this code does not store anything to them. */
134  j = outBlockSize - (srcALen + (srcBLen - 1u));
135 
136  /* Move the output pointer past the skipped leading samples */
137  pOut += j;
138 
139  }
140  else
141  {
142  /* Initialization of inputA pointer: the longer sequence is pSrcB here */
143  pIn1 = (pSrcB);
144 
145  /* Initialization of inputB pointer: the shorter sequence is pSrcA here */
146  pIn2 = (pSrcA);
147 
148  /* Swap the lengths so that srcBLen is the shorter one from here on */
149  j = srcBLen;
150  srcBLen = srcALen;
151  srcALen = j;
152 
153  /* CORR(x, y) = Reverse order(CORR(y, x)) */
154  /* Hence set the destination pointer to point to the last output sample */
155  pOut = pDst + ((srcALen + srcBLen) - 2u);
156 
157  /* Destination address modifier is set to -1 */
158  inc = -1;
159 
160  }
161 
162 
163  /* Copy (srcBLen) samples of the shorter sequence into scratch buffer 2 */
164  k = srcBLen >> 2u;
165 
166  /* First part of the processing with loop unrolling copies 4 data points at a time.
167  ** A second loop below copies the remaining 1 to 3 samples. */
168  while(k > 0u)
169  {
170  /* Copy four samples of the shorter sequence, widened to q15_t, in forward order */
171  x4 = (q15_t) * pIn2++;
172  *pScr2++ = x4;
173  x4 = (q15_t) * pIn2++;
174  *pScr2++ = x4;
175  x4 = (q15_t) * pIn2++;
176  *pScr2++ = x4;
177  x4 = (q15_t) * pIn2++;
178  *pScr2++ = x4;
179 
180  /* Decrement the loop counter */
181  k--;
182  }
183 
184  /* If the count is not a multiple of 4, copy remaining samples here.
185  ** No loop unrolling is used. */
186  k = srcBLen % 0x4u;
187 
188  while(k > 0u)
189  {
190  /* Copy one remaining sample of the shorter sequence, widened to q15_t */
191  x4 = (q15_t) * pIn2++;
192  *pScr2++ = x4;
193 
194  /* Decrement the loop counter */
195  k--;
196  }
197 
198  /* Fill (srcBLen - 1u) zeros at the start of scratch buffer 1 */
199  arm_fill_q15(0, pScr1, (srcBLen - 1u));
200 
201  /* Update temporary scratch pointer */
202  pScr1 += (srcBLen - 1u);
203 
204  /* Copy (srcALen) samples of the longer sequence into scratch buffer 1 */
205  k = srcALen >> 2u;
206 
207  /* First part of the processing with loop unrolling copies 4 data points at a time.
208  ** A second loop below copies the remaining 1 to 3 samples. */
209  while(k > 0u)
210  {
211  /* Copy four samples of the longer sequence, widened to q15_t, in forward order */
212  x4 = (q15_t) * pIn1++;
213  *pScr1++ = x4;
214  x4 = (q15_t) * pIn1++;
215  *pScr1++ = x4;
216  x4 = (q15_t) * pIn1++;
217  *pScr1++ = x4;
218  x4 = (q15_t) * pIn1++;
219  *pScr1++ = x4;
220 
221  /* Decrement the loop counter */
222  k--;
223  }
224 
225  /* If the count is not a multiple of 4, copy remaining samples here.
226  ** No loop unrolling is used. */
227  k = srcALen % 0x4u;
228 
229  while(k > 0u)
230  {
231  /* Copy one remaining sample of the longer sequence, widened to q15_t */
232  x4 = (q15_t) * pIn1++;
233  *pScr1++ = x4;
234 
235  /* Decrement the loop counter */
236  k--;
237  }
238 
239 #ifndef UNALIGNED_SUPPORT_DISABLE
240 
241  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
242  arm_fill_q15(0, pScr1, (srcBLen - 1u));
243 
244  /* Update pointer */
245  pScr1 += (srcBLen - 1u);
246 
247 #else
248 
249 /* Same trailing zero fill as above, done with explicit stores, 4 at a time. */
250  k = (srcBLen - 1u) >> 2u;
251 
252  /* First part of the processing with loop unrolling stores 4 zeros at a time.
253  ** A second loop below stores the remaining 1 to 3 zeros. */
254  while(k > 0u)
255  {
256  /* Store four zeros */
257  *pScr1++ = 0;
258  *pScr1++ = 0;
259  *pScr1++ = 0;
260  *pScr1++ = 0;
261 
262  /* Decrement the loop counter */
263  k--;
264  }
265 
266  /* If the count is not a multiple of 4, store the remaining zeros here.
267  ** No loop unrolling is used. */
268  k = (srcBLen - 1u) % 0x4u;
269 
270  while(k > 0u)
271  {
272  /* Store one remaining zero */
273  *pScr1++ = 0;
274 
275  /* Decrement the loop counter */
276  k--;
277  }
278 
279 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
280 
281  /* Remember the start of the shorter sequence so pScr2 can be rewound per output */
282  py = pScratch2;
283 
284  /* Initialization of pScr2 pointer */
285  pScr2 = pScratch2;
286 
287  /* Actual correlation process starts here: four output samples per iteration */
288  blkCnt = (srcALen + srcBLen - 1u) >> 2;
289 
290  while(blkCnt > 0)
291  {
292  /* Initialize temporary scratch pointer as scratch1 */
293  pScr1 = pScratch1;
294 
295  /* Clear Accumulators */
296  acc0 = 0;
297  acc1 = 0;
298  acc2 = 0;
299  acc3 = 0;
300 
301  /* Read two samples from scratch1 buffer */
302  x1 = *__SIMD32(pScr1)++;
303 
304  /* Read next two samples from scratch1 buffer */
305  x2 = *__SIMD32(pScr1)++;
306 
307  tapCnt = (srcBLen) >> 2u;
308 
309  while(tapCnt > 0u)
310  {
311 
312  /* Read the next 32-bit word (presumably two packed q15 samples) from the shorter-sequence buffer */
313  y1 = _SIMD32_OFFSET(pScr2);
314 
315  /* multiply and accumulate */
316  acc0 = __SMLAD(x1, y1, acc0);
317  acc2 = __SMLAD(x2, y1, acc2);
318 
319  /* pack input data */
320 #ifndef ARM_MATH_BIG_ENDIAN
321  x3 = __PKHBT(x2, x1, 0);
322 #else
323  x3 = __PKHBT(x1, x2, 0);
324 #endif
325 
326  /* multiply and accumulate */
327  acc1 = __SMLADX(x3, y1, acc1);
328 
329  /* Read next two samples from scratch1 buffer */
330  x1 = *__SIMD32(pScr1)++;
331 
332  /* pack input data */
333 #ifndef ARM_MATH_BIG_ENDIAN
334  x3 = __PKHBT(x1, x2, 0);
335 #else
336  x3 = __PKHBT(x2, x1, 0);
337 #endif
338 
339  acc3 = __SMLADX(x3, y1, acc3);
340 
341  /* Read the following 32-bit word from the shorter-sequence buffer (two q15_t elements further on) */
342  y1 = _SIMD32_OFFSET(pScr2 + 2u);
343 
344  acc0 = __SMLAD(x2, y1, acc0);
345 
346  acc2 = __SMLAD(x1, y1, acc2);
347 
348  acc1 = __SMLADX(x3, y1, acc1);
349 
350  x2 = *__SIMD32(pScr1)++;
351 
352 #ifndef ARM_MATH_BIG_ENDIAN
353  x3 = __PKHBT(x2, x1, 0);
354 #else
355  x3 = __PKHBT(x1, x2, 0);
356 #endif
357 
358  acc3 = __SMLADX(x3, y1, acc3);
359 
360  pScr2 += 4u;
361 
362 
363  /* Decrement the loop counter */
364  tapCnt--;
365  }
366 
367 
368 
369  /* x1 and x2 were read ahead of the taps; step back over those four samples */
370  pScr1 -= 4u;
371 
372 
373  /* Handle the remaining (srcBLen mod 4) taps of the shorter sequence one at a time */
374  tapCnt = (srcBLen) & 3u;
375 
376  while(tapCnt > 0u)
377  {
378 
379  /* accumulate the results: one tap applied to four consecutive scratch1 samples */
380  acc0 += (*pScr1++ * *pScr2);
381  acc1 += (*pScr1++ * *pScr2);
382  acc2 += (*pScr1++ * *pScr2);
383  acc3 += (*pScr1++ * *pScr2++);
384 
385  pScr1 -= 3u; /* net advance of one sample per tap */
386 
387  /* Decrement the loop counter */
388  tapCnt--;
389  }
390 
391  blkCnt--;
392 
393  /* Shift each accumulator right by 7, saturate to 8 bits and store; pOut moves by inc */
394  *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
395  pOut += inc;
396  *pOut = (q7_t) (__SSAT(acc1 >> 7u, 8));
397  pOut += inc;
398  *pOut = (q7_t) (__SSAT(acc2 >> 7u, 8));
399  pOut += inc;
400  *pOut = (q7_t) (__SSAT(acc3 >> 7u, 8));
401  pOut += inc;
402 
403  /* Rewind to the start of the shorter sequence */
404  pScr2 = py;
405 
406  pScratch1 += 4u; /* slide the window start past the four outputs just produced */
407 
408  }
409 
410 
411  blkCnt = (srcALen + srcBLen - 1u) & 0x3;
412 
413  /* Calculate the remaining (up to three) output samples one at a time */
414  while(blkCnt > 0)
415  {
416  /* Initialize temporary scratch pointer as scratch1 */
417  pScr1 = pScratch1;
418 
419  /* Clear Accumulator */
420  acc0 = 0;
421 
422  tapCnt = (srcBLen) >> 1u;
423 
424  while(tapCnt > 0u)
425  {
426  acc0 += (*pScr1++ * *pScr2++);
427  acc0 += (*pScr1++ * *pScr2++);
428 
429  /* Decrement the loop counter */
430  tapCnt--;
431  }
432 
433  tapCnt = (srcBLen) & 1u;
434 
435  /* Handle the last tap when srcBLen is odd */
436  while(tapCnt > 0u)
437  {
438 
439  /* accumulate the results */
440  acc0 += (*pScr1++ * *pScr2++);
441 
442  /* Decrement the loop counter */
443  tapCnt--;
444  }
445 
446  blkCnt--;
447 
448  /* Shift the accumulator right by 7, saturate to 8 bits and store */
449  *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
450 
451  pOut += inc;
452 
453  /* Rewind to the start of the shorter sequence */
454  pScr2 = py;
455 
456  pScratch1 += 1u; /* slide the window start by one sample for the next output */
457 
458  }
459 
460 }
461 
int8_t q7_t
8-bit fractional data type in 1.7 format.
Definition: arm_math.h:387
void arm_correlate_opt_q7(q7_t *pSrcA, uint32_t srcALen, q7_t *pSrcB, uint32_t srcBLen, q7_t *pDst, q15_t *pScratch1, q15_t *pScratch2)
Correlation of Q7 sequences.
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_fill_q15(q15_t value, q15_t *pDst, uint32_t blockSize)
Fills a constant value into a Q15 vector.
Definition: arm_fill_q15.c:61