STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_correlate_opt_q15.c
/* ----------------------------------------------------------------------
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
 *
 * $Date:        19. March 2015
 * $Revision:    V.1.4.5
 *
 * Project:      CMSIS DSP Library
 * Title:        arm_correlate_opt_q15.c
 *
 * Description:  Correlation of Q15 sequences.
 *
 * Target Processor: Cortex-M4/Cortex-M3
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */

#include "arm_math.h"

/**
 * @brief Correlation of Q15 sequences.
 */
void arm_correlate_opt_q15(
  q15_t * pSrcA,
  uint32_t srcALen,
  q15_t * pSrcB,
  uint32_t srcBLen,
  q15_t * pDst,
  q15_t * pScratch)
{
  q15_t *pIn1;                                   /* inputA pointer */
  q15_t *pIn2;                                   /* inputB pointer */
  q63_t acc0, acc1, acc2, acc3;                  /* Accumulators */
  q15_t *py;                                     /* Intermediate inputB pointer */
  q31_t x1, x2, x3;                              /* temporary variables for holding input1 and input2 values */
  uint32_t j, blkCnt, outBlockSize;              /* loop counters and output length */
  int32_t inc = 1;                               /* output pointer increment */
  uint32_t tapCnt;
  q31_t y1, y2;
  q15_t *pScr;                                   /* Intermediate pointers */
  q15_t *pOut = pDst;                            /* output pointer */
#ifdef UNALIGNED_SUPPORT_DISABLE

  q15_t a, b;

#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */

  /* The algorithm implementation is based on the lengths of the inputs. */
  /* srcB is always made to slide across srcA, so srcBLen is always treated as the
   * shorter (or equal) length. */
  /* Since CORR(x, y) is the reverse of CORR(y, x), when srcBLen > srcALen the output
   * pointer is made to point to the end of the output buffer and the destination
   * pointer modifier, inc, is set to -1. */
  /* If srcALen > srcBLen, srcB would have to be zero padded to make the two inputs
   * the same length. To improve performance, the zeros are instead accounted for in
   * the output rather than zero padding either of the inputs:
   *  - if srcALen > srcBLen, (srcALen - srcBLen) zeros belong at the start of the output buffer;
   *  - if srcALen < srcBLen, (srcBLen - srcALen) zeros belong at the end of the output buffer. */
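  /* Worked example of the output placement computed below (illustrative lengths,
   * not taken from the original source): with srcALen = 8 and srcBLen = 5,
   * outBlockSize = 2*8 - 1 = 15 and j = 15 - (8 + 5 - 1) = 3, so pOut is advanced
   * past the first 3 output positions and the srcALen + srcBLen - 1 = 12 computed
   * correlation samples are written after them. */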
  if(srcALen >= srcBLen)
  {
    /* Initialization of inputA pointer */
    pIn1 = (pSrcA);

    /* Initialization of inputB pointer */
    pIn2 = (pSrcB);

    /* Number of output samples is calculated */
    outBlockSize = (2u * srcALen) - 1u;

    /* When srcALen > srcBLen, zero padding of srcB would make the lengths equal.
     * Instead, (outBlockSize - (srcALen + srcBLen - 1)) output samples are made zero. */
    j = outBlockSize - (srcALen + (srcBLen - 1u));

    /* Update the output pointer to point to the first non-zero output sample */
    pOut += j;
  }
  else
  {
    /* Initialization of inputA pointer */
    pIn1 = (pSrcB);

    /* Initialization of inputB pointer */
    pIn2 = (pSrcA);

    /* srcBLen is always considered as shorter or equal to srcALen,
     * so swap the lengths when srcALen < srcBLen */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;

    /* CORR(x, y) = Reverse order(CORR(y, x)) */
    /* Hence set the destination pointer to point to the last output sample */
    pOut = pDst + ((srcALen + srcBLen) - 2u);

    /* Destination address modifier is set to -1 */
    inc = -1;
  }

  pScr = pScratch;

  /* Fill (srcBLen - 1u) zeros in scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update temporary scratch pointer */
  pScr += (srcBLen - 1u);

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Copy (srcALen) samples in scratch buffer */
  arm_copy_q15(pIn1, pScr, srcALen);

  /* Update pointers */
  //pIn1 += srcALen;
  pScr += srcALen;

#else

  /* Apply loop unrolling and do 4 copies simultaneously. */
  j = srcALen >> 2u;

  /* First part of the processing with loop unrolling copies 4 data points at a time.
  ** a second loop below copies the remaining 1 to 3 samples. */
  while(j > 0u)
  {
    /* copy the longer input sequence into the scratch buffer */
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, copy remaining samples here.
  ** No loop unrolling is used. */
  j = srcALen % 0x4u;

  while(j > 0u)
  {
    /* copy the remaining samples of the longer input sequence */
    *pScr++ = *pIn1++;

    /* Decrement the loop counter */
    j--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

#ifndef UNALIGNED_SUPPORT_DISABLE

  /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
  arm_fill_q15(0, pScr, (srcBLen - 1u));

  /* Update pointer */
  pScr += (srcBLen - 1u);

#else

  /* Apply loop unrolling and do 4 zero stores simultaneously. */
  j = (srcBLen - 1u) >> 2u;

  /* First part of the processing with loop unrolling stores 4 zeros at a time.
  ** a second loop below stores the remaining 1 to 3 zeros. */
  while(j > 0u)
  {
    /* fill zeros at the end of the scratch buffer */
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

  /* If the count is not a multiple of 4, store the remaining zeros here.
  ** No loop unrolling is used. */
  j = (srcBLen - 1u) % 0x4u;

  while(j > 0u)
  {
    /* fill the remaining zeros at the end of the scratch buffer */
    *pScr++ = 0;

    /* Decrement the loop counter */
    j--;
  }

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
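  /* At this point the scratch buffer holds, in order (a summary of the fills and
   * copies above):
   *   (srcBLen - 1) zeros | srcALen samples of the longer sequence | (srcBLen - 1) zeros
   * i.e. max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2 q15 values in total.
   * Sliding the shorter sequence along this padded copy produces all
   * srcALen + srcBLen - 1 correlation outputs without separate edge handling. */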

  /* py saves the inputB pointer so that it can be restored at the start of each output block */
  py = pIn2;

  /* Actual correlation process starts here: the output is computed four samples at a time */
  blkCnt = (srcALen + srcBLen - 1u) >> 2;

  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr = pScratch;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

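    /* Summary of the block processing below: acc0..acc3 gather four consecutive
     * output samples per pass of the outer loop. x1/x2 hold packed sample pairs
     * from the scratch buffer and y1/y2 packed pairs of the shorter sequence;
     * __SMLALD performs the dual 16-bit multiply-accumulates for the even output
     * offsets and __SMLALDX the halfword-exchanged ones needed for the odd offsets. */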
    /* Read two packed samples from scratch1 buffer */
    x1 = *__SIMD32(pScr)++;

    /* Read the next two packed samples from scratch1 buffer */
    x2 = *__SIMD32(pScr)++;

    tapCnt = (srcBLen) >> 2u;

    while(tapCnt > 0u)
    {

#ifndef UNALIGNED_SUPPORT_DISABLE

      /* Read four samples from smaller buffer */
      y1 = _SIMD32_OFFSET(pIn2);
      y2 = _SIMD32_OFFSET(pIn2 + 2u);

      acc0 = __SMLALD(x1, y1, acc0);

      acc2 = __SMLALD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLALDX(x3, y1, acc1);

      x1 = _SIMD32_OFFSET(pScr);

      acc0 = __SMLALD(x2, y2, acc0);

      acc2 = __SMLALD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);

      acc1 = __SMLALDX(x3, y2, acc1);

      x2 = _SIMD32_OFFSET(pScr + 2u);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);

#else

      /* Read four samples from smaller buffer */
      a = *pIn2;
      b = *(pIn2 + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      y1 = __PKHBT(a, b, 16);
#else
      y1 = __PKHBT(b, a, 16);
#endif

      a = *(pIn2 + 2);
      b = *(pIn2 + 3);
#ifndef ARM_MATH_BIG_ENDIAN
      y2 = __PKHBT(a, b, 16);
#else
      y2 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x1, y1, acc0);

      acc2 = __SMLALD(x2, y1, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc1 = __SMLALDX(x3, y1, acc1);

      a = *pScr;
      b = *(pScr + 1);

#ifndef ARM_MATH_BIG_ENDIAN
      x1 = __PKHBT(a, b, 16);
#else
      x1 = __PKHBT(b, a, 16);
#endif

      acc0 = __SMLALD(x2, y2, acc0);

      acc2 = __SMLALD(x1, y2, acc2);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLALDX(x3, y1, acc3);

      acc1 = __SMLALDX(x3, y2, acc1);

      a = *(pScr + 2);
      b = *(pScr + 3);

#ifndef ARM_MATH_BIG_ENDIAN
      x2 = __PKHBT(a, b, 16);
#else
      x2 = __PKHBT(b, a, 16);
#endif

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLALDX(x3, y2, acc3);

#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */

      pIn2 += 4u;

      pScr += 4u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    /* Update scratch pointer for remaining samples of smaller length sequence */
    pScr -= 4u;

    /* Apply the same operation for the remaining samples of the smaller length sequence */
    tapCnt = (srcBLen) & 3u;

    while(tapCnt > 0u)
    {

      /* accumulate the results */
      acc0 += (*pScr++ * *pIn2);
      acc1 += (*pScr++ * *pIn2);
      acc2 += (*pScr++ * *pIn2);
      acc3 += (*pScr++ * *pIn2++);

      pScr -= 3u;

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Store the results from the accumulators into the destination buffer. */
    *pOut = (__SSAT(acc0 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc1 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc2 >> 15u, 16));
    pOut += inc;
    *pOut = (__SSAT(acc3 >> 15u, 16));
    pOut += inc;

    /* Reset the inputB pointer for the next block */
    pIn2 = py;

    pScratch += 4u;
  }

  blkCnt = (srcALen + srcBLen - 1u) & 0x3;

  /* Calculate correlation for the remaining output samples (1 to 3) */
  while(blkCnt > 0)
  {
    /* Initialize temporary scratch pointer as scratch1 */
    pScr = pScratch;

    /* Clear accumulator */
    acc0 = 0;

    tapCnt = (srcBLen) >> 1u;

    while(tapCnt > 0u)
    {

      acc0 += (*pScr++ * *pIn2++);
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    tapCnt = (srcBLen) & 1u;

    /* Apply the same operation for the remaining samples of the smaller length sequence */
    while(tapCnt > 0u)
    {

      /* accumulate the results */
      acc0 += (*pScr++ * *pIn2++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Store the result from the accumulator into the destination buffer. */
    *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));

    pOut += inc;

    /* Reset the inputB pointer for the next output sample */
    pIn2 = py;

    pScratch += 1u;
  }

}

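A minimal usage sketch (not part of the original file): the buffer sizes follow from the listing above, where the scratch buffer receives (srcBLen - 1) zeros, srcALen samples and another (srcBLen - 1) zeros, and the output spans 2 * max(srcALen, srcBLen) - 1 samples. The names A_LEN, B_LEN and correlate_example are illustrative, not part of CMSIS.

#include "arm_math.h"

#define A_LEN 64u   /* longer sequence length (example value)  */
#define B_LEN 16u   /* shorter sequence length (example value) */

static q15_t srcA[A_LEN];                        /* first input sequence                  */
static q15_t srcB[B_LEN];                        /* second input sequence                 */
static q15_t dst[(2u * A_LEN) - 1u];             /* 2 * max(srcALen, srcBLen) - 1 outputs */
static q15_t scratch[A_LEN + (2u * B_LEN) - 2u]; /* max + 2 * min - 2 scratch values      */

void correlate_example(void)
{
  /* srcA and srcB are assumed to be filled with Q15 data elsewhere. */
  arm_correlate_opt_q15(srcA, A_LEN, srcB, B_LEN, dst, scratch);
}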
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
void arm_copy_q15(q15_t *pSrc, q15_t *pDst, uint32_t blockSize)
Copies the elements of a Q15 vector.
Definition: arm_copy_q15.c:60
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_correlate_opt_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, q15_t *pScratch)
Correlation of Q15 sequences.
void arm_fill_q15(q15_t value, q15_t *pDst, uint32_t blockSize)
Fills a constant value into a Q15 vector.
Definition: arm_fill_q15.c:61