STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_mat_mult_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_mat_mult_q15.c
9 *
10 * Description: Q15 matrix multiplication.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
80  const arm_matrix_instance_q15 * pSrcA,
81  const arm_matrix_instance_q15 * pSrcB,
83  q15_t * pState CMSIS_UNUSED)
84 {
85  q63_t sum; /* accumulator */
86 
87 #ifndef ARM_MATH_CM0_FAMILY
88 
89  /* Run the below code for Cortex-M4 and Cortex-M3 */
90 
91  q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
92  q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
93  q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
94  q15_t *px; /* Temporary output data matrix pointer */
95  uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
96  uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
97  uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
98  uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */
99  uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */
100  arm_status status; /* status of matrix multiplication */
101 
102 #ifndef UNALIGNED_SUPPORT_DISABLE
103 
104  q31_t in; /* Temporary variable to hold the input value */
105  q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
106 
107 #else
108 
109  q15_t in; /* Temporary variable to hold the input value */
110  q15_t inA1, inB1, inA2, inB2;
111 
112 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
113 
114 #ifdef ARM_MATH_MATRIX_CHECK
115  /* Check for matrix mismatch condition */
116  if((pSrcA->numCols != pSrcB->numRows) ||
117  (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
118  {
119  /* Set status as ARM_MATH_SIZE_MISMATCH */
120  status = ARM_MATH_SIZE_MISMATCH;
121  }
122  else
123 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
124  {
125  /* Matrix transpose */
126  do
127  {
128  /* Apply loop unrolling and exchange the columns with row elements */
129  col = numColsB >> 2;
130 
131  /* The pointer px is set to starting address of the column being processed */
132  px = pSrcBT + i;
133 
134  /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
135  ** a second loop below computes the remaining 1 to 3 samples. */
136  while(col > 0u)
137  {
138 #ifndef UNALIGNED_SUPPORT_DISABLE
139 
140  /* Read two elements from the row */
141  in = *__SIMD32(pInB)++;
142 
143  /* Unpack and store one element in the destination */
144 #ifndef ARM_MATH_BIG_ENDIAN
145 
146  *px = (q15_t) in;
147 
148 #else
149 
150  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
151 
152 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
153 
154  /* Update the pointer px to point to the next row of the transposed matrix */
155  px += numRowsB;
156 
157  /* Unpack and store the second element in the destination */
158 #ifndef ARM_MATH_BIG_ENDIAN
159 
160  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
161 
162 #else
163 
164  *px = (q15_t) in;
165 
166 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
167 
168  /* Update the pointer px to point to the next row of the transposed matrix */
169  px += numRowsB;
170 
171  /* Read two elements from the row */
172  in = *__SIMD32(pInB)++;
173 
174  /* Unpack and store one element in the destination */
175 #ifndef ARM_MATH_BIG_ENDIAN
176 
177  *px = (q15_t) in;
178 
179 #else
180 
181  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
182 
183 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
184 
185  /* Update the pointer px to point to the next row of the transposed matrix */
186  px += numRowsB;
187 
188  /* Unpack and store the second element in the destination */
189 
190 #ifndef ARM_MATH_BIG_ENDIAN
191 
192  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
193 
194 #else
195 
196  *px = (q15_t) in;
197 
198 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
199 
200  /* Update the pointer px to point to the next row of the transposed matrix */
201  px += numRowsB;
202 
203 #else
204 
205  /* Read one element from the row */
206  in = *pInB++;
207 
208  /* Store one element in the destination */
209  *px = in;
210 
211  /* Update the pointer px to point to the next row of the transposed matrix */
212  px += numRowsB;
213 
214  /* Read one element from the row */
215  in = *pInB++;
216 
217  /* Store one element in the destination */
218  *px = in;
219 
220  /* Update the pointer px to point to the next row of the transposed matrix */
221  px += numRowsB;
222 
223  /* Read one element from the row */
224  in = *pInB++;
225 
226  /* Store one element in the destination */
227  *px = in;
228 
229  /* Update the pointer px to point to the next row of the transposed matrix */
230  px += numRowsB;
231 
232  /* Read one element from the row */
233  in = *pInB++;
234 
235  /* Store one element in the destination */
236  *px = in;
237 
238  /* Update the pointer px to point to the next row of the transposed matrix */
239  px += numRowsB;
240 
241 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
242 
243  /* Decrement the column loop counter */
244  col--;
245  }
246 
247  /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining elements here.
248  ** No loop unrolling is used. */
249  col = numColsB % 0x4u;
250 
251  while(col > 0u)
252  {
253  /* Read and store the input element in the destination */
254  *px = *pInB++;
255 
256  /* Update the pointer px to point to the next row of the transposed matrix */
257  px += numRowsB;
258 
259  /* Decrement the column loop counter */
260  col--;
261  }
262 
263  i++;
264 
265  /* Decrement the row loop counter */
266  row--;
267 
268  } while(row > 0u);
269 
270  /* Reset the variables for the usage in the following multiplication process */
271  row = numRowsA;
272  i = 0u;
273  px = pDst->pData;
274 
275  /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
276  /* row loop */
277  do
278  {
279  /* For every row wise process, the column loop counter is to be initiated */
280  col = numColsB;
281 
282  /* For every row wise process, the pInB pointer is set
283  ** to the starting address of the transposed pSrcB data */
284  pInB = pSrcBT;
285 
286  /* column loop */
287  do
288  {
289  /* Set the variable sum, that acts as accumulator, to zero */
290  sum = 0;
291 
292  /* Apply loop unrolling: each iteration of the loop below performs 4 multiply-accumulates. */
293  colCnt = numColsA >> 2;
294 
295  /* Set the pointer pInA to the starting address of the row of pSrcA being processed */
296  pInA = pSrcA->pData + i;
297 
298 
299  /* matrix multiplication */
300  while(colCnt > 0u)
301  {
302  /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
303 #ifndef UNALIGNED_SUPPORT_DISABLE
304 
305  /* Read four consecutive Q15 values from each of pInA and pInB, two values per 32-bit read */
306  pSourceA1 = *__SIMD32(pInA)++;
307  pSourceB1 = *__SIMD32(pInB)++;
308 
309  pSourceA2 = *__SIMD32(pInA)++;
310  pSourceB2 = *__SIMD32(pInB)++;
311 
312  /* Multiply and accumulate */
313  sum = __SMLALD(pSourceA1, pSourceB1, sum);
314  sum = __SMLALD(pSourceA2, pSourceB2, sum);
315 
316 #else
317  /* Read Q15 values one at a time from the pInA and pInB buffers */
318  inA1 = *pInA++;
319  inB1 = *pInB++;
320  inA2 = *pInA++;
321  /* Multiply and accumulate */
322  sum += inA1 * inB1;
323  inB2 = *pInB++;
324 
325  inA1 = *pInA++;
326  inB1 = *pInB++;
327  /* Multiply and accumulate */
328  sum += inA2 * inB2;
329  inA2 = *pInA++;
330  inB2 = *pInB++;
331 
332  /* Multiply and accumulate */
333  sum += inA1 * inB1;
334  sum += inA2 * inB2;
335 
336 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
337 
338  /* Decrement the loop counter */
339  colCnt--;
340  }
341 
342  /* process remaining column samples */
343  colCnt = numColsA & 3u;
344 
345  while(colCnt > 0u)
346  {
347  /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
348  sum += *pInA++ * *pInB++;
349 
350  /* Decrement the loop counter */
351  colCnt--;
352  }
353 
354  /* Saturate and store the result in the destination buffer */
355  *px = (q15_t) (__SSAT((sum >> 15), 16));
356  px++;
357 
358  /* Decrement the column loop counter */
359  col--;
360 
361  } while(col > 0u);
362 
363  i = i + numColsA;
364 
365  /* Decrement the row loop counter */
366  row--;
367 
368  } while(row > 0u);
369 
370 #else
371 
372  /* Run the below code for Cortex-M0 */
373 
374  q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
375  q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
376  q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
377  q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
378  q15_t *pOut = pDst->pData; /* output data matrix pointer */
379  q15_t *px; /* Temporary output data matrix pointer */
380  uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
381  uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
382  uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
383  uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */
384  arm_status status; /* status of matrix multiplication */
385 
386 #ifdef ARM_MATH_MATRIX_CHECK
387 
388  /* Check for matrix mismatch condition */
389  if((pSrcA->numCols != pSrcB->numRows) ||
390  (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
391  {
392  /* Set status as ARM_MATH_SIZE_MISMATCH */
393  status = ARM_MATH_SIZE_MISMATCH;
394  }
395  else
396 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
397 
398  {
399  /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
400  /* row loop */
401  do
402  {
403  /* Output pointer is set to starting address of the row being processed */
404  px = pOut + i;
405 
406  /* For every row wise process, the column loop counter is to be initiated */
407  col = numColsB;
408 
409  /* For every row wise process, the pIn2 pointer is set
410  ** to the starting address of the pSrcB data */
411  pIn2 = pSrcB->pData;
412 
413  /* column loop */
414  do
415  {
416  /* Set the variable sum, that acts as accumulator, to zero */
417  sum = 0;
418 
419  /* Set the pointer pIn1 to the starting address of the current row of pSrcA */
420  pIn1 = pInA;
421 
422  /* Matrix A columns number of MAC operations are to be performed */
423  colCnt = numColsA;
424 
425  /* matrix multiplication */
426  while(colCnt > 0u)
427  {
428  /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
429  /* Perform the multiply-accumulates */
430  sum += (q31_t) * pIn1++ * *pIn2;
431  pIn2 += numColsB;
432 
433  /* Decrement the loop counter */
434  colCnt--;
435  }
436 
437  /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
438  /* Saturate and store the result in the destination buffer */
439  *px++ = (q15_t) __SSAT((sum >> 15), 16);
440 
441  /* Decrement the column loop counter */
442  col--;
443 
444  /* Update the pointer pIn2 to point to the starting address of the next column */
445  pIn2 = pInB + (numColsB - col);
446 
447  } while(col > 0u);
448 
449  /* Advance the output offset i and the pointer pInA to the start of the next row */
450  i = i + numColsB;
451  pInA = pInA + numColsA;
452 
453  /* Decrement the row loop counter */
454  row--;
455 
456  } while(row > 0u);
457 
458 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
459  /* set status as ARM_MATH_SUCCESS */
460  status = ARM_MATH_SUCCESS;
461  }
462 
463  /* Return to application */
464  return (status);
465 }
466 
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
arm_status arm_mat_mult_q15(const arm_matrix_instance_q15 *pSrcA, const arm_matrix_instance_q15 *pSrcB, arm_matrix_instance_q15 *pDst, q15_t *pState CMSIS_UNUSED)
Q15 matrix multiplication.
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
Instance structure for the Q15 matrix.
Definition: arm_math.h:1390
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
arm_status
Error status returned by some functions in the library.
Definition: arm_math.h:373