arm_math.h

/* ----------------------------------------------------------------------   
 * Copyright (C) 2010-2011 ARM Limited. All rights reserved.   
 *   
 * $Date:        15. February 2012  
 * $Revision:   V1.1.0  
 *   
 * Project:       CMSIS DSP Library   
 * Title:       arm_math.h
 *   
 * Description:  Public header file for CMSIS DSP Library
 *   
 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
 *  
 * Version 1.1.0 2012/02/15 
 *    Updated with more optimizations, bug fixes and minor API changes.  
 *  
 * Version 1.0.10 2011/7/15 
 *    Big Endian support added and Merged M0 and M3/M4 Source code.  
 *   
 * Version 1.0.3 2010/11/29  
 *    Re-organized the CMSIS folders and updated documentation.   
 *    
 * Version 1.0.2 2010/11/11   
 *    Documentation updated.    
 *   
 * Version 1.0.1 2010/10/05    
 *    Production release and review comments incorporated.   
 *   
 * Version 1.0.0 2010/09/20    
 *    Production release and review comments incorporated.   
 * -------------------------------------------------------------------- */

/**
   \mainpage CMSIS DSP Software Library
   *
   * <b>Introduction</b>
   *
   * This user manual describes the CMSIS DSP software library, 
   * a suite of common signal processing functions for use on Cortex-M processor based devices.
   *
   * The library is divided into a number of functions each covering a specific category:  
   * - Basic math functions
   * - Fast math functions
   * - Complex math functions
   * - Filters
   * - Matrix functions
   * - Transforms
   * - Motor control functions
   * - Statistical functions
   * - Support functions
   * - Interpolation functions
   *
   * The library has separate functions for operating on 8-bit integers, 16-bit integers,
   * 32-bit integers and 32-bit floating-point values. 
   *
   * <b>Pre-processor Macros</b> 
   * 
   * Each library project has different pre-processor macros. 
   * 
   * - UNALIGNED_SUPPORT_DISABLE: 
   * 
   * Define macro UNALIGNED_SUPPORT_DISABLE if the silicon does not support unaligned memory access.
   * 
   * - ARM_MATH_BIG_ENDIAN: 
   * 
   * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default library builds for little endian targets. 
   * 
   * - ARM_MATH_MATRIX_CHECK: 
   * 
   * Define macro ARM_MATH_MATRIX_CHECK for checking on the input and output sizes of matrices 
   * 
   * - ARM_MATH_ROUNDING: 
   * 
   * Define macro ARM_MATH_ROUNDING for rounding on support functions
   *
   * - ARM_MATH_CMx:
   *
   * Define macro ARM_MATH_CM4 for building the library on Cortex-M4 target, ARM_MATH_CM3 for building library on Cortex-M3 target
   * and ARM_MATH_CM0 for building library on cortex-M0 target.
   * 
   * - __FPU_PRESENT:
   *
   * Set macro __FPU_PRESENT = 1 when building for targets that have an FPU. Enable this macro for the M4bf and M4lf libraries.
   *
   * <b>Toolchain Support</b>
   *
   * The library has been developed and tested with MDK-ARM version 4.23. 
   * The library is being tested in GCC and IAR toolchains and updates on this activity will be made available shortly.
   *
   * <b>Using the Library</b>
   *
   * The library installer contains prebuilt versions of the libraries in the <code>Lib</code> folder.
   * - arm_cortexM4lf_math.lib (Little endian and Floating Point Unit on Cortex-M4)
   * - arm_cortexM4bf_math.lib (Big endian and Floating Point Unit on Cortex-M4)
   * - arm_cortexM4l_math.lib (Little endian on Cortex-M4)
   * - arm_cortexM4b_math.lib (Big endian on Cortex-M4)
   * - arm_cortexM3l_math.lib (Little endian on Cortex-M3)
   * - arm_cortexM3b_math.lib (Big endian on Cortex-M3)
   * - arm_cortexM0l_math.lib (Little endian on Cortex-M0)
   * - arm_cortexM0b_math.lib (Big endian on Cortex-M0)
   *
   * The library functions are declared in the public file <code>arm_math.h</code> which is placed in the <code>Include</code> folder.
   * Simply include this file and link the appropriate library in the application and begin calling the library functions. The library provides a single 
   * public header file <code> arm_math.h</code> for Cortex-M4/M3/M0 in both little endian and big endian configurations. The same header file is used for the floating point unit (FPU) variants. 
   * Define the appropriate pre-processor macro ARM_MATH_CM4, ARM_MATH_CM3 or 
   * ARM_MATH_CM0 depending on the target processor in the application.
   *
   * <b>Examples</b>
   *
   * The library ships with a number of examples which demonstrate how to use the library functions.
   *
   * <b>Building the Library</b>
   *
   * The library installer contains project files to re build libraries on MDK Tool chain in the <code>CMSIS\\DSP_Lib\\Source\\ARM</code> folder.
   * - arm_cortexM0b_math.uvproj
   * - arm_cortexM0l_math.uvproj
   * - arm_cortexM3b_math.uvproj
   * - arm_cortexM3l_math.uvproj  
   * - arm_cortexM4b_math.uvproj
   * - arm_cortexM4l_math.uvproj
   * - arm_cortexM4bf_math.uvproj
   * - arm_cortexM4lf_math.uvproj
   *
   *
   * The project can be built by opening the appropriate project in the MDK-ARM 4.23 tool chain and defining the optional pre-processor macros detailed above.
   *
   * <b>Copyright Notice</b>
   *
   * Copyright (C) 2010 ARM Limited. All rights reserved.
   */


/**
 * @defgroup groupMath Basic Math Functions
 */

/**
 * @defgroup groupFastMath Fast Math Functions
 * This set of functions provides a fast approximation to sine, cosine, and square root.
 * As compared to most of the other functions in the CMSIS math library, the fast math functions
 * operate on individual values and not arrays.
 * There are separate functions for Q15, Q31, and floating-point data.
 *
 */

/**
 * @defgroup groupCmplxMath Complex Math Functions
 * This set of functions operates on complex data vectors.
 * The data in the complex arrays is stored in an interleaved fashion
 * (real, imag, real, imag, ...).
 * In the API functions, the number of samples in a complex array refers
 * to the number of complex values; the array contains twice this number of
 * real values.
 */

/**
 * @defgroup groupFilters Filtering Functions
 */

/**
 * @defgroup groupMatrix Matrix Functions
 *
 * This set of functions provides basic matrix math operations.
 * The functions operate on matrix data structures.  For example,
 * the type
 * definition for the floating-point matrix structure is shown
 * below:
 * <pre>
 *     typedef struct
 *     {
 *       uint16_t numRows;     // number of rows of the matrix.
 *       uint16_t numCols;     // number of columns of the matrix.
 *       float32_t *pData;     // points to the data of the matrix.
 *     } arm_matrix_instance_f32;
 * </pre>
 * There are similar definitions for Q15 and Q31 data types.
 *
 * The structure specifies the size of the matrix and then points to
 * an array of data.  The array is of size <code>numRows X numCols</code>
 * and the values are arranged in row order.  That is, the
 * matrix element (i, j) is stored at:
 * <pre>
 *     pData[i*numCols + j]
 * </pre>
 *
 * \par Init Functions
 * There is an associated initialization function for each type of matrix
 * data structure.
 * The initialization function sets the values of the internal structure fields.
 * Refer to the function <code>arm_mat_init_f32()</code>, <code>arm_mat_init_q31()</code>
 * and <code>arm_mat_init_q15()</code> for floating-point, Q31 and Q15 types,  respectively.
 *
 * \par
 * Use of the initialization function is optional. However, if initialization function is used
 * then the instance structure cannot be placed into a const data section.
 * To place the instance structure in a const data
 * section, manually initialize the data structure.  For example:
 * <pre>
 * <code>arm_matrix_instance_f32 S = {nRows, nColumns, pData};</code>
 * <code>arm_matrix_instance_q31 S = {nRows, nColumns, pData};</code>
 * <code>arm_matrix_instance_q15 S = {nRows, nColumns, pData};</code>
 * </pre>
 * where <code>nRows</code> specifies the number of rows, <code>nColumns</code>
 * specifies the number of columns, and <code>pData</code> points to the
 * data array.
 *
 * \par Size Checking
 * By default all of the matrix functions perform size checking on the input and
 * output matrices.  For example, the matrix addition function verifies that the
 * two input matrices and the output matrix all have the same number of rows and
 * columns.  If the size check fails the functions return:
 * <pre>
 *     ARM_MATH_SIZE_MISMATCH
 * </pre>
 * Otherwise the functions return
 * <pre>
 *     ARM_MATH_SUCCESS
 * </pre>
 * There is some overhead associated with this matrix size checking.
 * The matrix size checking is enabled via the \#define
 * <pre>
 *     ARM_MATH_MATRIX_CHECK
 * </pre>
 * within the library project settings.  By default this macro is defined
 * and size checking is enabled.  By changing the project settings and
 * undefining this macro size checking is eliminated and the functions
 * run a bit faster.  With size checking disabled the functions always
 * return <code>ARM_MATH_SUCCESS</code>.
 */

/**
 * @defgroup groupTransforms Transform Functions
 */

/**
 * @defgroup groupController Controller Functions
 */

/**
 * @defgroup groupStats Statistics Functions
 */
/**
 * @defgroup groupSupport Support Functions
 */

/**
 * @defgroup groupInterpolation Interpolation Functions
 * These functions perform 1- and 2-dimensional interpolation of data.
 * Linear interpolation is used for 1-dimensional data and
 * bilinear interpolation is used for 2-dimensional data.
 */

/**
 * @defgroup groupExamples Examples
 */
#ifndef _ARM_MATH_H
#define _ARM_MATH_H

// Teensy 3.0
#include <stdint.h>
#define __ASM   __asm
#define __INLINE  inline
#define __STATIC_INLINE static inline
#define __CORTEX_M  4
#define __FPU_USED  0
#define ARM_MATH_CM4
#include "core_cmInstr.h"
#include "core_cm4_simd.h"


#if 0
// generic for any board...
#define __CMSIS_GENERIC         /* disable NVIC and Systick functions */
#if defined (ARM_MATH_CM4)
#include "core_cm4.h"
#elif defined (ARM_MATH_CM3)
#include "core_cm3.h"
#elif defined (ARM_MATH_CM0)
#include "core_cm0.h"
#else
#include "ARMCM4.h"
#warning "Define either ARM_MATH_CM4 OR ARM_MATH_CM3...By Default building on ARM_MATH_CM4....."
#endif
#undef  __CMSIS_GENERIC         /* enable NVIC and Systick functions */
#endif

#include "string.h"
#include "math.h"
#ifdef  __cplusplus
extern "C"
{
#endif


  /**
   * @brief Macros required for reciprocal calculation in Normalized LMS
   *
   * INDEX_MASK keeps the low six bits of a value; arm_recip_q31() and
   * arm_recip_q15() in this header apply it to form the index into their
   * reciprocal lookup tables.
   *
   * NOTE(review): DELTA_Q31 and DELTA_Q15 are not referenced in this part of
   * the header; presumably they are small constants used by the Normalized
   * LMS filter sources -- confirm there.
   */

#define DELTA_Q31       (0x100)
#define DELTA_Q15       0x5
#define INDEX_MASK      0x0000003F
  /* Single-precision value of pi. Only defined here when the including code
     has not already provided its own PI macro. */
#ifndef PI
#define PI          3.14159265358979f
#endif

  /**
   * @brief Macros required for SINE and COSINE Fast math approximations
   *
   * TABLE_SIZE is 256.  TABLE_SPACING_Q31 (0x800000 = 2^23) equals 2^31 / 256
   * and TABLE_SPACING_Q15 (0x80 = 2^7) equals 2^15 / 256.
   * NOTE(review): presumably these are the input step between adjacent
   * lookup-table entries in Q31 and Q15 -- confirm against the fast math
   * sources, which are not part of this header.
   */

#define TABLE_SIZE      256
#define TABLE_SPACING_Q31 0x800000
#define TABLE_SPACING_Q15 0x80

  /**
   * @brief Macros required for SINE and COSINE Controller functions
   */
  /* INPUT_SPACING is the value 2/360 expressed in 1.31 (Q31) fixed point:
     2^31 * 2 / 360 rounds to 0xB60B61. */
  /* The input range -1 to +1 is divided into 360 values, so the spacing
     between adjacent values is (2/360). */
#define INPUT_SPACING     0xB60B61

  /**
   * @brief Macro for Unaligned Support
   *
   * ALIGN4 is a declaration qualifier.  When UNALIGNED_SUPPORT_DISABLE is not
   * defined it expands to nothing.  When UNALIGNED_SUPPORT_DISABLE is defined
   * it expands to a 4-byte alignment request: the GCC aligned(4) attribute
   * when __GNUC__ is defined, otherwise the __align(4) keyword.
   */
#ifndef UNALIGNED_SUPPORT_DISABLE
    #define ALIGN4
#else
  #if defined  (__GNUC__)
    #define ALIGN4 __attribute__((aligned(4)))
  #else
    #define ALIGN4 __align(4)
  #endif
#endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE */

  /**
   * @brief Error status returned by some functions in the library.
   *
   * ARM_MATH_SUCCESS is 0; every error code is a distinct negative value.
   */

  typedef enum
  {
    ARM_MATH_SUCCESS = 0,                /**< No error */
    ARM_MATH_ARGUMENT_ERROR = -1,        /**< One or more arguments are incorrect */
    ARM_MATH_LENGTH_ERROR = -2,          /**< Length of data buffer is incorrect */
    ARM_MATH_SIZE_MISMATCH = -3,         /**< Size of matrices is not compatible with the operation. */
    ARM_MATH_NANINF = -4,                /**< Not-a-number (NaN) or infinity is generated */
    ARM_MATH_SINGULAR = -5,              /**< Generated by matrix inversion if the input matrix is singular and cannot be inverted. */
    ARM_MATH_TEST_FAILURE = -6           /**< Test Failed  */
  } arm_status;

  /**
   * @brief 8-bit fractional data type in 1.7 format (stored in an int8_t).
   */
  typedef int8_t q7_t;

  /**
   * @brief 16-bit fractional data type in 1.15 format (stored in an int16_t).
   */
  typedef int16_t q15_t;

  /**
   * @brief 32-bit fractional data type in 1.31 format (stored in an int32_t).
   */
  typedef int32_t q31_t;

  /**
   * @brief 64-bit fractional data type in 1.63 format (stored in an int64_t).
   */
  typedef int64_t q63_t;

  /**
   * @brief 32-bit floating-point type definition (alias of float).
   */
  typedef float float32_t;

  /**
   * @brief 64-bit floating-point type definition (alias of double).
   */
  typedef double float64_t;

  /**
   * @brief definition to read/write two 16 bit values.
   *
   * __SIMD32(addr) reinterprets the pointer variable addr itself as an
   * lvalue of type int32_t *, so dereferencing it accesses 32 bits at once
   * and incrementing it advances addr by one 32-bit word.  addr must
   * therefore be a pointer lvalue.
   *
   * _SIMD32_OFFSET(addr) converts the pointer value addr to int32_t * and
   * dereferences it, giving a 32-bit lvalue at that address without
   * modifying addr.
   *
   * For compilers other than GCC the pointed-to type carries the __packed
   * qualifier.
   * NOTE(review): these macros access objects through a pointer of a
   * different type; the build presumably relies on the compiler tolerating
   * this kind of type punning -- confirm the aliasing settings in use.
   */
#if defined  (__GNUC__)
  #define __SIMD32(addr)         (*( int32_t **) & (addr))
  #define  _SIMD32_OFFSET(addr)  (*( int32_t * )   (addr))
#else
  #define __SIMD32(addr)         (*(__packed                    int32_t **) & (addr))
  #define  _SIMD32_OFFSET(addr)  (*(__packed                    int32_t * )   (addr))
#endif 

  /* Same idea as __SIMD32, but reinterprets addr as an int64_t * lvalue. */
  #define __SIMD64(addr)  (*(int64_t **) & (addr))

#if defined (ARM_MATH_CM3) || defined (ARM_MATH_CM0)
  /**
   * @brief definition to pack two 16 bit values.
   *
   * __PKHBT(ARG1, ARG2, ARG3): the result takes bits [15:0] from ARG1 and
   * bits [31:16] from (ARG2 << ARG3).
   *
   * __PKHTB(ARG1, ARG2, ARG3): the result takes bits [31:16] from ARG1 and
   * bits [15:0] from (ARG2 >> ARG3).
   *
   * Every macro argument is parenthesized in the expansion, so any
   * expression (including one using low-precedence operators such as ?:)
   * may be passed as the shift amount.
   */
#define __PKHBT(ARG1, ARG2, ARG3)      ( (((int32_t)(ARG1) <<  0) & (int32_t)0x0000FFFF) | \
                                         (((int32_t)(ARG2) << (ARG3)) & (int32_t)0xFFFF0000)  )
#define __PKHTB(ARG1, ARG2, ARG3)      ( (((int32_t)(ARG1) <<  0) & (int32_t)0xFFFF0000) | \
                                         (((int32_t)(ARG2) >> (ARG3)) & (int32_t)0x0000FFFF)  )

#endif


   /**
   * @brief definition to pack four 8 bit values.
   *
   * Packs the low 8 bits of each of v0..v3 into one 32-bit word.  For little
   * endian builds (ARM_MATH_BIG_ENDIAN not defined) v0 lands in bits [7:0]
   * and v3 in bits [31:24]; for big endian builds the order is reversed.
   *
   * The bytes are masked and shifted as uint32_t and only the final word is
   * converted to int32_t, so negative inputs and a set top bit do not
   * involve left-shifting a negative signed value.
   */
#ifndef ARM_MATH_BIG_ENDIAN

#define __PACKq7(v0,v1,v2,v3) ( (int32_t) ( (((uint32_t)(int32_t)(v0) & 0xFFu) <<  0) | \
                                            (((uint32_t)(int32_t)(v1) & 0xFFu) <<  8) | \
                                            (((uint32_t)(int32_t)(v2) & 0xFFu) << 16) | \
                                            (((uint32_t)(int32_t)(v3) & 0xFFu) << 24) ) )
#else

#define __PACKq7(v0,v1,v2,v3) ( (int32_t) ( (((uint32_t)(int32_t)(v3) & 0xFFu) <<  0) | \
                                            (((uint32_t)(int32_t)(v2) & 0xFFu) <<  8) | \
                                            (((uint32_t)(int32_t)(v1) & 0xFFu) << 16) | \
                                            (((uint32_t)(int32_t)(v0) & 0xFFu) << 24) ) )

#endif


  /**
   * @brief Saturates a Q63 value to the Q31 range.
   * @param[in] x  64-bit input value
   * @return x itself when it fits in 32 signed bits; otherwise 0x7FFFFFFF
   *         for a positive input or 0x80000000 for a negative input.
   */
  __STATIC_INLINE q31_t clip_q63_to_q31(
  q63_t x)
  {
    const q31_t upperWord = (q31_t) (x >> 32);
    const q31_t lowerWord = (q31_t) x;

    /* x fits in 32 bits exactly when its upper word is the sign extension
       of its lower word. */
    if(upperWord == (lowerWord >> 31))
    {
      return lowerWord;
    }

    /* x >> 63 is 0 or -1, so the XOR yields 0x7FFFFFFF or 0x80000000. */
    return (0x7FFFFFFF ^ ((q31_t) (x >> 63)));
  }

  /**
   * @brief Clips Q63 to Q15 values.
   * @param[in] x  64-bit input value
   * @return 0x7FFF or 0x8000 (by sign of x) when x does not fit in 32 signed
   *         bits; otherwise the low 16 bits of (x >> 15).
   *
   * Note that the range test is against 32 bits, not against the 16-bit
   * result: an input that fits in 32 bits is shifted and truncated, not
   * saturated.
   */
  __STATIC_INLINE q15_t clip_q63_to_q15(
  q63_t x)
  {
    const q31_t upperWord = (q31_t) (x >> 32);
    const q31_t lowerSign = ((q31_t) x) >> 31;

    if(upperWord != lowerSign)
    {
      /* x >> 63 is 0 or -1, selecting the positive or negative limit. */
      return (0x7FFF ^ ((q15_t) (x >> 63)));
    }

    return (q15_t) (x >> 15);
  }

  /**
   * @brief Clips Q31 to Q7 values.
   * @param[in] x  32-bit input value
   * @return x when it lies in [-128, 127]; otherwise 127 for larger values
   *         and -128 for smaller values.
   *
   * The range test is made against the 8-bit limits themselves, so every
   * value outside [-128, 127] is saturated rather than truncated to its low
   * byte.
   */
  __STATIC_INLINE q7_t clip_q31_to_q7(
  q31_t x)
  {
    if(x > (q31_t) 0x7F)
    {
      return (q7_t) 0x7F;
    }

    if(x < (q31_t) (-0x80))
    {
      return (q7_t) (-0x80);
    }

    return (q7_t) x;
  }

  /**
   * @brief Saturates a Q31 value to the Q15 range.
   * @param[in] x  32-bit input value
   * @return x when it fits in 16 signed bits; otherwise 0x7FFF for a
   *         positive input or 0x8000 for a negative input.
   */
  __STATIC_INLINE q15_t clip_q31_to_q15(
  q31_t x)
  {
    /* (x >> 16) and (x >> 15) agree only when bits 31..15 are all equal,
       i.e. when x already fits in 16 signed bits. */
    if((q31_t) (x >> 16) == ((q31_t) x >> 15))
    {
      return (q15_t) x;
    }

    /* x >> 31 is 0 or -1, selecting the positive or negative limit. */
    return (0x7FFF ^ ((q15_t) (x >> 31)));
  }

  /**
   * @brief Multiplies a 64-bit value by a 32-bit value, dropping the low 32 bits of the product.
   * @param[in] x  64-bit operand
   * @param[in] y  32-bit operand
   * @return ((low 32 bits of x) * y >> 32) + ((x >> 32) * y), as a 64-bit value.
   *
   * NOTE(review): the previous comment described the result as a 32 bit
   * value in 2.30 format; the fixed-point interpretation depends on the
   * caller and cannot be confirmed from this function alone.
   */

  __STATIC_INLINE q63_t mult32x64(
  q63_t x,
  q31_t y)
  {
    const q63_t lowPart = (q63_t) (x & 0x00000000FFFFFFFF);
    const q63_t highPart = (q63_t) (x >> 32);

    const q63_t lowContribution = (lowPart * y) >> 32;
    const q63_t highContribution = highPart * y;

    return (lowContribution + highContribution);
  }


/* Cortex-M0 builds with the ARM compiler (__CC_ARM): map __CLZ onto the
   compiler's __clz. */
#if defined (ARM_MATH_CM0) && defined ( __CC_ARM   )
#define __CLZ __clz
#endif

/* Cortex-M0 builds with the TASKING compiler: nothing is defined here. */
#if defined (ARM_MATH_CM0) && defined ( __TASKING__ )
/* No need to redefine __CLZ */
#endif

#if defined (ARM_MATH_CM0) && ((defined (__ICCARM__)) ||(defined (__GNUC__)) )

  /**
   * @brief C implementation of count-leading-zeros for Cortex-M0 builds (IAR / GCC).
   * @param[in] data  value to examine
   * @return number of zero bits above the most significant set bit of data;
   *         32 when data is zero.
   */
  __STATIC_INLINE  uint32_t __CLZ(q31_t data);


  __STATIC_INLINE uint32_t __CLZ(q31_t data)
  {
    uint32_t count = 0u;
    uint32_t mask = 0x80000000u;

    /* Stop once the mask has been shifted out completely; without this test
       a zero input would never terminate the loop. */
    while((mask != 0u) && (((uint32_t) data & mask) == 0u))
    {
      count += 1u;
      mask = mask >> 1u;
    }

    return (count);

  }

#endif

  /**
   * @brief Calculates the reciprocal (1/in) of a Q31 value.
   * @param[in]  in           input value
   * @param[out] dst          location that receives the computed reciprocal value
   * @param[in]  pRecipTable  table of initial estimates; it is indexed with a
   *                          six-bit value, so it is read at indices 0..63
   * @return signBits + 1, where signBits is the left shift that was applied
   *         to normalize in.
   *
   * The input is normalized by a left shift, a table lookup supplies an
   * initial estimate, and the estimate is then refined in two iterations.
   *
   * NOTE(review): for in == 0 the shift amount comes from __CLZ(0), and for
   * the most negative input the expression -in overflows; callers presumably
   * never pass these values -- confirm.
   */

  __STATIC_INLINE uint32_t arm_recip_q31(
  q31_t in,
  q31_t * dst,
  q31_t * pRecipTable)
  {

    uint32_t out, tempVal;
    uint32_t index, i;
    uint32_t signBits;

    /* Count the leading zeros of the magnitude of in, less one bit that is
       kept as the sign position. */
    if(in > 0)
    {
      signBits = __CLZ(in) - 1;
    }
    else
    {
      signBits = __CLZ(-in) - 1;
    }

    /* Convert input sample to 1.31 format */
    in = in << signBits;

    /* calculation of index for initial approximated Val:
       bits 29..24 of the normalized input select the table entry */
    index = (uint32_t) (in >> 24u);
    index = (index & INDEX_MASK);

    /* 1.31 with exp 1 */
    out = pRecipTable[index];

    /* calculation of reciprocal value */
    /* running approximation for two iterations */
    for (i = 0u; i < 2u; i++)
    {
      /* tempVal = 0x7FFFFFFF - (in * out >> 31) */
      tempVal = (q31_t) (((q63_t) in * out) >> 31u);
      tempVal = 0x7FFFFFFF - tempVal;
      /*      1.31 with exp 1 */
      /* previous form of the update, without saturation: */
      //out = (q31_t) (((q63_t) out * tempVal) >> 30u);
      /* current form: the 64-bit product is saturated to 32 bits */
      out = (q31_t) clip_q63_to_q31(((q63_t) out * tempVal) >> 30u);
    }

    /* write output */
    *dst = out;

    /* return num of signbits of out = 1/in value */
    return (signBits + 1u);

  }

  /**
   * @brief Calculates the reciprocal (1/in) of a Q15 value.
   * @param[in]  in           input value
   * @param[out] dst          location that receives the computed reciprocal value
   * @param[in]  pRecipTable  table of initial estimates; it is indexed with a
   *                          six-bit value, so it is read at indices 0..63
   * @return signBits + 1, where signBits is the left shift that was applied
   *         to normalize in.
   *
   * The input is normalized by a left shift, a table lookup supplies an
   * initial estimate, and the estimate is then refined in two iterations.
   *
   * NOTE(review): for in == -32768 the magnitude 32768 has only 16 leading
   * zeros in 32 bits, so __CLZ(-in) - 17 wraps around as an unsigned value;
   * callers presumably never pass this value -- confirm.
   */
  __STATIC_INLINE uint32_t arm_recip_q15(
  q15_t in,
  q15_t * dst,
  q15_t * pRecipTable)
  {

    uint32_t out = 0, tempVal = 0;
    uint32_t index = 0, i = 0;
    uint32_t signBits = 0;

    /* in is promoted to 32 bits before __CLZ, so 16 of the counted zeros
       come from the promotion and one more bit is kept as the sign position. */
    if(in > 0)
    {
      signBits = __CLZ(in) - 17;
    }
    else
    {
      signBits = __CLZ(-in) - 17;
    }

    /* Convert input sample to 1.15 format */
    in = in << signBits;

    /* calculation of index for initial approximated Val:
       bits 13..8 of the normalized input select the table entry */
    index = in >> 8;
    index = (index & INDEX_MASK);

    /*      1.15 with exp 1  */
    out = pRecipTable[index];

    /* calculation of reciprocal value */
    /* running approximation for two iterations */
    for (i = 0; i < 2; i++)
    {
      /* tempVal = 0x7FFF - (in * out >> 15) */
      tempVal = (q15_t) (((q31_t) in * out) >> 15);
      tempVal = 0x7FFF - tempVal;
      /*      1.15 with exp 1 */
      out = (q15_t) (((q31_t) out * tempVal) >> 14);
    }

    /* write output */
    *dst = out;

    /* return num of signbits of out = 1/in value */
    return (signBits + 1);

  }


  /*
   * @brief C custom defined intrinsic function for M0 processors only
   */
#if defined(ARM_MATH_CM0)

  /*
   * @brief C implementation of signed saturation for M0 processors
   * @param[in] x  value to saturate
   * @param[in] y  width in bits of the target range (1..32)
   * @return x limited to [-2^(y-1), 2^(y-1) - 1]; x is returned unchanged
   *         when y is outside 1..32.
   *
   * The limits are formed with a single unsigned shift, so y == 32 does not
   * overflow a signed intermediate and y == 0 does not run a wrapped-around
   * loop.
   */
  __STATIC_INLINE q31_t __SSAT(
  q31_t x,
  uint32_t y)
  {
    if((y >= 1u) && (y <= 32u))
    {
      const q31_t posMax = (q31_t) ((((uint32_t) 1u) << (y - 1u)) - 1u);
      const q31_t negMin = -posMax - 1;

      if(x > posMax)
      {
        x = posMax;
      }
      else if(x < negMin)
      {
        x = negMin;
      }
    }

    return (x);
  }

#endif /* end of ARM_MATH_CM0 */


  /*
   * @brief C custom defined intrinsic function for M3 and M0 processors
   */
#if defined (ARM_MATH_CM3) || defined (ARM_MATH_CM0)

  /*
   * @brief C custom defined QADD8 for M3 and M0 processors
   *
   * Adds the four signed bytes of x to the corresponding bytes of y and
   * saturates each sum to 8 bits.
   * @param[in] x  first operand, four packed signed bytes
   * @param[in] y  second operand, four packed signed bytes
   * @return the four saturated byte sums, packed in the same byte order
   *
   * Each byte lane is extracted with a right shift and a narrowing cast and
   * the result is assembled in unsigned arithmetic, so no negative value is
   * ever left-shifted.
   */
  __STATIC_INLINE q31_t __QADD8(
  q31_t x,
  q31_t y)
  {

    q31_t lane0, lane1, lane2, lane3;

    lane0 = __SSAT(((q31_t) ((q7_t) x) + (q31_t) ((q7_t) y)), 8);
    lane1 = __SSAT(((q31_t) ((q7_t) (x >> 8)) + (q31_t) ((q7_t) (y >> 8))), 8);
    lane2 = __SSAT(((q31_t) ((q7_t) (x >> 16)) + (q31_t) ((q7_t) (y >> 16))), 8);
    lane3 = __SSAT(((q31_t) ((x >> 24) + (y >> 24))), 8);

    return (q31_t) ((((uint32_t) lane3 & 0xFFu) << 24) |
                    (((uint32_t) lane2 & 0xFFu) << 16) |
                    (((uint32_t) lane1 & 0xFFu) << 8) |
                    ((uint32_t) lane0 & 0xFFu));

  }

  /*
   * @brief C custom defined QSUB8 for M3 and M0 processors
   *
   * Subtracts the four signed bytes of y from the corresponding bytes of x
   * and saturates each difference to 8 bits.
   * @param[in] x  first operand, four packed signed bytes
   * @param[in] y  second operand, four packed signed bytes
   * @return the four saturated byte differences, packed in the same byte order
   *
   * Each byte lane is extracted with a right shift and a narrowing cast and
   * the result is assembled in unsigned arithmetic, so no negative value is
   * ever left-shifted.
   */
  __STATIC_INLINE q31_t __QSUB8(
  q31_t x,
  q31_t y)
  {

    q31_t lane0, lane1, lane2, lane3;

    lane0 = __SSAT(((q31_t) ((q7_t) x) - (q31_t) ((q7_t) y)), 8);
    lane1 = __SSAT(((q31_t) ((q7_t) (x >> 8)) - (q31_t) ((q7_t) (y >> 8))), 8);
    lane2 = __SSAT(((q31_t) ((q7_t) (x >> 16)) - (q31_t) ((q7_t) (y >> 16))), 8);
    lane3 = __SSAT(((q31_t) ((x >> 24) - (y >> 24))), 8);

    return (q31_t) ((((uint32_t) lane3 & 0xFFu) << 24) |
                    (((uint32_t) lane2 & 0xFFu) << 16) |
                    (((uint32_t) lane1 & 0xFFu) << 8) |
                    ((uint32_t) lane0 & 0xFFu));
  }

  /*
   * @brief C custom defined QADD16 for M3 and M0 processors
   *
   * Adds the two signed halfwords of x to the corresponding halfwords of y
   * and saturates each sum to 16 bits.
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two saturated sums, packed in the same halfword order
   *
   * The result is assembled in unsigned arithmetic, so a negative upper
   * halfword is never left-shifted as a signed value.
   */
  __STATIC_INLINE q31_t __QADD16(
  q31_t x,
  q31_t y)
  {

    q31_t lower, upper;

    lower = __SSAT(((q31_t) ((short) x) + (q31_t) ((short) y)), 16);
    upper = __SSAT(((q31_t) ((x >> 16) + (y >> 16))), 16);

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));

  }

  /*
   * @brief C custom defined SHADD16 for M3 and M0 processors
   *
   * Adds the two signed halfwords of x to the corresponding halfwords of y
   * and halves each sum.
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two halved sums, packed in the same halfword order
   *
   * The full-width sum is formed first and halved afterwards, matching the
   * SHADD16 instruction (result = bits [16:1] of the sum).  Halving each
   * operand before adding would drop the carry out of the two least
   * significant bits.
   */
  __STATIC_INLINE q31_t __SHADD16(
  q31_t x,
  q31_t y)
  {

    q31_t lower, upper;

    lower = ((q31_t) ((short) x) + (q31_t) ((short) y)) >> 1;
    upper = ((x >> 16) + (y >> 16)) >> 1;

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));

  }

  /*
   * @brief C custom defined QSUB16 for M3 and M0 processors
   *
   * Subtracts the two signed halfwords of y from the corresponding halfwords
   * of x and saturates each difference to 16 bits.
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two saturated differences, packed in the same halfword order
   *
   * The result is assembled in unsigned arithmetic, so a negative upper
   * halfword is never left-shifted as a signed value.
   */
  __STATIC_INLINE q31_t __QSUB16(
  q31_t x,
  q31_t y)
  {

    q31_t lower, upper;

    lower = __SSAT(((q31_t) ((short) x) - (q31_t) ((short) y)), 16);
    upper = __SSAT(((q31_t) ((x >> 16) - (y >> 16))), 16);

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));
  }

  /*
   * @brief C custom defined SHSUB16 for M3 and M0 processors
   *
   * Subtracts the two signed halfwords of y from the corresponding halfwords
   * of x and halves each difference.
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two halved differences, packed in the same halfword order
   *
   * The full-width difference is formed first and halved afterwards,
   * matching the SHSUB16 instruction (result = bits [16:1] of the
   * difference).  Halving each operand before subtracting would lose the
   * borrow from the least significant bits.
   */
  __STATIC_INLINE q31_t __SHSUB16(
  q31_t x,
  q31_t y)
  {

    q31_t lower, upper;

    lower = ((q31_t) ((short) x) - (q31_t) ((short) y)) >> 1;
    upper = ((x >> 16) - (y >> 16)) >> 1;

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));
  }

  /*
   * @brief C custom defined QASX for M3 and M0 processors
   *
   * Exchanges the halfwords of y, then adds the upper halfwords and
   * subtracts the lower halfwords, saturating each result to 16 bits:
   *   upper result = sat16(x[31:16] + y[15:0])
   *   lower result = sat16(x[15:0]  - y[31:16])
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two saturated results packed into one word
   *
   * Each halfword is masked to 16 bits before packing, so a negative lower
   * result cannot borrow from the upper halfword.
   */
  __STATIC_INLINE q31_t __QASX(
  q31_t x,
  q31_t y)
  {

    q15_t upper, lower;

    upper = clip_q31_to_q15((q31_t) ((short) (x >> 16) + (short) y));
    lower = clip_q31_to_q15((q31_t) ((short) x - (short) (y >> 16)));

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));
  }

  /*
   * @brief C custom defined SHASX for M3 and M0 processors
   *
   * Exchanges the halfwords of y, then adds the upper halfwords and
   * subtracts the lower halfwords, halving each result:
   *   upper result = (x[31:16] + y[15:0])  >> 1
   *   lower result = (x[15:0]  - y[31:16]) >> 1
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two halved results packed into one word
   *
   * The full-width sum and difference are formed first and halved
   * afterwards, matching the SHASX instruction (result = bits [16:1]).
   */
  __STATIC_INLINE q31_t __SHASX(
  q31_t x,
  q31_t y)
  {

    q31_t lower, upper;

    lower = ((q31_t) ((short) x) - (y >> 16)) >> 1;
    upper = ((x >> 16) + (q31_t) ((short) y)) >> 1;

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));
  }


  /*
   * @brief C custom defined QSAX for M3 and M0 processors
   *
   * Exchanges the halfwords of y, then subtracts the upper halfwords and
   * adds the lower halfwords, saturating each result to 16 bits:
   *   upper result = sat16(x[31:16] - y[15:0])
   *   lower result = sat16(x[15:0]  + y[31:16])
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two saturated results packed into one word
   *
   * Each halfword is masked to 16 bits before packing, so a negative lower
   * result cannot borrow from the upper halfword.
   */
  __STATIC_INLINE q31_t __QSAX(
  q31_t x,
  q31_t y)
  {

    q15_t upper, lower;

    upper = clip_q31_to_q15((q31_t) ((short) (x >> 16) - (short) y));
    lower = clip_q31_to_q15((q31_t) ((short) x + (short) (y >> 16)));

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));
  }

  /*
   * @brief C custom defined SHSAX for M3 and M0 processors
   *
   * Exchanges the halfwords of y, then subtracts the upper halfwords and
   * adds the lower halfwords, halving each result:
   *   upper result = (x[31:16] - y[15:0])  >> 1
   *   lower result = (x[15:0]  + y[31:16]) >> 1
   * @param[in] x  first operand, two packed signed halfwords
   * @param[in] y  second operand, two packed signed halfwords
   * @return the two halved results packed into one word
   *
   * The full-width sum and difference are formed first and halved
   * afterwards, matching the SHSAX instruction (result = bits [16:1]).
   */
  __STATIC_INLINE q31_t __SHSAX(
  q31_t x,
  q31_t y)
  {

    q31_t lower, upper;

    lower = ((q31_t) ((short) x) + (y >> 16)) >> 1;
    upper = ((x >> 16) - (q31_t) ((short) y)) >> 1;

    return (q31_t) ((((uint32_t) upper & 0xFFFFu) << 16) |
                    ((uint32_t) lower & 0xFFFFu));
  }

  /*
   * @brief C version of SMUSDX for M3 and M0 processors
   *
   * Returns (x[15:0] * y[31:16]) - (x[31:16] * y[15:0]), treating every
   * halfword as a signed 16-bit value.
   */
  __STATIC_INLINE q31_t __SMUSDX(
  q31_t x,
  q31_t y)
  {
    const short xLow = (short) x;
    const short xHigh = (short) (x >> 16);
    const short yLow = (short) y;
    const short yHigh = (short) (y >> 16);

    return ((q31_t) ((xLow * yHigh) - (xHigh * yLow)));
  }

  /*
   * @brief C version of SMUADX for M3 and M0 processors
   *
   * Returns (x[15:0] * y[31:16]) + (x[31:16] * y[15:0]), treating every
   * halfword as a signed 16-bit value.
   */
  __STATIC_INLINE q31_t __SMUADX(
  q31_t x,
  q31_t y)
  {
    const short xLow = (short) x;
    const short xHigh = (short) (x >> 16);
    const short yLow = (short) y;
    const short yHigh = (short) (y >> 16);

    return ((q31_t) ((xLow * yHigh) + (xHigh * yLow)));
  }

  /*
   * @brief C version of QADD for M3 and M0 processors
   *
   * Returns x + y, computed in 64 bits and saturated to the 32-bit range.
   */
  __STATIC_INLINE q31_t __QADD(
  q31_t x,
  q31_t y)
  {
    const q63_t wideSum = (q63_t) x + y;

    return clip_q63_to_q31(wideSum);
  }

  /*
   * @brief C version of QSUB for M3 and M0 processors
   *
   * Returns x - y, computed in 64 bits and saturated to the 32-bit range.
   */
  __STATIC_INLINE q31_t __QSUB(
  q31_t x,
  q31_t y)
  {
    const q63_t wideDiff = (q63_t) x - y;

    return clip_q63_to_q31(wideDiff);
  }

  /*
   * @brief C version of SMLAD for M3 and M0 processors
   *
   * Returns sum + (x[31:16] * y[31:16]) + (x[15:0] * y[15:0]), treating
   * every halfword as a signed 16-bit value.
   */
  __STATIC_INLINE q31_t __SMLAD(
  q31_t x,
  q31_t y,
  q31_t sum)
  {
    const q31_t highProduct = (short) (x >> 16) * (short) (y >> 16);
    const q31_t lowProduct = (short) x * (short) y;

    return ((sum + highProduct) + lowProduct);
  }

  /*
   * @brief C version of SMLADX for M3 and M0 processors
   *
   * Returns sum + (x[31:16] * y[15:0]) + (x[15:0] * y[31:16]), treating
   * every halfword as a signed 16-bit value.
   */
  __STATIC_INLINE q31_t __SMLADX(
  q31_t x,
  q31_t y,
  q31_t sum)
  {
    const q31_t highByLow = (short) (x >> 16) * (short) (y);
    const q31_t lowByHigh = (short) x * (short) (y >> 16);

    return ((sum + highByLow) + lowByHigh);
  }

  /*
   * @brief C version of SMLSDX for M3 and M0 processors
   *
   * Returns sum - (x[31:16] * y[15:0]) + (x[15:0] * y[31:16]), treating
   * every halfword as a signed 16-bit value.
   */
  __STATIC_INLINE q31_t __SMLSDX(
  q31_t x,
  q31_t y,
  q31_t sum)
  {
    const q31_t highByLow = (short) (x >> 16) * (short) (y);
    const q31_t lowByHigh = (short) x * (short) (y >> 16);

    return ((sum - highByLow) + lowByHigh);
  }

  /*
   * @brief C version of SMLALD for M3 and M0 processors
   *
   * Returns the 64-bit accumulator sum plus (x[31:16] * y[31:16]) and
   * (x[15:0] * y[15:0]), treating every halfword as a signed 16-bit value.
   */
  __STATIC_INLINE q63_t __SMLALD(
  q31_t x,
  q31_t y,
  q63_t sum)
  {
    q63_t acc = sum;

    acc = acc + ((short) (x >> 16) * (short) (y >> 16));
    acc = acc + ((short) x * (short) y);

    return acc;
  }

  /*
   * @brief C version of SMLALDX for M3 and M0 processors
   *
   * Returns the 64-bit accumulator sum plus (x[31:16] * y[15:0]) and
   * (x[15:0] * y[31:16]), treating every halfword as a signed 16-bit value.
   */
  __STATIC_INLINE q63_t __SMLALDX(
  q31_t x,
  q31_t y,
  q63_t sum)
  {
    q63_t acc = sum;

    acc = acc + ((short) (x >> 16) * (short) y);
    acc = acc + ((short) x * (short) (y >> 16));

    return acc;
  }

  /*
   * @brief C custom defined SMUAD for M3 and M0 processors
   */
  __STATIC_INLINE q31_t __SMUAD(
  q31_t x,