MicroPerf: Fixed point arithmatic - 16.16 signed int multiplication

In neural network code I've seen a lot of arithmetic using fixed point numbers. Ordinarily I've seen the low 16bits used for the fractional portion and the high 16bits used for the whole portion.

Addition and subtraction use the ordinary int addition and subtraction.

Multiplication of two 16.16 fixed point numbers is slightly more complicated:
#ifndef Int32x32To64
// While this is normally a single machine instruction x86-64 intrinsic, declare a backup for ansi compatibility...
__forceinline __int64 Int32x32To64(long a, long b)
{
    __int64 iRet = (((__int64)((long)(a))) * ((__int64)((long)(b))));
    return iRet;
}
#endif // !Int32x32To64
int iMul(const int a, const int b)
{
    int iRet = (int)(Int32x32To64(a, b) >> 16);
    return iRet;
}

Division of these numbers can similarly be expressed in terms of standard 64bit arithmetic.

It seems like it could be useful to do this in four parallel lanes using SSE. The code is very similar to _mm_mul_epi32_SSE2.

__m128i _mm_fixedMul_epi32_SSE41(const __m128i &x, const __m128i &y)
{
    __m128i mxy02i = _mm_mul_epi32(x, y);
    __m128i mxs = _mm_srli_si128(x, 4);
    __m128i mys = _mm_srli_si128(y, 4);
    __m128i mxy13i = _mm_mul_epi32(mxs,mys);
    __m128i mxy02i_s = _mm_srli_si128(mxy02i, 2);
    __m128i mxy13i_s = _mm_slli_si128(mxy13i, 2);
    __m128i mxy0123 = _mm_blend_epi16(mxy02i_s, mxy13i_s, 0xCC);
    return mxy0123;
}
The sigmoid function is often useful with artificial neural networks:
float sigmoid(const float x)
{
    return (1 / (1 + expf(-x)));
}
This can be approximated as a polynomial, which can then be expressed in fixed point. The difference from ideal is only ~1.5%:
int isigmoid(const int x)
{
    // The approximation for the right half in HornerForm is : y = 0. + x (20185.5 + x (-4794.54 + x(513.987 - 20.7585 x) ))
    // Once we wiggle around a little bit for the integer truncation it becomes:       y = 0. + x (19620.7 + x (-4405.61 + (432.455 - 15.5408 x) x))
    // The approximation is only valid for 0<=x<=6. We reflect to get the left half.
    // First we make sure that we are in the approximated region.
    int ax = abs(x);
    if (((UINT)ax) >= 6 * 65536)
    {
        return (x > 0) ? 65536 : 0;
    }
    // Take the abs and use from now on.
    int x4 = iMul(-16, ax);
    int x4p = x4 + 432;
    int x3 = iMul(ax, x4p);
    int x3p = x3 - 4406;
    int x2 = iMul(ax, x3p);
    int x2p = x2 + 19621;
    int x1 = iMul(ax, x2p);
    // Now that we have the left half approximation, reflect to the right half.
    int iRet = 32768 + ((x >= 0) ? (+x1) : (-x1));
    return iRet;
}

This translates easily into SSE code:

// requires SSSE3 and SSE41.
__m128i _mm_fixedSigmoid(const __m128i &x)
{
    __m128i mAbsXu = _mm_abs_epi32(x);
    __m128i mApproxInvalid = _mm_cmpGTE_epu32_SSE41(mAbsXu, _mm_set1_epi32(6*65536));
    const __m128i mPolynomialCoefficients = _mm_setr_epi32(-16, 432, -4406, 19621);
    __m128i m = _mm_fixedMul_epi32_SSE41(_mm_shuffle_epi32(mPolynomialCoefficients,_MM_SHUFFLE(0,0,0,0)), mAbsXu);
    __m128i mp = _mm_add_epi32(m, _mm_shuffle_epi32(mPolynomialCoefficients, _MM_SHUFFLE(1,1,1,1)));
    m = _mm_fixedMul_epi32_SSE41(mp, mAbsXu);
    mp = _mm_add_epi32(m, _mm_shuffle_epi32(mPolynomialCoefficients, _MM_SHUFFLE(2,2,2,2)));
    m = _mm_fixedMul_epi32_SSE41(mp, mAbsXu);
    mp = _mm_add_epi32(m, _mm_shuffle_epi32(mPolynomialCoefficients, _MM_SHUFFLE(3,3,3,3)));
    m = _mm_fixedMul_epi32_SSE41(mp, mAbsXu);
    __m128i mClamped = _mm_blendv_epi8(m, _mm_set1_epi32(32768), mApproxInvalid);
    __m128i mResult = _mm_add_epi32(_mm_set1_epi32(32768),_mm_sign_epi32(mClamped,x));
    return mResult;
}

This makes me yearn for floats.

For completeness, here's the one lane unsigned ansi fixed point multiply, and the four lane unsigned sse fixed point multiply. Not much different:
__forceinline unsigned int uMul(const unsigned int a, const unsigned int b)
{
    unsigned int uiRet = (unsigned int)(UInt32x32To64(a, b) >> 16);
    return uiRet;
}
__forceinline __m128i _mm_fixedMul_epu32_SSE41(const __m128i &x, const __m128i &y)
{
    __m128i mxy02i = _mm_mul_epu32(x, y);
    __m128i mxs = _mm_srli_si128(x, 4);
    __m128i mys = _mm_srli_si128(y, 4);
    __m128i mxy13i = _mm_mul_epu32(mxs, mys);
    __m128i mxy02i_s = _mm_srli_si128(mxy02i, 2);
    __m128i mxy13i_s = _mm_slli_si128(mxy13i, 2);
    __m128i mxy0123 = _mm_blend_epi16(mxy02i_s, mxy13i_s, 0xCC);
    return mxy0123;
}

MicroPerf

Thursday, December 22, 2016

Fixed point arithmatic - 16.16 signed int multiplication - and the sigmoid function

No comments:

Post a Comment