Sunday, April 23, 2017

using aligned memory reads for a summation in SSE

Memory allocated in C is not guaranteed to have the 16-byte alignment that SSE prefers.  A way to request a specific alignment was eventually incorporated into the language with aligned_alloc (std::aligned_alloc in C++).

Reads that are aligned to 16-byte memory boundaries can be made faster by making that guarantee to the processor, which is done by using _mm_load_ps rather than _mm_loadu_ps.  On newer hardware the benefits are smaller, and may not be enough to justify the added complexity.

But let's look at an aligning implementation of a function that we've seen before.

// Adds the four float lanes of mABCD together and returns the scalar total.
// The lanes are combined pairwise: (lane0 + lane2) + (lane1 + lane3).
__forceinline float horizontalSum_SSE2(const __m128 &mABCD)
{
    // Copy the upper two lanes (C, D) down so they line up with (A, B).
    const __m128 upperHalf = _mm_movehl_ps(mABCD, mABCD);
    // lane0 = A+C, lane1 = B+D (the upper lanes are not used afterwards).
    const __m128 pairSums = _mm_add_ps(mABCD, upperHalf);
    // Broadcast lane1 (B+D) to every lane so it can be added onto lane0.
    const __m128 oddPair = _mm_shuffle_ps(pairSums, pairSums, _MM_SHUFFLE(1, 1, 1, 1));
    // Only lane0 of the final sum is extracted.
    return _mm_cvtss_f32(_mm_add_ps(pairSums, oddPair));
}

// Accumulates the floats pA[0 .. uiAOrig) into a vector of four partial sums.
// The caller is expected to add the four lanes together afterwards.
//
// Template parameters:
//   bShort   - when true the 4-wide loop is skipped, so only the trailing
//              (uiAOrig & 3) elements are accumulated; meant for counts below 4.
//   bAligned - when true the 4-wide loads use _mm_load_ps instead of
//              _mm_loadu_ps, so pA must sit on a 16-byte boundary.
//
// The trailing 1-3 elements are read with _mm_load_ss / _mm_loadh_pi, and the
// full groups of four are then consumed from the highest index down to pA.
template <bool bShort = false, bool bAligned = false>
__forceinline __m128 SumWithoutHorizontal(const float * const pA, const size_t uiAOrig)
{
    // Count of elements that belong to complete groups of four.
    const size_t wholeBlockCount = uiAOrig & ~static_cast<size_t>(3);

    __m128 acc = _mm_setzero_ps();

    // Odd count: the very last element goes into lane 0, other lanes zeroed.
    if ((uiAOrig & 1) != 0)
    {
        acc = _mm_load_ss(pA + (uiAOrig - 1));
    }

    // A leftover pair sits directly after the complete groups; it is loaded
    // into the upper two lanes, leaving the lower two lanes untouched.
    if ((uiAOrig & 2) != 0)
    {
        acc = _mm_loadh_pi(acc, reinterpret_cast<const __m64*>(pA + wholeBlockCount));
    }

    if (!bShort)
    {
        // Walk backwards over the complete groups of four.
        for (const float *pBlock = pA + wholeBlockCount; pBlock != pA; )
        {
            pBlock -= 4;
            __m128 block;
            if (bAligned)
            {
                block = _mm_load_ps(pBlock);
            }
            else
            {
                block = _mm_loadu_ps(pBlock);
            }
            acc = _mm_add_ps(acc, block);
        }
    }
    return acc;
}

float SumWithAlign_SSE2(const float * const pA, const size_t uiAOrig)
{
    ULONG_PTR ulpA = (ULONG_PTR)pA;
    __m128 mSum;
    if ((uiAOrig < 8) || (ulpA & 0x3))
    {
        mSum = SumWithoutHorizontal<false, false>(pA, uiAOrig);
    }
    else
    {
        size_t numBytesToAligned = (0xf & -ulpA);
        size_t numElementsToAligned = numBytesToAligned / sizeof(*pA);

        mSum = _mm_add_ps(SumWithoutHorizontal<true, false>(pA, numElementsToAligned), SumWithoutHorizontal<false, true>(pA+ numElementsToAligned, uiAOrig - numElementsToAligned));
    }
    return horizontalSum_SSE2(mSum);
}

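Calculating the number of bytes that need to be processed to reach 16-byte alignment is performed with: numBytesToAligned = (0xf & -ulpA); This feels like a bit of magic, but negating the address and keeping the low four bits gives the distance up to the next multiple of 16: for example, an address ending in 0x4 yields 0xc, i.e. 12 bytes (three floats).  I found this by looking at the generated assembly from a more conventional implementation.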

No comments:

Post a Comment