MicroPerf: The preprocessing for a box blur in SSE2. The sum of all of the elements in a 2d array above and to to the left.

It is often useful to blur images. It is often to blur the same image in multiple ways. A common way of doing this is to precalculate a resulting intermediate and then quickly produce multiple resulting blurred images.
The preprocess is create an array of floats in the shape of the image. The value at each location is the inclusive sum of all of the values from the original image all the way to the upper left pixel of the image above and to the left of the target point.
This process pretty easy suffers from integer overflow, so we need to use floats in the intermediate image.

We saw a 1-dimensional cumulative sum in a previous post. And that is re-included here.

template <typename T>
__forceinline T SumArray(__in_ecount(cElements) const T * const aIn, __out_ecount(cElements) T* const aOut, const size_t cElements, const T fRunningSumIn = 0)
{
    T fRunningSum = fRunningSumIn;
    for (size_t iOnElement = 0; iOnElement < cElements; iOnElement++)
    {
        aOut[iOnElement] = (fRunningSum += aIn[iOnElement]);
    }
    return fRunningSum;
}
template<const unsigned char nElementsToShift>
__forceinline __m128 _mm_slli_ps(__m128 m)
{
    return _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(m), 4 * nElementsToShift));
}
__forceinline __m128 SumRegister(__m128 mIn)
{
    __m128 mAddShift = _mm_add_ps(mIn, _mm_slli_ps<1>(mIn));
    return _mm_add_ps(mAddShift, _mm_slli_ps<2>(mAddShift));
}
float CumulativeSum_SSE(const float * const aIn, float * const aOut, size_t cElements, float fRunningSum = 0)
{
    size_t cOnElement = cElements & 3;
    fRunningSum = SumArray(aIn, aOut, cOnElement, fRunningSum);
    __m128 mLeftSum = _mm_set1_ps(fRunningSum);
    if (cElements & 4)
    {
        __m128 mIn = _mm_loadu_ps(aIn + cOnElement);
        __m128 mOut = _mm_add_ps(mLeftSum, SumRegister(mIn));
        _mm_storeu_ps(aOut + cOnElement, mOut);
        mLeftSum = _mm_shuffle_ps(mOut, mOut, _MM_SHUFFLE(3, 3, 3, 3));
        cOnElement += 4;
    }
    while (cOnElement<cElements)
    {
        __m128 mInLo = _mm_loadu_ps(aIn + cOnElement);
        __m128 mInHi = _mm_loadu_ps(aIn + cOnElement + 4);
        __m128 mSummedInLo = SumRegister(mInLo);
        __m128 mSummedInHi = SumRegister(mInHi);
        __m128 mOutLo = _mm_add_ps(mLeftSum, mSummedInLo);
        _mm_storeu_ps(aOut + cOnElement, mOutLo);
        mLeftSum = _mm_shuffle_ps(mOutLo, mOutLo, _MM_SHUFFLE(3, 3, 3, 3));
        __m128 mOutHi = _mm_add_ps(mLeftSum, mSummedInHi);
        _mm_storeu_ps(aOut + cOnElement + 4, mOutHi);
        mLeftSum = _mm_shuffle_ps(mOutHi, mOutHi, _MM_SHUFFLE(3, 3, 3, 3));
        cOnElement += 8;
    }
    return _mm_cvtss_f32(mLeftSum);
}

And then the 2-dimensional summation will look pretty similar:

void BoxFilterPreCalc_SSE(const float * const aIn, float * const aOut, const size_t cHorizontalElements, const size_t cVerticalElements, const ptrdiff_t verticalStrideIn, const ptrdiff_t verticalStrideOut)
{
    if (!cVerticalElements)
    {
        return;
    }
    CumulativeSum_SSE(aIn, aOut, cHorizontalElements, 0);
    size_t cRowsRemaining = cVerticalElements - 1;
    const float *pStartOutPrevLine = aOut;
    float *pStartOutCurLine = AddBytesToPointer(aOut, verticalStrideOut);
    const float *pStartInCurLine = AddBytesToPointer(aIn, verticalStrideIn);
    while (cRowsRemaining-- > 0)
    {
        float fLeftSum = 0;
        size_t iOnElement = 0;
        size_t cElementsRemaining = cHorizontalElements;
        while (cElementsRemaining & 3)
        {
            cElementsRemaining--;
            pStartOutCurLine[iOnElement] = pStartOutPrevLine[iOnElement]+(fLeftSum += pStartInCurLine[iOnElement]);
            iOnElement++;
        }
        __m128 mLeftSum = _mm_set1_ps(fLeftSum);
        if (cElementsRemaining & 4)
        {
            cElementsRemaining -= 4;
            __m128 mInLo = _mm_loadu_ps(pStartInCurLine + iOnElement);
            __m128 mPrevOutLo = _mm_loadu_ps(pStartOutPrevLine + iOnElement);
            __m128 mSummedInLo = SumRegister(mInLo);
            __m128 mLeftSumOutLo = _mm_add_ps(mLeftSum, mSummedInLo);
            __m128 mOutLo = _mm_add_ps(mLeftSumOutLo, mPrevOutLo);
            _mm_storeu_ps(pStartOutCurLine + iOnElement, mOutLo);
            mLeftSum = _mm_shuffle_ps(mLeftSumOutLo, mLeftSumOutLo, _MM_SHUFFLE(3, 3, 3, 3));
            iOnElement += 4;
        }
        while (cElementsRemaining > 0)
        {
            cElementsRemaining -= 8;
            __m128 mInLo = _mm_loadu_ps(pStartInCurLine + iOnElement);
            __m128 mInHi = _mm_loadu_ps(pStartInCurLine + iOnElement + 4);
            __m128 mPrevOutLo = _mm_loadu_ps(pStartOutPrevLine + iOnElement);
            __m128 mPrevOutHi = _mm_loadu_ps(pStartOutPrevLine + iOnElement+4);
            __m128 mSummedInLo = SumRegister(mInLo);
            __m128 mSummedInHi = SumRegister(mInHi);
            __m128 mLeftSumOutLo = _mm_add_ps(mLeftSum, mSummedInLo);
            __m128 mOutLo = _mm_add_ps(mLeftSumOutLo, mPrevOutLo);
            _mm_storeu_ps(pStartOutCurLine + iOnElement, mOutLo);
            __m128 mLeftSum = _mm_shuffle_ps(mLeftSumOutLo, mLeftSumOutLo, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 mLeftSumOutHi = _mm_add_ps(mLeftSum, mSummedInHi);
            __m128 mOutHi = _mm_add_ps(mLeftSumOutHi, mPrevOutHi);
            _mm_storeu_ps(pStartOutCurLine + iOnElement + 4, mOutHi);
            mLeftSum = _mm_shuffle_ps(mLeftSumOutHi, mLeftSumOutHi, _MM_SHUFFLE(3, 3, 3, 3));
            iOnElement += 8;
        }
        pStartOutPrevLine = pStartOutCurLine;
        pStartOutCurLine = AddBytesToPointer(pStartOutCurLine, verticalStrideOut);
        pStartInCurLine = AddBytesToPointer(pStartInCurLine, verticalStrideIn);
    }
}

For some similar opterations we will actually legitimately need the increased accuracy of doubles. That's coming soon.

MicroPerf

Saturday, May 13, 2017

The preprocessing for a box blur in SSE2. The sum of all of the elements in a 2d array above and to to the left.

No comments:

Post a Comment