2d vectors come up a lot in graphics: floats are treated in pairs, with all of the normal linear algebra operations applied to them.
Adding four scaled 2d vectors is a straightforward operation.
We pass in the 2d vectors in two XMM registers (two vectors per register), and the four scale values in a single XMM register.
__forceinline __m128 horizontalSum2D_SSE2(const __m128 mLeft, const __m128 mRight)
{
    // Pairwise sum: [x0+x2, y0+y2, x1+x3, y1+y3].
    __m128 mTop = _mm_add_ps(mLeft, mRight);
    // Swap the 64-bit halves so the remaining partial sums line up.
    __m128 mShuffleTop = _mm_shuffle_ps(mTop, mTop, _MM_SHUFFLE(1, 0, 3, 2));
    // The summed 2d vector lands in the low two floats (duplicated in the high two).
    __m128 mRet = _mm_add_ps(mTop, mShuffleTop);
    return mRet;
}
__m128 scaleHorizontalSum2D_SSE2(const __m128 mLeft, const __m128 mRight, const __m128 mScales)
{
    // Broadcast each scale across both components of its vector:
    // [s0, s0, s1, s1] and [s2, s2, s3, s3].
    __m128 mScaleLeft = _mm_unpacklo_ps(mScales, mScales);
    __m128 mScaleRight = _mm_unpackhi_ps(mScales, mScales);
    __m128 mMulLeft = _mm_mul_ps(mScaleLeft, mLeft);
    __m128 mMulRight = _mm_mul_ps(mScaleRight, mRight);
    __m128 mRet = horizontalSum2D_SSE2(mMulLeft, mMulRight);
    return mRet;
}
The result is returned in the low two floats of an XMM register. This entire operation takes only 9 instructions.
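A minimal usage sketch (the packed layout, two [x, y] pairs per register with scales in order s0..s3, follows from how the unpacks pair things up; the driver function itself is mine):

#include <emmintrin.h>

// Hypothetical driver: compute s0*v0 + s1*v1 + s2*v2 + s3*v3 for 2d vectors vi.
void scaleSum2DExample(const float (&aXY)[8], const float (&aScales)[4], float (&aResult)[2])
{
    __m128 mLeft = _mm_loadu_ps(&aXY[0]);   // [x0, y0, x1, y1]
    __m128 mRight = _mm_loadu_ps(&aXY[4]);  // [x2, y2, x3, y3]
    __m128 mScales = _mm_loadu_ps(aScales); // [s0, s1, s2, s3]
    __m128 mSum = scaleHorizontalSum2D_SSE2(mLeft, mRight, mScales);
    // The summed vector sits in the low two floats; store them with one 64-bit store.
    _mm_store_sd(reinterpret_cast<double *>(aResult), _mm_castps_pd(mSum));
}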
Wednesday, May 31, 2017
Monday, May 29, 2017
float bilinear interpolation using SSE2
Bilinear interpolation is an easy way to smooth out sampled data. The sampled values are known at the corners of the unit square, and interpolation is performed linearly in each dimension.
Basically this boils down to evaluating the function:
$$f(x,y) \approx f(0,0)(1-x)(1-y) + f(1,0)\,x(1-y) + f(0,1)(1-x)\,y + f(1,1)\,xy$$
void bilinearInterpolate_ANSI(__in_ecount(2) const float * const pXY, __in_ecount(2) const float *const pF0, __in_ecount(2) const float *const pF1, __out float *const pResult)
{
    const float &x = pXY[0];
    const float &y = pXY[1];
    // pF0 holds f(0,0) and f(1,0); pF1 holds f(0,1) and f(1,1).
    *pResult = pF0[0] * (1 - x)*(1 - y) + pF0[1] * x*(1 - y) + pF1[0] * (1 - x)*y + pF1[1] * x*y;
}
That addition is either going to require a horizontal add, or something like a transpose of the functional values. I've chosen to go with the horizontal addition route.
For the moment, let's look at everything up to the multiplication by the f values. It's another case where it costs close to the same to evaluate two as to evaluate one.
void bilinearInterpolateInner_SSE(const __m128 &mXY, __out __m128 &mLeft, __out __m128 &mRight)
{
    // mXY holds [x0, y0, x1, y1]: the coordinates of two independent sample points.
    __m128 m1111 = _mm_set1_ps(1);
    __m128 mOmXY = _mm_sub_ps(m1111, mXY); // [1-x0, 1-y0, 1-x1, 1-y1]
    __m128 mYLeft = _mm_shuffle_ps(mOmXY, mXY, _MM_SHUFFLE(1, 1, 1, 1));  // [1-y0, 1-y0, y0, y0]
    __m128 mYRight = _mm_shuffle_ps(mOmXY, mXY, _MM_SHUFFLE(3, 3, 3, 3)); // [1-y1, 1-y1, y1, y1]
    __m128 mXYOmXYLeft = _mm_unpacklo_ps(mXY, mOmXY);  // [x0, 1-x0, y0, 1-y0]
    __m128 mXYOmXYRight = _mm_unpackhi_ps(mXY, mOmXY); // [x1, 1-x1, y1, 1-y1]
    __m128 mXLeft = _mm_shuffle_ps(mXYOmXYLeft, mXYOmXYLeft, _MM_SHUFFLE(0, 1, 0, 1));    // [1-x0, x0, 1-x0, x0]
    __m128 mXRight = _mm_shuffle_ps(mXYOmXYRight, mXYOmXYRight, _MM_SHUFFLE(0, 1, 0, 1)); // [1-x1, x1, 1-x1, x1]
    // The four bilinear weights per point, ordered to match f(0,0), f(1,0), f(0,1), f(1,1).
    mLeft = _mm_mul_ps(mXLeft, mYLeft);
    mRight = _mm_mul_ps(mXRight, mYRight);
}
If we have loaded the values for f into two SSE registers, this can be evaluated as follows:
__forceinline __m128 _mm_hadd_ps_SSE2(const __m128 mLeft, const __m128 mRight)
{
    // Note: this returns the pairwise sums in a permuted order relative to the
    // real SSE3 _mm_hadd_ps; the caller below compensates by swapping its arguments.
    __m128 mTop = _mm_shuffle_ps(mLeft, mRight, _MM_SHUFFLE(0, 2, 0, 2));
    __m128 mBottom = _mm_shuffle_ps(mLeft, mRight, _MM_SHUFFLE(1, 3, 1, 3));
    return _mm_add_ps(mTop, mBottom);
}
__forceinline void store2floats(const __m128 &mFloats, __out_ecount(2) float * const pfResults)
{
    // Store the low 64 bits (two floats) with a single double-precision store.
    _mm_store_sd(reinterpret_cast<double *>(pfResults), _mm_castps_pd(mFloats));
}
void bilinearInterpolateMiddle_SSE(const __m128 &mXYLeftRight, const __m128 &mFLeft, const __m128 &mFRight, __out_ecount(2) float * const pfResults)
{
    __m128 mLeft, mRight;
    bilinearInterpolateInner_SSE(mXYLeftRight, mLeft, mRight);
    // mRight's products go first to compensate for the emulated hadd's ordering.
    __m128 mInt = _mm_hadd_ps_SSE2(_mm_mul_ps(mRight, mFRight), _mm_mul_ps(mLeft, mFLeft));
    __m128 mRet = _mm_hadd_ps_SSE2(mInt, mInt);
    store2floats(mRet, pfResults);
}
This does the final multiply against the f values and then the horizontal sums; the two horizontal sums are performed simultaneously. The SSE3 _mm_hadd_ps instruction would be helpful here, but it is simulated using SSE2 in the example above.
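A minimal usage sketch, interpolating two sample points at once (the corner ordering f(0,0), f(1,0), f(0,1), f(1,1) matches the weight order computed above; the driver function is mine):

#include <xmmintrin.h>

// Hypothetical driver: evaluate two bilinear interpolations in one call.
void bilinearExample(const float (&aXY)[4], const float (&aF0)[4], const float (&aF1)[4], float (&aResults)[2])
{
    __m128 mXY = _mm_loadu_ps(aXY);     // [x0, y0, x1, y1] for the two points
    __m128 mFLeft = _mm_loadu_ps(aF0);  // corners for point 0: f(0,0), f(1,0), f(0,1), f(1,1)
    __m128 mFRight = _mm_loadu_ps(aF1); // corners for point 1, same order
    bilinearInterpolateMiddle_SSE(mXY, mFLeft, mFRight, aResults);
}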
Saturday, May 13, 2017
The preprocessing for a box blur in SSE2: the sum of all of the elements in a 2d array above and to the left.
It is often useful to blur images, and often necessary to blur the same image in multiple ways. A common way of doing this is to precalculate an intermediate result and then quickly produce multiple blurred images from it.
The preprocessing step is to create an array of floats in the shape of the image. The value at each location is the inclusive sum of all of the original image's values in the rectangle from the upper-left pixel of the image down to the target point. (This structure is commonly called a summed-area table, or integral image.)
This process easily suffers from integer overflow, so we need to use floats in the intermediate image.
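Once the table exists, the sum over any axis-aligned rectangle (and so any box blur, at any radius) takes only four lookups. A hedged sketch of the readback; the interior-only indexing and the helper name are my assumptions:

#include <cstddef>

// Inclusive sum of the rectangle [left..right] x [top..bottom], for rectangles
// that don't touch the top or left edge (left > 0 and top > 0); edge cells
// would need the usual clamping. aTable is the precalculated image.
float BoxSum(const float *aTable, size_t cTableWidth, size_t left, size_t top, size_t right, size_t bottom)
{
    const float *pBottomRow = aTable + bottom * cTableWidth;
    const float *pAboveRow = aTable + (top - 1) * cTableWidth;
    return pBottomRow[right] - pBottomRow[left - 1] - pAboveRow[right] + pAboveRow[left - 1];
}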
We saw a 1-dimensional cumulative sum in a previous post, and that code is re-included here.
template <typename T>
__forceinline T SumArray(__in_ecount(cElements) const T * const aIn, __out_ecount(cElements) T* const aOut, const size_t cElements, const T fRunningSumIn = 0)
{
    T fRunningSum = fRunningSumIn;
    for (size_t iOnElement = 0; iOnElement < cElements; iOnElement++)
    {
        aOut[iOnElement] = (fRunningSum += aIn[iOnElement]);
    }
    return fRunningSum;
}
// Shift a register left by whole float elements (zeros shifted in).
template<const unsigned char nElementsToShift>
__forceinline __m128 _mm_slli_ps(__m128 m)
{
    return _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(m), 4 * nElementsToShift));
}
// Inclusive prefix sum within one register, in two shift-and-add steps:
// [a, b, c, d] -> [a, a+b, b+c, c+d] -> [a, a+b, a+b+c, a+b+c+d].
__forceinline __m128 SumRegister(__m128 mIn)
{
    __m128 mAddShift = _mm_add_ps(mIn, _mm_slli_ps<1>(mIn));
    return _mm_add_ps(mAddShift, _mm_slli_ps<2>(mAddShift));
}
float CumulativeSum_SSE(const float * const aIn, float * const aOut, size_t cElements, float fRunningSum = 0)
{
    // Handle the first (cElements mod 4) elements in scalar code.
    size_t cOnElement = cElements & 3;
    fRunningSum = SumArray(aIn, aOut, cOnElement, fRunningSum);
    __m128 mLeftSum = _mm_set1_ps(fRunningSum);
    // Handle one block of four if needed, so that what remains is a multiple of eight.
    if (cElements & 4)
    {
        __m128 mIn = _mm_loadu_ps(aIn + cOnElement);
        __m128 mOut = _mm_add_ps(mLeftSum, SumRegister(mIn));
        _mm_storeu_ps(aOut + cOnElement, mOut);
        // Broadcast the highest (final) running sum for the next block.
        mLeftSum = _mm_shuffle_ps(mOut, mOut, _MM_SHUFFLE(3, 3, 3, 3));
        cOnElement += 4;
    }
    // Main loop: two blocks of four per iteration.
    while (cOnElement < cElements)
    {
        __m128 mInLo = _mm_loadu_ps(aIn + cOnElement);
        __m128 mInHi = _mm_loadu_ps(aIn + cOnElement + 4);
        __m128 mSummedInLo = SumRegister(mInLo);
        __m128 mSummedInHi = SumRegister(mInHi);
        __m128 mOutLo = _mm_add_ps(mLeftSum, mSummedInLo);
        _mm_storeu_ps(aOut + cOnElement, mOutLo);
        mLeftSum = _mm_shuffle_ps(mOutLo, mOutLo, _MM_SHUFFLE(3, 3, 3, 3));
        __m128 mOutHi = _mm_add_ps(mLeftSum, mSummedInHi);
        _mm_storeu_ps(aOut + cOnElement + 4, mOutHi);
        mLeftSum = _mm_shuffle_ps(mOutHi, mOutHi, _MM_SHUFFLE(3, 3, 3, 3));
        cOnElement += 8;
    }
    return _mm_cvtss_f32(mLeftSum);
}
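The 2-dimensional code below relies on an AddBytesToPointer helper that isn't shown in the post; a minimal sketch of what it presumably does (advance a typed pointer by a byte stride) is:

#include <cstddef>
#include <cstdint>

// Assumed definition; the rows of the image may have an arbitrary byte stride,
// so pointers are advanced by bytes rather than by elements.
template <typename T>
__forceinline T *AddBytesToPointer(T * const p, const ptrdiff_t cBytes)
{
    return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(p) + cBytes);
}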
And then the 2-dimensional summation will look pretty similar:
void BoxFilterPreCalc_SSE(const float * const aIn, float * const aOut, const size_t cHorizontalElements, const size_t cVerticalElements, const ptrdiff_t verticalStrideIn, const ptrdiff_t verticalStrideOut)
{
    if (!cVerticalElements)
    {
        return;
    }
    // The first row is just a 1-dimensional cumulative sum.
    CumulativeSum_SSE(aIn, aOut, cHorizontalElements, 0);
    size_t cRowsRemaining = cVerticalElements - 1;
    const float *pStartOutPrevLine = aOut;
    float *pStartOutCurLine = AddBytesToPointer(aOut, verticalStrideOut);
    const float *pStartInCurLine = AddBytesToPointer(aIn, verticalStrideIn);
    while (cRowsRemaining-- > 0)
    {
        // Each subsequent row is its own cumulative sum plus the row above.
        float fLeftSum = 0;
        size_t iOnElement = 0;
        size_t cElementsRemaining = cHorizontalElements;
        while (cElementsRemaining & 3)
        {
            cElementsRemaining--;
            pStartOutCurLine[iOnElement] = pStartOutPrevLine[iOnElement] + (fLeftSum += pStartInCurLine[iOnElement]);
            iOnElement++;
        }
        __m128 mLeftSum = _mm_set1_ps(fLeftSum);
        if (cElementsRemaining & 4)
        {
            cElementsRemaining -= 4;
            __m128 mInLo = _mm_loadu_ps(pStartInCurLine + iOnElement);
            __m128 mPrevOutLo = _mm_loadu_ps(pStartOutPrevLine + iOnElement);
            __m128 mSummedInLo = SumRegister(mInLo);
            __m128 mLeftSumOutLo = _mm_add_ps(mLeftSum, mSummedInLo);
            __m128 mOutLo = _mm_add_ps(mLeftSumOutLo, mPrevOutLo);
            _mm_storeu_ps(pStartOutCurLine + iOnElement, mOutLo);
            mLeftSum = _mm_shuffle_ps(mLeftSumOutLo, mLeftSumOutLo, _MM_SHUFFLE(3, 3, 3, 3));
            iOnElement += 4;
        }
        while (cElementsRemaining > 0)
        {
            cElementsRemaining -= 8;
            __m128 mInLo = _mm_loadu_ps(pStartInCurLine + iOnElement);
            __m128 mInHi = _mm_loadu_ps(pStartInCurLine + iOnElement + 4);
            __m128 mPrevOutLo = _mm_loadu_ps(pStartOutPrevLine + iOnElement);
            __m128 mPrevOutHi = _mm_loadu_ps(pStartOutPrevLine + iOnElement + 4);
            __m128 mSummedInLo = SumRegister(mInLo);
            __m128 mSummedInHi = SumRegister(mInHi);
            __m128 mLeftSumOutLo = _mm_add_ps(mLeftSum, mSummedInLo);
            __m128 mOutLo = _mm_add_ps(mLeftSumOutLo, mPrevOutLo);
            _mm_storeu_ps(pStartOutCurLine + iOnElement, mOutLo);
            // Carry the running row sum forward (assignment, not a fresh variable,
            // so the next iteration sees the updated value).
            mLeftSum = _mm_shuffle_ps(mLeftSumOutLo, mLeftSumOutLo, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 mLeftSumOutHi = _mm_add_ps(mLeftSum, mSummedInHi);
            __m128 mOutHi = _mm_add_ps(mLeftSumOutHi, mPrevOutHi);
            _mm_storeu_ps(pStartOutCurLine + iOnElement + 4, mOutHi);
            mLeftSum = _mm_shuffle_ps(mLeftSumOutHi, mLeftSumOutHi, _MM_SHUFFLE(3, 3, 3, 3));
            iOnElement += 8;
        }
        pStartOutPrevLine = pStartOutCurLine;
        pStartOutCurLine = AddBytesToPointer(pStartOutCurLine, verticalStrideOut);
        pStartInCurLine = AddBytesToPointer(pStartInCurLine, verticalStrideIn);
    }
}
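A hypothetical driver for a tightly packed image (the function and parameter names here are mine):

// Build the table for a packed cWidth x cHeight float image, where each row
// immediately follows the previous one.
void BuildTableExample(const float * const aImage, float * const aTable, const size_t cWidth, const size_t cHeight)
{
    const ptrdiff_t stride = static_cast<ptrdiff_t>(cWidth * sizeof(float));
    BoxFilterPreCalc_SSE(aImage, aTable, cWidth, cHeight, stride, stride);
}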
For some similar operations we will actually legitimately need the increased accuracy of doubles. That's coming soon.
Sunday, May 7, 2017
Levenshtein distance using SSSE3 and SSE41
Levenshtein distance is a measure of how close two strings are to each other: the minimum number of insertions, deletions, and substitutions necessary to transform one string into the other. For example, "kitten" and "sitting" are at distance 3 (two substitutions and one insertion).
The following is a fairly faithful translation from the code in the Wikipedia article into C++.
template <typename T>
void swap(T &a, T &b)
{
    T temp = a;
    a = b;
    b = temp;
}
size_t Levenshtein_distance_ANSI(LPCWSTR s, LPCWSTR t)
{
    size_t lenS = wcslen(s);
    size_t lenT = wcslen(t);
    if (0 == wcscmp(s, t))
    {
        return 0;
    }
    // If either string is empty, the distance is the length of the other.
    if (0 == lenS)
    {
        return lenT;
    }
    if (0 == lenT)
    {
        return lenS;
    }
    // create two work vectors of integer distances
    int *v0 = new int[lenT + 1];
    int *v1 = new int[lenT + 1];
    // initialize v0 (the previous row of distances)
    // this row is A[0][i]: edit distance for an empty s
    // the distance is just the number of characters to delete from t
    for (size_t i = 0; i <= lenT; i++)
    {
        v0[i] = i;
    }
    for (size_t i = 0; i < lenS; i++)
    {
        // calculate v1 (current row distances) from the previous row v0
        // first element of v1 is A[i+1][0]
        // edit distance is delete (i+1) chars from s to match empty t
        v1[0] = i + 1;
        // use formula to fill in the rest of the row
        for (size_t j = 0; j < lenT; j++)
        {
            int cost = (s[i] != t[j]);
            int right = v1[j] + 1;
            int down = v0[j + 1] + 1;
            int diag = v0[j] + cost;
            v1[j + 1] = min(min(right, down), diag);
        }
        // swap v1 (current row) to v0 (previous row) for next iteration
        swap(v0, v1);
    }
    size_t retVal = v0[lenT];
    delete[] v0;
    delete[] v1;
    return retVal;
}
I have made one minor change: the code above swaps the pointers instead of recopying from the destination row back into the source row.
size_t Levenshtein_distance_SSE42(LPCWSTR s, LPCWSTR t)
{
    size_t lenS = wcslen(s);
    size_t lenT = wcslen(t);
    if (0 == wcscmp(s, t))
    {
        return 0;
    }
    // If either string is empty, the distance is the length of the other.
    if (0 == lenS)
    {
        return lenT;
    }
    if (0 == lenT)
    {
        return lenS;
    }
    // create two work vectors of integer distances
    int *v0 = new int[lenT + 1];
    int *v1 = new int[lenT + 1];
    // initialize v0 (the previous row of distances)
    // this row is A[0][i]: edit distance for an empty s
    // the distance is just the number of characters to delete from t
    for (size_t i = 0; i <= lenT; i++)
    {
        v0[i] = i;
    }
    const __m128i mOne = _mm_set1_epi32(1);
    const __m128i mTwo = _mm_set1_epi32(2);
    const __m128i mFour = _mm_set1_epi32(4);
    for (size_t i = 0; i < lenS; i++)
    {
        // calculate v1 (current row distances) from the previous row v0
        // first element of v1 is A[i+1][0]
        // edit distance is delete (i+1) chars from s to match empty t
        v1[0] = i + 1;
        // use formula to fill in the rest of the row
        // first handle the first elements so that the remainder is a multiple of 4
        size_t j = 0;
        while ((lenT - j) & 3)
        {
            int cost = (s[i] != t[j]);
            int right = v1[j] + 1;
            int diag = v0[j] + cost;
            j++;
            int down = v0[j] + 1;
            v1[j] = min(min(right, down), diag);
        }
        // Anything left over at this point can be handled in blocks of four elements.
        if (j < lenT)
        {
            __m128i msi = _mm_set1_epi16(s[i]);
            __m128i mPrevBest = _mm_set1_epi32(v1[j]);
            do {
                __m128i v0lo = _mm_loadu_si128((__m128i *)&v0[j]);
                __m128i v0loOffset = _mm_loadu_si128((__m128i *)&v0[j + 1]);
                // Compare s[i] against four characters of t; widening the 16-bit
                // masks gives -1 where equal, so 1 + mask is the 0/1 cost per lane.
                __m128i mtj = _mm_set1_epi64x(*(__int64 *)&t[j]);
                __m128i mstCmp = _mm_cmpeq_epi16(msi, mtj);
                __m128i mCost = _mm_add_epi32(mOne, _mm_unpacklo_epi16(mstCmp, mstCmp));
                __m128i mDiag = _mm_add_epi32(v0lo, mCost);
                __m128i mDown = _mm_add_epi32(mOne, v0loOffset);
                __m128i mMinDiagDown = _mm_min_epi32(mDiag, mDown);
                // Resolve the left-to-right dependency with shifted mins:
                // shift in by one, two, and four lanes, adding 1, 2, and 4.
                __m128i mRightOne = _mm_min_epi32(mMinDiagDown, _mm_add_epi32(mOne, _mm_alignr_epi8(mMinDiagDown, mPrevBest, 12)));
                __m128i mRightTwo = _mm_add_epi32(mTwo, _mm_alignr_epi8(mRightOne, mPrevBest, 8));
                __m128i mRightFour = _mm_add_epi32(mFour, mPrevBest);
                __m128i mBest = _mm_min_epi32(mRightOne, _mm_min_epi32(mRightFour, mRightTwo));
                mPrevBest = mBest;
                _mm_storeu_si128((__m128i *)&v1[j + 1], mBest);
                j += 4;
            } while (j < lenT);
        }
        // swap v1 (current row) to v0 (previous row) for next iteration
        swap(v0, v1);
    }
    size_t retVal = v0[lenT];
    delete[] v0;
    delete[] v1;
    return retVal;
}
This is a pretty faithful reimplementation of the first version, and it takes about 1/3 the wall-clock time of the ANSI implementation. Note that there are two reads from memory per inner-loop iteration; this could be addressed by reading one group of four ahead.
The horizontal minimums are performed by taking a min against the block shifted left by one element, then by two elements, then by four elements, with a matching constant (1, 2, or 4) added at each step.
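Concretely, each block needs best[k] = min(m[k], best[k-1] + 1), where m holds min(diag, down) and best[-1] is the last cell computed before the block. A scalar sketch of that recurrence, with a hypothetical helper purely for illustration:

// What the shift-and-min sequence computes for one block of four lanes.
void prefixMinScalar(const int (&m)[4], const int prevBest, int (&best)[4])
{
    best[0] = min(m[0], prevBest + 1);
    for (int k = 1; k < 4; k++)
    {
        best[k] = min(m[k], best[k - 1] + 1);
    }
}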
Other implementations I have seen on the internet use a different algorithm based on calculating along the diagonal. It is a bit harder to understand, but we will revisit it later.
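A quick smoke test comparing the two implementations (a sketch; the test function is mine):

#include <cassert>

void testLevenshtein()
{
    // "kitten" -> "sitting": two substitutions plus one insertion.
    assert(3 == Levenshtein_distance_ANSI(L"kitten", L"sitting"));
    assert(Levenshtein_distance_SSE42(L"kitten", L"sitting") == Levenshtein_distance_ANSI(L"kitten", L"sitting"));
}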