MicroPerf: April 2017

Sunday, April 30, 2017

Distance between adjacent points in a polyline using SSE3

We will need this a little bit later to talk about splines. Spline approximation is usually done using an iterative process. The parametric variable is usually given a starting position based on the distance between the points in the polyline.

So it is a real-world problem to create an array of the distance between adjacent points in a polyline.

typedef struct
{
float x;
float y;
} Pointf2d;

__forceinline float DistanceBetweenPoints(const Pointf2d & aXY, const Pointf2d & bXY)
{
return _hypotf(aXY.x - bXY.x, aXY.y - bXY.y);
}

void distanceBetween2dPoints_ANSI(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __in_ecount(cPoints) const Pointf2d * const pbxy, __out_ecount(cPoints) float * const pDists)
{
size_t onPoint = cPoints;
while (onPoint-- > 0)
{
pDists[onPoint] = DistanceBetweenPoints(paxy[onPoint], pbxy[onPoint]);
}
}

void lengthOfPolylineSegments_ANSI(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __out_ecount(cPoints-1) float * const pLengths)
{
if (cPoints > 0)
{
distanceBetween2dPoints_ANSI(cPoints - 1, &paxy[0], &paxy[1], pLengths);
}
}

This translates pretty easily into SSE. It does rely on a horizontal add, which I have left using the SSE3 intrinsic.

void distanceBetween2dPoints_SSE3(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __in_ecount(cPoints) const Pointf2d * const pbxy, __out_ecount(cPoints) float * const pDists)

{

size_t onPoint = cPoints;

while (onPoint & 3)

{

onPoint--;

pDists[onPoint] = DistanceBetweenPoints(paxy[onPoint], pbxy[onPoint]);

}

while (onPoint > 0)

{

onPoint -= 4;

__m128 mAXYhi = _mm_loadu_ps((float *)&paxy[onPoint+2]);

__m128 mBXYhi = _mm_loadu_ps((float *)&pbxy[onPoint+2]);

__m128 mAXYlo = _mm_loadu_ps((float *)&paxy[onPoint]);

__m128 mBXYlo = _mm_loadu_ps((float *)&pbxy[onPoint]);

__m128 mDiffHi = _mm_sub_ps(mAXYhi, mBXYhi);

__m128 mDiffLo = _mm_sub_ps(mAXYlo, mBXYlo);

__m128 mDiffSqrHi = _mm_mul_ps(mDiffHi, mDiffHi);

__m128 mDiffSqrLo = _mm_mul_ps(mDiffLo, mDiffLo);

__m128 mSumDiffSqr = _mm_hadd_ps(mDiffSqrLo, mDiffSqrHi);

__m128 mSqrtSumDiffSqr = _mm_sqrt_ps(mSumDiffSqr);

_mm_storeu_ps(&pDists[onPoint], mSqrtSumDiffSqr);

}

void lengthOfPolylineSegments_SSE3(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __out_ecount(cPoints-1) float * const pLengths)

{

if (cPoints > 0)

{

distanceBetween2dPoints_SSE(cPoints - 1, &paxy[0], &paxy[1], pLengths);

}

This code can be more efficient. Using the same variable twice for use in the SSE3 implementation makes for easy to understand code. But we can do a little better using the _mm_alignr_epi8 intrinsic, and will see that soon.

Sunday, April 23, 2017

using aligned memory reads for a summation in SSE

Allocated memory in C does not have any guaranteed alignment. This eventually was incorporated into the language with std::aligned_alloc.

Reads that are aligned to 16byte memory boundaries can be made faster by making a guarantee to the processor, by using _mm_load_ps rather than _mm_loadu_ps. On newer hardware the benefits are smaller, and may not be enough to justify added complexity.

But let's look at an aligning implementation of a function that we've seen before.

__forceinline float horizontalSum_SSE2(const __m128 &mABCD)
{
__m128 mCDCD = _mm_movehl_ps(mABCD, mABCD);
__m128 mApCBpD = _mm_add_ps(mABCD, mCDCD);
__m128 mBpD = _mm_shuffle_ps(mApCBpD, mApCBpD, 0x55);
__m128 mApBpCpD = _mm_add_ps(mApCBpD, mBpD);
return _mm_cvtss_f32(mApBpCpD);
}

template <bool bShort = false, bool bAligned = false>
__forceinline __m128 SumWithoutHorizontal(const float * const pA, const size_t uiAOrig)
{
__m128 mSummed = _mm_setzero_ps();
size_t uiA = uiAOrig;
if (uiA & 1)
{
uiA--;
mSummed = _mm_load_ss(&pA[uiA]);
}
if (uiA & 2)
{
uiA -= 2;
mSummed = _mm_loadh_pi(mSummed, (const __m64*)&pA[uiA]);
}
if (!bShort)
{
while (uiA > 0)
{
uiA -= 4;
__m128 mLoaded = bAligned? _mm_load_ps(&pA[uiA]): _mm_loadu_ps(&pA[uiA]);
mSummed = _mm_add_ps(mSummed, mLoaded);
}
}
return mSummed;
}

float SumWithAlign_SSE2(const float * const pA, const size_t uiAOrig)
{
ULONG_PTR ulpA = (ULONG_PTR)pA;
__m128 mSum;
if ((uiAOrig < 8) || (ulpA & 0x3))
{
mSum = SumWithoutHorizontal<false, false>(pA, uiAOrig);
}
else
{
size_t numBytesToAligned = (0xf & -ulpA);
size_t numElementsToAligned = numBytesToAligned / sizeof(*pA);

mSum = _mm_add_ps(SumWithoutHorizontal<true, false>(pA, numElementsToAligned), SumWithoutHorizontal<false, true>(pA+ numElementsToAligned, uiAOrig - numElementsToAligned));
}
return horizontalSum_SSE2(mSum);
}

Calculating the number of bytes that need to be processed to get to a 16 byte alignment is performed with: numBytesToAligned = (0xf & -ulpA); This feels like a bit of magic. I found this by looking at the generated assembly from a more conventional implementation.

Saturday, April 22, 2017

Cumulative sum of the float elements in an array using SSE

Summing all of the elements is pretty simple with SSE. And the compiler does a pretty good job of vectorizing a simple sum automatically.
I've had some cases where the input is the distance between adjacent points, and we'd like to create an array with the cumulative distance up to a particular point.

float CumulativeSum(__in_ecount(cElements) const float * const aIn, __out_ecount(cElements) float * const aOut, size_t cElements, const float fRunningSumIn=0)
{
float fRunningSum = fRunningSumIn;
for (size_t iOnElement = 0; iOnElement < cElements; iOnElement++)
{
  aOut[iOnElement] = (fRunningSum += aIn[iOnElement]);
}
return fRunningSum;
}

This does require a bunch of horizontal sums. But that's okay.

template<const unsigned char nElementsToShift>
__m128 _mm_slli_ps(__m128 m)
{
return _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(m), 4 * nElementsToShift));
}
__m128 SumRegister(__m128 mIn)
{
__m128 mAddShift = _mm_add_ps(mIn, _mm_slli_ps<1>(mIn));
return _mm_add_ps(mAddShift, _mm_slli_ps<2>(mAddShift));
}
float CumulativeSum_SSE(const float * aIn, float * aOut, size_t cElements, float fRunningSum = 0)
{
size_t cOnElement = cElements & 3;
fRunningSum = CumulativeSum(aIn, aOut, cOnElement, fRunningSum);
__m128 mRunningSum = _mm_set1_ps(fRunningSum);
while (cOnElement<cElements)
{
  __m128 mIn = _mm_loadu_ps(aIn + cOnElement);
  __m128 mOut = _mm_add_ps(mRunningSum, SumRegister(mIn));
  _mm_storeu_ps(aOut + cOnElement, mOut);
  mRunningSum = _mm_shuffle_ps(mOut, mOut, _MM_SHUFFLE(3, 3, 3, 3));
  cOnElement += 4;
}
return _mm_cvtss_f32(mRunningSum);
}

The loop could be unrolled a bit, but the benefits would be small.
We will be using this in a little bit.

Thursday, April 20, 2017

The sum and the euclidean norm of an array of float using SSE

The information in this post is a near repeat of the information in the post on calculating the average... But for completeness and setting the stage for a future post on curve approximation let's get this stuff out of the way:

Calculating the sum of an array of numbers is pretty easy with SSE.

We will need the horizontal sum from previous posts.

__forceinline float horizontalSum_SSE2(const __m128 &mABCD)
{
__m128 mCDCD = _mm_movehl_ps(mABCD, mABCD);
__m128 mApCBpD = _mm_add_ps(mABCD, mCDCD);
__m128 mBpD = _mm_shuffle_ps(mApCBpD, mApCBpD, 0x55);
__m128 mApBpCpD = _mm_add_ps(mApCBpD, mBpD);
return _mm_cvtss_f32(mApBpCpD);
}

The actual Summing looks a lot like the code for the average.

float Sum_SSE2(const float * const pA, const size_t uiAOrig)
{
__m128 mSummed = _mm_setzero_ps();
size_t uiA = uiAOrig;
if (uiA & 1)
{
uiA--;
mSummed = _mm_load_ss(&pA[uiA]);
}
if (uiA & 2)
{
uiA -= 2;
mSummed = _mm_loadh_pi(mSummed, (const __m64*)&pA[uiA]);
}
while (uiA > 0)
{
uiA -= 4;
mSummed = _mm_add_ps(mSummed, _mm_loadu_ps(&pA[uiA]));
}
return horizontalSum_SSE2(mSummed);
}

The euclidean norm is the euclidean distance without the square root. If we are only trying to figure out which of a set has the smallest (or largest) distance, we don't need to perform the square root, and can instead of distance just compare the euclidean norm.

float EuclideanNormSquared_SSE2(const float * const pA, const size_t uiAOrig)
{
__m128 mSummed = _mm_setzero_ps();
size_t uiA = uiAOrig;
if (uiA & 3)
{
__m128 mVals = _mm_setzero_ps();
if (uiA & 1)
{
uiA--;
mVals = _mm_load_ss(&pA[uiA]);
}
if (uiA & 2)
{
uiA -= 2;
mVals = _mm_loadh_pi(mVals, (const __m64*)&pA[uiA]);
}
mSummed = _mm_mul_ps(mVals, mVals);
}
while (uiA > 0)
{
uiA -= 4;
__m128 mVals = _mm_loadu_ps(&pA[uiA]);
mSummed = _mm_add_ps(mSummed, _mm_mul_ps(mVals, mVals));
}
return horizontalSum_SSE2(mSummed);
}

We could squeeze a little more perf by unrolling the loop a bit, and on lower performance processors could slightly benefit from aligning the reads. But it's close enough.

Saturday, April 1, 2017

evaluating a 2d cubic spline using SSE. subdividing a 2d cubic spline using SSE.

I really like Bezier cubic splines. Unfortunately, this post will assume some understanding of them, and will not make as much sense without that understanding. If you don't have problems with these curves, this post may not be quite as accessible. I will ask you to assume that these are real problems that real people have with these curves.

Cubic splines are pretty. A 2d curve is described with 4 points as diagramed above. The curve starts at P0 and ends at P3. At P0 the curve starts by going in the direction of P1. At P3 the curve ends while going in the direction away from P2. And many other useful properties.

Sometimes we will end up wanting to evaluate the curve along its' path. We construct a parametric representation and evaluate as the parameter t varies from 0 to 1. Each of the dimensions will be evaluated as a cubic polynomial in t. We can construct the polynomial and evaluate directly. Or...

There's another way to evaluate cubic splines, called De Casteljau's algorithm. This works nicely with SSE.

It is a recursive process, generating a spline of lower order at each level. The recursion is unrolled in the implementation below:

void AffineInterpolate(const __m128 &mP0123, const __m128 &mT, __m128 &mQ012)
{
    __m128 mP0123t = _mm_mul_ps(mP0123, mT);
    __m128 mP0123omt = _mm_sub_ps(mP0123, mP0123t);
    __m128 mP1233t = _mm_shuffle_ps(mP0123t, mP0123t, _MM_SHUFFLE(3, 3, 2, 1));
    mQ012 = _mm_add_ps(mP0123omt, mP1233t);
}
float evaluate1DCubicSplineAt(const __m128 &mP0123, const __m128 &mT)
{
    __m128 mQ012;
    __m128 mR01;
    __m128 mS0;
    AffineInterpolate(mP0123, mT, mQ012);
    AffineInterpolate(mQ012, mT, mR01);
    AffineInterpolate(mR01, mT, mS0);
    return _mm_cvtss_f32(mS0);
}
void evaluate2DCubicSplineAt(__in_ecount(8) const float *P0123xy, const float t, __out_ecount(2) float * const pResultXY)
{
    const __m128 mP01 = _mm_loadu_ps(&P0123xy[0]);
    const __m128 mP23 = _mm_loadu_ps(&P0123xy[4]);
    __m128 mT = _mm_set1_ps(t);
    __m128 mP0123x = _mm_shuffle_ps(mP01, mP23, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 mP0123y = _mm_shuffle_ps(mP01, mP23, _MM_SHUFFLE(3, 1, 3, 1));
    pResultXY[0] = evaluate1DCubicSplineAt(mP0123x, mT);
    pResultXY[1] = evaluate1DCubicSplineAt(mP0123y, mT);
}

This splits the 2d control points into two sets of 1d control points. These are evaluated separately.

The cubic Bezier curve can be split losslessly into two separate curves along its length. All of the control points necessary for this are in fact present in the intermediates of evaluate1DCubicSplineAt. So let's extract them.

void split1DCubicSplineAt(const __m128 &mP0123, const __m128 &mT, __m128 & leftOut, __m128 &rightOut)
{
    __m128 mQ012;
    __m128 mR01;
    __m128 mS0;
    AffineInterpolate(mP0123, mT, mQ012);
    AffineInterpolate(mQ012, mT, mR01);
    AffineInterpolate(mR01, mT, mS0);
    __m128 mPQ = _mm_shuffle_ps(mP0123, mQ012, _MM_SHUFFLE(2, 0, 3, 0));
    __m128 mRS = _mm_shuffle_ps(mR01, mS0, _MM_SHUFFLE(0, 0, 1, 0));
    leftOut = _mm_shuffle_ps(mPQ, mRS, _MM_SHUFFLE(2, 0, 2, 0));
    rightOut = _mm_shuffle_ps(mRS, mPQ, _MM_SHUFFLE(1, 3, 1, 3));
}
void split2DCubicSplineAt(__in_ecount(8) const float *P0123xy, const float t, __out_ecount(16) float * const pSplitCurvesXY)
{
    const __m128 mP01 = _mm_loadu_ps(&P0123xy[0]);
    const __m128 mP23 = _mm_loadu_ps(&P0123xy[4]);
    __m128 mT = _mm_set1_ps(t);
    __m128 mP0123x = _mm_shuffle_ps(mP01, mP23, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 mP0123y = _mm_shuffle_ps(mP01, mP23, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 mLeftX, mLeftY, mRightX, mRightY;
    split1DCubicSplineAt(mP0123x, mT, mLeftX, mRightX);
    split1DCubicSplineAt(mP0123y, mT, mLeftY, mRightY);
    __m128 mLeftXyLo = _mm_unpacklo_ps(mLeftX, mLeftY);
    __m128 mLeftXyHi = _mm_unpackhi_ps(mLeftX, mLeftY);
    __m128 mRightXyLo = _mm_unpacklo_ps(mRightX, mRightY);
    __m128 mRightXyHi = _mm_unpackhi_ps(mRightX, mRightY);
    _mm_storeu_ps(&pSplitCurvesXY[0], mLeftXyLo);
    _mm_storeu_ps(&pSplitCurvesXY[4], mLeftXyHi);
    _mm_storeu_ps(&pSplitCurvesXY[8], mRightXyLo);
    _mm_storeu_ps(&pSplitCurvesXY[12], mRightXyHi);
}

This yields two separate Bezier curves, both output to pSplitCurvesXY. The last point of the first of the Bezier curves is the evaluation at t. This is exactly the same as the first point of the second Bezier curve. In SVG paths this duplicated point would be omitted.

This requires 3 multiplies, 3 adds, and 3 subtractions for each of the 1d evaluation or subdivision. And a bunch of very fast shuffles. The final multiply of each 1d could be combined with each other. But it doesn't seem worth the added complexity.

This compiles to x86 assembly without register spills.