So it is a real-world problem to create an array of the distance between adjacent points in a polyline.
typedef struct
{
float x;
float y;
} Pointf2d;
{
return _hypotf(aXY.x - bXY.x, aXY.y - bXY.y);
}
void distanceBetween2dPoints_ANSI(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __in_ecount(cPoints) const Pointf2d * const pbxy, __out_ecount(cPoints) float * const pDists)
{
size_t onPoint = cPoints;
while (onPoint-- > 0)
{
pDists[onPoint] = DistanceBetweenPoints(paxy[onPoint], pbxy[onPoint]);
}
}
void lengthOfPolylineSegments_ANSI(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __out_ecount(cPoints-1) float * const pLengths)
{
if (cPoints > 0)
{
distanceBetween2dPoints_ANSI(cPoints - 1, &paxy[0], &paxy[1], pLengths);
}
}
This translates pretty easily into SSE. It does rely on a horizontal add, which I have left using the SSE3 intrinsic.
void distanceBetween2dPoints_SSE3(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __in_ecount(cPoints) const Pointf2d * const pbxy, __out_ecount(cPoints) float * const pDists)
{
size_t onPoint = cPoints;
while (onPoint & 3)
{
onPoint--;
pDists[onPoint] = DistanceBetweenPoints(paxy[onPoint], pbxy[onPoint]);
}
while (onPoint > 0)
{
onPoint -= 4;
__m128 mAXYhi = _mm_loadu_ps((float *)&paxy[onPoint+2]);
__m128 mBXYhi = _mm_loadu_ps((float *)&pbxy[onPoint+2]);
__m128 mAXYlo = _mm_loadu_ps((float *)&paxy[onPoint]);
__m128 mBXYlo = _mm_loadu_ps((float *)&pbxy[onPoint]);
__m128 mDiffHi = _mm_sub_ps(mAXYhi, mBXYhi);
__m128 mDiffLo = _mm_sub_ps(mAXYlo, mBXYlo);
__m128 mDiffSqrHi = _mm_mul_ps(mDiffHi, mDiffHi);
__m128 mDiffSqrLo = _mm_mul_ps(mDiffLo, mDiffLo);
__m128 mSumDiffSqr = _mm_hadd_ps(mDiffSqrLo, mDiffSqrHi);
__m128 mSqrtSumDiffSqr = _mm_sqrt_ps(mSumDiffSqr);
_mm_storeu_ps(&pDists[onPoint], mSqrtSumDiffSqr);
}
}
void lengthOfPolylineSegments_SSE3(const size_t cPoints, __in_ecount(cPoints) const Pointf2d* const paxy, __out_ecount(cPoints-1) float * const pLengths)
{
if (cPoints > 0)
{
distanceBetween2dPoints_SSE(cPoints - 1, &paxy[0], &paxy[1], pLengths);
}
}
This code can be more efficient. Using the same variable twice for use in the SSE3 implementation makes for easy to understand code. But we can do a little better using the _mm_alignr_epi8 intrinsic, and will see that soon.
No comments:
Post a Comment