Calculating the sum of an array of numbers is pretty easy with SSE.
We will need the horizontal sum from previous posts.
__forceinline float horizontalSum_SSE2(const __m128 &mABCD)
{
__m128 mCDCD = _mm_movehl_ps(mABCD, mABCD);
__m128 mApCBpD = _mm_add_ps(mABCD, mCDCD);
__m128 mBpD = _mm_shuffle_ps(mApCBpD, mApCBpD, 0x55);
__m128 mApBpCpD = _mm_add_ps(mApCBpD, mBpD);
return _mm_cvtss_f32(mApBpCpD);
}
The actual Summing looks a lot like the code for the average.
float Sum_SSE2(const float * const pA, const size_t uiAOrig)
{
__m128 mSummed = _mm_setzero_ps();
size_t uiA = uiAOrig;
if (uiA & 1)
{
uiA--;
mSummed = _mm_load_ss(&pA[uiA]);
}
if (uiA & 2)
{
uiA -= 2;
mSummed = _mm_loadh_pi(mSummed, (const __m64*)&pA[uiA]);
}
while (uiA > 0)
{
uiA -= 4;
mSummed = _mm_add_ps(mSummed, _mm_loadu_ps(&pA[uiA]));
}
return horizontalSum_SSE2(mSummed);
}
The euclidean norm is the euclidean distance without the square root. If we are only trying to figure out which of a set has the smallest (or largest) distance, we don't need to perform the square root, and can instead of distance just compare the euclidean norm.
float EuclideanNormSquared_SSE2(const float * const pA, const size_t uiAOrig)
{
__m128 mSummed = _mm_setzero_ps();
size_t uiA = uiAOrig;
if (uiA & 3)
{
__m128 mVals = _mm_setzero_ps();
if (uiA & 1)
{
uiA--;
mVals = _mm_load_ss(&pA[uiA]);
}
if (uiA & 2)
{
uiA -= 2;
mVals = _mm_loadh_pi(mVals, (const __m64*)&pA[uiA]);
}
mSummed = _mm_mul_ps(mVals, mVals);
}
while (uiA > 0)
{
uiA -= 4;
__m128 mVals = _mm_loadu_ps(&pA[uiA]);
mSummed = _mm_add_ps(mSummed, _mm_mul_ps(mVals, mVals));
}
return horizontalSum_SSE2(mSummed);
}
We could squeeze a little more perf by unrolling the loop a bit, and on lower performance processors could slightly benefit from aligning the reads. But it's close enough.
No comments:
Post a Comment