Reads from addresses that are aligned to a 16-byte boundary can be made faster by guaranteeing that alignment to the processor, which is what using _mm_load_ps rather than _mm_loadu_ps does. On newer hardware the benefit is smaller, and may not be enough to justify the added complexity.
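To make the guarantee concrete, here is a minimal sketch (my addition, not part of the original code) assuming <xmmintrin.h> and C++11 alignas; the helper name and data are purely illustrative. _mm_load_ps is only safe when the address really is on a 16-byte boundary, while _mm_loadu_ps accepts any address.
// Illustrative sketch only: two ways of loading four floats.
#include <xmmintrin.h>
alignas(16) static float s_aligned[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
__m128 LoadBoth(const float *pAnywhere)
{
    __m128 mFast = _mm_load_ps(s_aligned);   // requires a 16-byte aligned address, faults otherwise
    __m128 mSafe = _mm_loadu_ps(pAnywhere);  // works for any address, historically slower
    return _mm_add_ps(mFast, mSafe);
}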
But let's look at an aligning implementation of a function that we've seen before.
__forceinline float horizontalSum_SSE2(const __m128 &mABCD)
{
    __m128 mCDCD = _mm_movehl_ps(mABCD, mABCD);            // [C, D, C, D]
    __m128 mApCBpD = _mm_add_ps(mABCD, mCDCD);             // [A+C, B+D, C+C, D+D]
    __m128 mBpD = _mm_shuffle_ps(mApCBpD, mApCBpD, 0x55);  // broadcast lane 1: [B+D, ...]
    __m128 mApBpCpD = _mm_add_ps(mApCBpD, mBpD);           // lane 0 now holds A+B+C+D
    return _mm_cvtss_f32(mApBpCpD);
}
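As a quick sanity check (my addition, the test function is illustrative), summing the lanes of [1, 2, 3, 4] should produce 10:
float TestHorizontalSum()
{
    __m128 mValues = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  // lanes A..D
    return horizontalSum_SSE2(mValues);                    // expected: 10.0f
}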
template <bool bShort = false, bool bAligned = false>
__forceinline __m128 SumWithoutHorizontal(const float * const pA, const size_t uiAOrig)
{
    __m128 mSummed = _mm_setzero_ps();
    size_t uiA = uiAOrig;
    if (uiA & 1)
    {
        // Odd count: pick up the last element on its own into lane 0.
        uiA--;
        mSummed = _mm_load_ss(&pA[uiA]);
    }
    if (uiA & 2)
    {
        // Count not a multiple of 4: pick up two more elements into the high lanes.
        uiA -= 2;
        mSummed = _mm_loadh_pi(mSummed, (const __m64*)&pA[uiA]);
    }
    if (!bShort)
    {
        // uiA is now a multiple of 4; walk back through the array four floats at a time.
        while (uiA > 0)
        {
            uiA -= 4;
            __m128 mLoaded = bAligned ? _mm_load_ps(&pA[uiA]) : _mm_loadu_ps(&pA[uiA]);
            mSummed = _mm_add_ps(mSummed, mLoaded);
        }
    }
    return mSummed;
}
float SumWithAlign_SSE2(const float * const pA, const size_t uiAOrig)
{
    ULONG_PTR ulpA = (ULONG_PTR)pA;
    __m128 mSum;
    if ((uiAOrig < 8) || (ulpA & 0x3))
    {
        // Too short to be worth aligning, or not even float-aligned: plain unaligned path.
        mSum = SumWithoutHorizontal<false, false>(pA, uiAOrig);
    }
    else
    {
        // Sum the 0-3 leading elements before the 16-byte boundary, then the aligned remainder.
        size_t numBytesToAligned = (0xf & -ulpA);
        size_t numElementsToAligned = numBytesToAligned / sizeof(*pA);
        mSum = _mm_add_ps(SumWithoutHorizontal<true, false>(pA, numElementsToAligned),
                          SumWithoutHorizontal<false, true>(pA + numElementsToAligned,
                                                            uiAOrig - numElementsToAligned));
    }
    return horizontalSum_SSE2(mSum);
}
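For reference, here is a plain scalar sum (my addition, not from the original post) that the SSE2 version can be checked against; the results should agree up to the small differences caused by reordering the floating-point additions:
float SumScalar(const float * const pA, const size_t uiA)
{
    float fSum = 0.0f;
    for (size_t i = 0; i < uiA; i++)
    {
        fSum += pA[i];  // straightforward left-to-right accumulation
    }
    return fSum;
}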
Calculating the number of bytes that have to be processed before reaching a 16-byte boundary is done with numBytesToAligned = (0xf & -ulpA); which feels like a bit of magic. It works because -ulpA is congruent to 16 minus (ulpA mod 16) modulo 16, so its low four bits are exactly the distance to the next 16-byte boundary, and they are zero when the pointer is already aligned. I found this by looking at the assembly generated from a more conventional implementation.
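To make the trick concrete with a hypothetical address: if ulpA is 0x1004, then -ulpA ends in ...EFFC, so 0xf & -ulpA is 0xC, i.e. 12 bytes or 3 floats; summing those 3 floats unaligned leaves pA + 3 sitting on the 16-byte boundary at 0x1010. If ulpA were already a multiple of 16, the low four bits of -ulpA would be zero and no prefix work would be done.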