MicroPerf: shaving a few shuffles from the float sorting network in SSE2

In an earlier post I commented that I felt that it should be possible to get rid of a couple shuffles in the presented network.

Below is a network implementation of the same bitonic network above. But this new implementation seems to have a handful fewer shuffles.

void BitonicSortingNetwork_FewerShuffles_SSE2(const __m128 &a, const __m128 &b, __m128 &aOut, __m128 &bOut)
{
    __m128 a1 = a;
    __m128 b1 = b;
    __m128 a1Min = _mm_min_ps(a1, b1); // 1357
    __m128 b1Max = _mm_max_ps(a1, b1); // 2468
    __m128 a2 = a1Min; // 1357
    __m128 b2 = _mm_shuffle_ps(b1Max, b1Max, _MM_SHUFFLE(2, 3, 0, 1)); // 4286
    __m128 a2Min = _mm_min_ps(a2, b2); // 1256
    __m128 b2Max = _mm_max_ps(a2, b2); // 4387
    __m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 1, 2, 0)); // 1537
    __m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(2, 0, 3,1)); // 2648
    __m128 a3Min = _mm_min_ps(a3, b3); // 1537
    __m128 b3Max = _mm_max_ps(a3, b3); // 2648
    __m128 a4 = a3Min; // 1537
    __m128 b4 = _mm_shuffle_ps(b3Max, b3Max, _MM_SHUFFLE(0, 1, 2, 3)); // 8462
    __m128 a4Min = _mm_min_ps(a4, b4); // 1432
    __m128 b4Max = _mm_max_ps(a4, b4); // 8567
    __m128 a5 = _mm_unpacklo_ps(a4Min, b4Max); // 1845
    __m128 b5 = _mm_unpackhi_ps(a4Min, b4Max); // 3627
    __m128 a5Min = _mm_min_ps(a5, b5); // 1625
    __m128 b5Max = _mm_max_ps(a5, b5); // 3847
    __m128 a6 = _mm_unpacklo_ps(a5Min, b5Max); // 1368
    __m128 b6 = _mm_unpackhi_ps(a5Min, b5Max); // 2457
    __m128 a6Min = _mm_min_ps(a6, b6); // 1357
    __m128 b6Max = _mm_max_ps(a6, b6); // 2468
    aOut = _mm_unpacklo_ps(a6Min, b6Max); // 1234
    bOut = _mm_unpackhi_ps(a6Min, b6Max); // 5678
}

In particular it arranges to have only the unpacks before returning the results.  By re-using an intermediate, two shuffles are eliminated between min/max. While this only shaves a cycle or two from the runtime, it is more aesthetically pleasing.

The compiled assembly code looks good.

__m128 a1 = a;
00007FF751C51070 movups xmm3,xmmword ptr [rcx]
__m128 b1 = b;
__m128 a1Min = _mm_min_ps(a1, b1); // 1357
__m128 b1Max = _mm_max_ps(a1, b1); // 2468
00007FF751C51073 movaps xmm1,xmm3
00007FF751C51076 maxps xmm1,xmmword ptr [rdx]
00007FF751C51079 minps xmm3,xmmword ptr [rdx]
__m128 a2 = a1Min; // 1357
__m128 b2 = _mm_shuffle_ps(b1Max, b1Max, _MM_SHUFFLE(2, 3, 0, 1)); // 4286
00007FF751C5107C shufps xmm1,xmm1,0B1h
__m128 a2Min = _mm_min_ps(a2, b2); // 1256
00007FF751C51080 movaps xmm2,xmm3
00007FF751C51083 minps xmm2,xmm1
__m128 b2Max = _mm_max_ps(a2, b2); // 4387
00007FF751C51086 maxps xmm3,xmm1
__m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 1, 2, 0)); // 1537
00007FF751C51089 movaps xmm4,xmm2
__m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(2, 0, 3, 1)); // 2648
00007FF751C5108C shufps xmm2,xmm3,8Dh
__m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(2, 0, 3, 1)); // 2648
00007FF751C51090 shufps xmm4,xmm3,0D8h
__m128 a3Min = _mm_min_ps(a3, b3); // 1537
__m128 b3Max = _mm_max_ps(a3, b3); // 2648
00007FF751C51094 movaps xmm0,xmm4
00007FF751C51097 maxps xmm0,xmm2
00007FF751C5109A minps xmm4,xmm2
__m128 a4 = a3Min; // 1537
__m128 b4 = _mm_shuffle_ps(b3Max, b3Max, _MM_SHUFFLE(0, 1, 2, 3)); // 8462
00007FF751C5109D shufps xmm0,xmm0,1Bh
__m128 a4Min = _mm_min_ps(a4, b4); // 1432
00007FF751C510A1 movaps xmm1,xmm4
00007FF751C510A4 minps xmm1,xmm0
__m128 b4Max = _mm_max_ps(a4, b4); // 8567
00007FF751C510A7 maxps xmm4,xmm0
__m128 a5 = _mm_unpacklo_ps(a4Min, b4Max); // 1845
00007FF751C510AA movaps xmm0,xmm1
__m128 b5 = _mm_unpackhi_ps(a4Min, b4Max); // 3627
00007FF751C510AD unpckhps xmm1,xmm4
00007FF751C510B0 unpcklps xmm0,xmm4
__m128 a5Min = _mm_min_ps(a5, b5); // 1625
00007FF751C510B3 movaps xmm2,xmm0
__m128 b5Max = _mm_max_ps(a5, b5); // 3847
00007FF751C510B6 maxps xmm0,xmm1
00007FF751C510B9 minps xmm2,xmm1
__m128 a6 = _mm_unpacklo_ps(a5Min, b5Max); // 1368
00007FF751C510BC movaps xmm3,xmm2
__m128 b6 = _mm_unpackhi_ps(a5Min, b5Max); // 2457
00007FF751C510BF unpckhps xmm2,xmm0
00007FF751C510C2 unpcklps xmm3,xmm0
__m128 a6Min = _mm_min_ps(a6, b6); // 1357
00007FF751C510C5 movaps xmm1,xmm3
00007FF751C510C8 minps xmm1,xmm2
__m128 b6Max = _mm_max_ps(a6, b6); // 2468
00007FF751C510CB maxps xmm3,xmm2
aOut = _mm_unpacklo_ps(a6Min, b6Max); // 1234
00007FF751C510CE movaps xmm0,xmm1
00007FF751C510D1 unpcklps xmm0,xmm3
bOut = _mm_unpackhi_ps(a6Min, b6Max); // 5678
00007FF751C510D4 unpckhps xmm1,xmm3
00007FF751C510D7 movups xmmword ptr [r8],xmm0
00007FF751C510DB movups xmmword ptr [r9],xmm1
}
00007FF751C510DF ret

Update: I see that the final three stages are the same as what is described in a paper that I read but hadn't remembered. I find a different paper in a quick search. I think that the first three stages are better than what are described in that paper. But incorporated into a much larger mergesort gives a different set of problems. Also, the papers don't give source code...

MicroPerf

Sunday, March 19, 2017

shaving a few shuffles from the float sorting network in SSE2

No comments:

Post a Comment