Sunday, March 19, 2017

shaving a few shuffles from the float sorting network in SSE2

In an earlier post I commented that I felt that it should be possible to get rid of a couple shuffles in the presented network. 

https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:Batcher_Bitonic_Mergesort_for_eight_inputs.svg

Below is a network implementation of the same bitonic network above.  But this new implementation seems to have a handful fewer shuffles.

void BitonicSortingNetwork_FewerShuffles_SSE2(const __m128 &a, const __m128 &b, __m128 &aOut, __m128 &bOut)
{
    __m128 a1 = a;
    __m128 b1 = b;
    __m128 a1Min = _mm_min_ps(a1, b1); // 1357
    __m128 b1Max = _mm_max_ps(a1, b1); // 2468
    __m128 a2 = a1Min; // 1357
    __m128 b2 = _mm_shuffle_ps(b1Max, b1Max, _MM_SHUFFLE(2, 3, 0, 1)); // 4286
    __m128 a2Min = _mm_min_ps(a2, b2); // 1256
    __m128 b2Max = _mm_max_ps(a2, b2); // 4387
    __m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 1, 2, 0)); // 1537
    __m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(2, 0, 3,1)); // 2648
    __m128 a3Min = _mm_min_ps(a3, b3); // 1537
    __m128 b3Max = _mm_max_ps(a3, b3); // 2648
    __m128 a4 = a3Min; // 1537
    __m128 b4 = _mm_shuffle_ps(b3Max, b3Max, _MM_SHUFFLE(0, 1, 2, 3)); // 8462
    __m128 a4Min = _mm_min_ps(a4, b4); // 1432
    __m128 b4Max = _mm_max_ps(a4, b4); // 8567
    __m128 a5 = _mm_unpacklo_ps(a4Min, b4Max); // 1845
    __m128 b5 = _mm_unpackhi_ps(a4Min, b4Max); // 3627
    __m128 a5Min = _mm_min_ps(a5, b5); // 1625
    __m128 b5Max = _mm_max_ps(a5, b5); // 3847
    __m128 a6 = _mm_unpacklo_ps(a5Min, b5Max); // 1368
    __m128 b6 = _mm_unpackhi_ps(a5Min, b5Max); // 2457
    __m128 a6Min = _mm_min_ps(a6, b6); // 1357
    __m128 b6Max = _mm_max_ps(a6, b6); // 2468

    aOut = _mm_unpacklo_ps(a6Min, b6Max); // 1234
    bOut = _mm_unpackhi_ps(a6Min, b6Max); // 5678
}


In particular it arranges to have only the unpacks before returning the results.  By re-using an intermediate, two shuffles are eliminated between min/max.  While this only shaves a cycle or two from the runtime, it is more aesthetically pleasing.

The compiled assembly code looks good.

    __m128 a1 = a;
00007FF751C51070  movups      xmm3,xmmword ptr [rcx]  
    __m128 b1 = b;
    __m128 a1Min = _mm_min_ps(a1, b1); // 1357
    __m128 b1Max = _mm_max_ps(a1, b1); // 2468
00007FF751C51073  movaps      xmm1,xmm3  
00007FF751C51076  maxps       xmm1,xmmword ptr [rdx]  
00007FF751C51079  minps       xmm3,xmmword ptr [rdx]  
    __m128 a2 = a1Min; // 1357
    __m128 b2 = _mm_shuffle_ps(b1Max, b1Max, _MM_SHUFFLE(2, 3, 0, 1)); // 4286
00007FF751C5107C  shufps      xmm1,xmm1,0B1h  
    __m128 a2Min = _mm_min_ps(a2, b2); // 1256
00007FF751C51080  movaps      xmm2,xmm3  
00007FF751C51083  minps       xmm2,xmm1  
    __m128 b2Max = _mm_max_ps(a2, b2); // 4387
00007FF751C51086  maxps       xmm3,xmm1  
    __m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 1, 2, 0)); // 1537
00007FF751C51089  movaps      xmm4,xmm2  
    __m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(2, 0, 3, 1)); // 2648
00007FF751C5108C  shufps      xmm2,xmm3,8Dh  
    __m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(2, 0, 3, 1)); // 2648
00007FF751C51090  shufps      xmm4,xmm3,0D8h  
    __m128 a3Min = _mm_min_ps(a3, b3); // 1537
    __m128 b3Max = _mm_max_ps(a3, b3); // 2648
00007FF751C51094  movaps      xmm0,xmm4  
00007FF751C51097  maxps       xmm0,xmm2  
00007FF751C5109A  minps       xmm4,xmm2  
    __m128 a4 = a3Min; // 1537
    __m128 b4 = _mm_shuffle_ps(b3Max, b3Max, _MM_SHUFFLE(0, 1, 2, 3)); // 8462
00007FF751C5109D  shufps      xmm0,xmm0,1Bh  
    __m128 a4Min = _mm_min_ps(a4, b4); // 1432
00007FF751C510A1  movaps      xmm1,xmm4  
00007FF751C510A4  minps       xmm1,xmm0  
    __m128 b4Max = _mm_max_ps(a4, b4); // 8567
00007FF751C510A7  maxps       xmm4,xmm0  
    __m128 a5 = _mm_unpacklo_ps(a4Min, b4Max); // 1845
00007FF751C510AA  movaps      xmm0,xmm1  
    __m128 b5 = _mm_unpackhi_ps(a4Min, b4Max); // 3627
00007FF751C510AD  unpckhps    xmm1,xmm4  
00007FF751C510B0  unpcklps    xmm0,xmm4  
    __m128 a5Min = _mm_min_ps(a5, b5); // 1625
00007FF751C510B3  movaps      xmm2,xmm0  
    __m128 b5Max = _mm_max_ps(a5, b5); // 3847
00007FF751C510B6  maxps       xmm0,xmm1  
00007FF751C510B9  minps       xmm2,xmm1  
    __m128 a6 = _mm_unpacklo_ps(a5Min, b5Max); // 1368
00007FF751C510BC  movaps      xmm3,xmm2  
    __m128 b6 = _mm_unpackhi_ps(a5Min, b5Max); // 2457
00007FF751C510BF  unpckhps    xmm2,xmm0  
00007FF751C510C2  unpcklps    xmm3,xmm0  
    __m128 a6Min = _mm_min_ps(a6, b6); // 1357
00007FF751C510C5  movaps      xmm1,xmm3  
00007FF751C510C8  minps       xmm1,xmm2  
    __m128 b6Max = _mm_max_ps(a6, b6); // 2468
00007FF751C510CB  maxps       xmm3,xmm2  
    aOut = _mm_unpacklo_ps(a6Min, b6Max); // 1234
00007FF751C510CE  movaps      xmm0,xmm1  
00007FF751C510D1  unpcklps    xmm0,xmm3  
    bOut = _mm_unpackhi_ps(a6Min, b6Max); // 5678
00007FF751C510D4  unpckhps    xmm1,xmm3  
00007FF751C510D7  movups      xmmword ptr [r8],xmm0  
00007FF751C510DB  movups      xmmword ptr [r9],xmm1  
}
00007FF751C510DF  ret  

Update: I see that the final three stages are the same as what is described in a paper that I read but hadn't remembered.  I find a different paper in a quick search.  I think that the first three stages are better than what are described in that paper.  But incorporated into a much larger mergesort gives a different set of problems.  Also, the papers don't give source code...

No comments:

Post a Comment