Wednesday, March 15, 2017

A bitonic sorting network for 8 floats in SSE2

The previous sorting network did perform the sort in the ideal number of min/max operations.  But there was a lot of shuffling.  But this did strict ordering of the input wires.  If instead we allow the input wires to be reordered to make the shuffles more efficient, we can do a little better.

https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:Batcher_Bitonic_Mergesort_for_eight_inputs.svg


The below is a bitonic sorting network performed in this manner.  We shuffle to get the correct comparisons without regard to where in the registers the conceptual input wires are located.  This verifies as correct with the standard 0/1 method.

The below network is done with floats, since all of the necessary intrisics are available in SSE2. 

void BitonicSortingNetwork_SSE2(const __m128 &a, const __m128 &b, __m128 &aOut, __m128 &bOut)
{
    __m128 a1 = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // 1357
    __m128 b1 = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); // 2468
    __m128 a1Min = _mm_min_ps(a1, b1);
    __m128 b1Max = _mm_max_ps(a1, b1);
    __m128 a2 = _mm_shuffle_ps(a1Min, b1Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1526
    __m128 b2 = _mm_shuffle_ps(b1Max, a1Min, _MM_SHUFFLE(3, 1, 3, 1)); // 4837
    __m128 a2Min = _mm_min_ps(a2, b2);
    __m128 b2Max = _mm_max_ps(a2, b2);
    __m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 2, 1, 0)); // 1537
    __m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(1, 0, 3, 2)); // 2648
    __m128 a3Min = _mm_min_ps(a3, b3);
    __m128 b3Max = _mm_max_ps(a3, b3);
    __m128 a4 = _mm_shuffle_ps(a3Min, b3Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
    __m128 b4 = _mm_shuffle_ps(b3Max, a3Min, _MM_SHUFFLE(1, 3, 1, 3)); // 8675
    __m128 a4Min = _mm_min_ps(a4, b4);
    __m128 b4Max = _mm_max_ps(a4, b4);
    __m128 a5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(1, 3, 2, 0)); // 1256
    __m128 b5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(0, 2, 3, 1)); // 3478
    __m128 a5Min = _mm_min_ps(a5, b5);
    __m128 b5Max = _mm_max_ps(a5, b5);
    __m128 a6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1537
    __m128 b6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(3, 1, 3, 1)); // 2648
    __m128 a6Min = _mm_min_ps(a6, b6);
    __m128 b6Max = _mm_max_ps(a6, b6);
    __m128 a7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
    __m128 b7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(3, 1, 3, 1)); // 5768

    aOut = _mm_shuffle_ps(a7, a7, _MM_SHUFFLE(3, 1, 2, 0)); // 1234
    bOut = _mm_shuffle_ps(b7, b7, _MM_SHUFFLE(3, 1, 2, 0)); // 5678
}


The comments reflect where in the registers the conceptual input wires are located. 

Since this is in fact a sorting network, the order of the inputs isn't important, and the first pair of shuffles is unnecessary.  I've tried to figure out a way of getting rid of the final double shuffle, but I haven't been able to get rid of it.

Here's the assembly of the compilation without the initial shuffles:

    __m128 a1 = a;
000000013F1B1140  movaps      xmm2,xmmword ptr [rcx] 
    __m128 b1 = b;
    __m128 a1Min = _mm_min_ps(a1, b1);
000000013F1B1143  movaps      xmm1,xmm2 
    __m128 b1Max = _mm_max_ps(a1, b1);
000000013F1B1146  maxps       xmm2,xmmword ptr [rdx] 
000000013F1B1149  minps       xmm1,xmmword ptr [rdx] 
    __m128 a2 = _mm_shuffle_ps(a1Min, b1Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1526
000000013F1B114C  movaps      xmm0,xmm1 
000000013F1B114F  shufps      xmm0,xmm2,88h 
    __m128 b2 = _mm_shuffle_ps(b1Max, a1Min, _MM_SHUFFLE(3, 1, 3, 1)); // 4837
000000013F1B1153  shufps      xmm2,xmm1,0DDh 
    __m128 a2Min = _mm_min_ps(a2, b2);
000000013F1B1157  movaps      xmm1,xmm0 
    __m128 b2Max = _mm_max_ps(a2, b2);
000000013F1B115A  maxps       xmm0,xmm2 
000000013F1B115D  minps       xmm1,xmm2 
    __m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 2, 1, 0)); // 1537
000000013F1B1160  movaps      xmm2,xmm1 
    __m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(1, 0, 3, 2)); // 2648
000000013F1B1163  shufps      xmm1,xmm0,4Eh 
000000013F1B1167  shufps      xmm2,xmm0,0E4h 
    __m128 a3Min = _mm_min_ps(a3, b3);
000000013F1B116B  movaps      xmm0,xmm2 
    __m128 b3Max = _mm_max_ps(a3, b3);
000000013F1B116E  maxps       xmm2,xmm1 
000000013F1B1171  minps       xmm0,xmm1 
    __m128 a4 = _mm_shuffle_ps(a3Min, b3Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
000000013F1B1174  movaps      xmm1,xmm0 
000000013F1B1177  shufps      xmm1,xmm2,88h 
    __m128 b4 = _mm_shuffle_ps(b3Max, a3Min, _MM_SHUFFLE(1, 3, 1, 3)); // 8675
000000013F1B117B  shufps      xmm2,xmm0,77h 
    __m128 a4Min = _mm_min_ps(a4, b4);
000000013F1B117F  movaps      xmm0,xmm1 
000000013F1B1182  minps       xmm0,xmm2 
    __m128 b4Max = _mm_max_ps(a4, b4);
000000013F1B1185  maxps       xmm1,xmm2 
    __m128 a5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(1, 3, 2, 0)); // 1256
000000013F1B1188  movaps      xmm2,xmm0 
000000013F1B118B  shufps      xmm2,xmm1,78h 
    __m128 b5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(0, 2, 3, 1)); // 3478
000000013F1B118F  shufps      xmm0,xmm1,2Dh 
    __m128 a5Min = _mm_min_ps(a5, b5);
000000013F1B1193  movaps      xmm1,xmm2 
    __m128 b5Max = _mm_max_ps(a5, b5);
000000013F1B1196  maxps       xmm2,xmm0 
000000013F1B1199  minps       xmm1,xmm0 
    __m128 a6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1537
000000013F1B119C  movaps      xmm3,xmm1 
    __m128 b6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(3, 1, 3, 1)); // 2648
000000013F1B119F  shufps      xmm1,xmm2,0DDh 
000000013F1B11A3  shufps      xmm3,xmm2,88h 
    __m128 a6Min = _mm_min_ps(a6, b6);
000000013F1B11A7  movaps      xmm2,xmm3 
000000013F1B11AA  minps       xmm2,xmm1 
    __m128 b6Max = _mm_max_ps(a6, b6);
000000013F1B11AD  maxps       xmm3,xmm1 
    __m128 a7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
000000013F1B11B0  movaps      xmm0,xmm2 
000000013F1B11B3  shufps      xmm0,xmm3,88h 
    __m128 b7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(3, 1, 3, 1)); // 5768
000000013F1B11B7  shufps      xmm2,xmm3,0DDh 

    aOut = _mm_shuffle_ps(a7, a7, _MM_SHUFFLE(3, 1, 2, 0)); // 1234
000000013F1B11BB  shufps      xmm0,xmm0,0D8h 
    bOut = _mm_shuffle_ps(b7, b7, _MM_SHUFFLE(3, 1, 2, 0)); // 5678
000000013F1B11BF  shufps      xmm2,xmm2,0D8h 
000000013F1B11C3  movaps      xmmword ptr [r8],xmm0 
000000013F1B11C7  movaps      xmmword ptr [r9],xmm2 
}
000000013F1B11CB  ret 


This fits snugly into 4 registers.  It looks pretty good.

No comments:

Post a Comment