The previous sorting network did perform the sort in the ideal number of min/max operations. But there was a lot of shuffling. But this did strict ordering of the input wires. If instead we allow the input wires to be reordered to make the shuffles more efficient, we can do a little better.
The below is a bitonic sorting network performed in this manner. We shuffle to get the correct comparisons without regard to where in the registers the conceptual input wires are located. This verifies as correct with the standard 0/1 method.
The below network is done with floats, since all of the necessary intrisics are available in SSE2.
void BitonicSortingNetwork_SSE2(const __m128 &a, const __m128 &b, __m128 &aOut, __m128 &bOut)
{
__m128 a1 = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); // 1357
__m128 b1 = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); // 2468
__m128 a1Min = _mm_min_ps(a1, b1);
__m128 b1Max = _mm_max_ps(a1, b1);
__m128 a2 = _mm_shuffle_ps(a1Min, b1Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1526
__m128 b2 = _mm_shuffle_ps(b1Max, a1Min, _MM_SHUFFLE(3, 1, 3, 1)); // 4837
__m128 a2Min = _mm_min_ps(a2, b2);
__m128 b2Max = _mm_max_ps(a2, b2);
__m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 2, 1, 0)); // 1537
__m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(1, 0, 3, 2)); // 2648
__m128 a3Min = _mm_min_ps(a3, b3);
__m128 b3Max = _mm_max_ps(a3, b3);
__m128 a4 = _mm_shuffle_ps(a3Min, b3Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
__m128 b4 = _mm_shuffle_ps(b3Max, a3Min, _MM_SHUFFLE(1, 3, 1, 3)); // 8675
__m128 a4Min = _mm_min_ps(a4, b4);
__m128 b4Max = _mm_max_ps(a4, b4);
__m128 a5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(1, 3, 2, 0)); // 1256
__m128 b5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(0, 2, 3, 1)); // 3478
__m128 a5Min = _mm_min_ps(a5, b5);
__m128 b5Max = _mm_max_ps(a5, b5);
__m128 a6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1537
__m128 b6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(3, 1, 3, 1)); // 2648
__m128 a6Min = _mm_min_ps(a6, b6);
__m128 b6Max = _mm_max_ps(a6, b6);
__m128 a7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
__m128 b7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(3, 1, 3, 1)); // 5768
aOut = _mm_shuffle_ps(a7, a7, _MM_SHUFFLE(3, 1, 2, 0)); // 1234
bOut = _mm_shuffle_ps(b7, b7, _MM_SHUFFLE(3, 1, 2, 0)); // 5678
}
The comments reflect where in the registers the conceptual input wires are located.
Since this is in fact a sorting network, the order of the inputs isn't important, and the first pair of shuffles is unnecessary. I've tried to figure out a way of getting rid of the final double shuffle, but I haven't been able to get rid of it.
Here's the assembly of the compilation without the initial shuffles:
__m128 a1 = a;
000000013F1B1140 movaps xmm2,xmmword ptr [rcx]
__m128 b1 = b;
__m128 a1Min = _mm_min_ps(a1, b1);
000000013F1B1143 movaps xmm1,xmm2
__m128 b1Max = _mm_max_ps(a1, b1);
000000013F1B1146 maxps xmm2,xmmword ptr [rdx]
000000013F1B1149 minps xmm1,xmmword ptr [rdx]
__m128 a2 = _mm_shuffle_ps(a1Min, b1Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1526
000000013F1B114C movaps xmm0,xmm1
000000013F1B114F shufps xmm0,xmm2,88h
__m128 b2 = _mm_shuffle_ps(b1Max, a1Min, _MM_SHUFFLE(3, 1, 3, 1)); // 4837
000000013F1B1153 shufps xmm2,xmm1,0DDh
__m128 a2Min = _mm_min_ps(a2, b2);
000000013F1B1157 movaps xmm1,xmm0
__m128 b2Max = _mm_max_ps(a2, b2);
000000013F1B115A maxps xmm0,xmm2
000000013F1B115D minps xmm1,xmm2
__m128 a3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(3, 2, 1, 0)); // 1537
000000013F1B1160 movaps xmm2,xmm1
__m128 b3 = _mm_shuffle_ps(a2Min, b2Max, _MM_SHUFFLE(1, 0, 3, 2)); // 2648
000000013F1B1163 shufps xmm1,xmm0,4Eh
000000013F1B1167 shufps xmm2,xmm0,0E4h
__m128 a3Min = _mm_min_ps(a3, b3);
000000013F1B116B movaps xmm0,xmm2
__m128 b3Max = _mm_max_ps(a3, b3);
000000013F1B116E maxps xmm2,xmm1
000000013F1B1171 minps xmm0,xmm1
__m128 a4 = _mm_shuffle_ps(a3Min, b3Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
000000013F1B1174 movaps xmm1,xmm0
000000013F1B1177 shufps xmm1,xmm2,88h
__m128 b4 = _mm_shuffle_ps(b3Max, a3Min, _MM_SHUFFLE(1, 3, 1, 3)); // 8675
000000013F1B117B shufps xmm2,xmm0,77h
__m128 a4Min = _mm_min_ps(a4, b4);
000000013F1B117F movaps xmm0,xmm1
000000013F1B1182 minps xmm0,xmm2
__m128 b4Max = _mm_max_ps(a4, b4);
000000013F1B1185 maxps xmm1,xmm2
__m128 a5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(1, 3, 2, 0)); // 1256
000000013F1B1188 movaps xmm2,xmm0
000000013F1B118B shufps xmm2,xmm1,78h
__m128 b5 = _mm_shuffle_ps(a4Min, b4Max, _MM_SHUFFLE(0, 2, 3, 1)); // 3478
000000013F1B118F shufps xmm0,xmm1,2Dh
__m128 a5Min = _mm_min_ps(a5, b5);
000000013F1B1193 movaps xmm1,xmm2
__m128 b5Max = _mm_max_ps(a5, b5);
000000013F1B1196 maxps xmm2,xmm0
000000013F1B1199 minps xmm1,xmm0
__m128 a6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1537
000000013F1B119C movaps xmm3,xmm1
__m128 b6 = _mm_shuffle_ps(a5Min, b5Max, _MM_SHUFFLE(3, 1, 3, 1)); // 2648
000000013F1B119F shufps xmm1,xmm2,0DDh
000000013F1B11A3 shufps xmm3,xmm2,88h
__m128 a6Min = _mm_min_ps(a6, b6);
000000013F1B11A7 movaps xmm2,xmm3
000000013F1B11AA minps xmm2,xmm1
__m128 b6Max = _mm_max_ps(a6, b6);
000000013F1B11AD maxps xmm3,xmm1
__m128 a7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(2, 0, 2, 0)); // 1324
000000013F1B11B0 movaps xmm0,xmm2
000000013F1B11B3 shufps xmm0,xmm3,88h
__m128 b7 = _mm_shuffle_ps(a6Min, b6Max, _MM_SHUFFLE(3, 1, 3, 1)); // 5768
000000013F1B11B7 shufps xmm2,xmm3,0DDh
aOut = _mm_shuffle_ps(a7, a7, _MM_SHUFFLE(3, 1, 2, 0)); // 1234
000000013F1B11BB shufps xmm0,xmm0,0D8h
bOut = _mm_shuffle_ps(b7, b7, _MM_SHUFFLE(3, 1, 2, 0)); // 5678
000000013F1B11BF shufps xmm2,xmm2,0D8h
000000013F1B11C3 movaps xmmword ptr [r8],xmm0
000000013F1B11C7 movaps xmmword ptr [r9],xmm2
}
000000013F1B11CB ret
This fits snugly into 4 registers. It looks pretty good.
No comments:
Post a Comment