MicroPerf: Transposing a 16x16 bit matrix using SSE2 and AVX2.

Building on the 16x8 matrix transpose from a previous post, it's pretty easy to make a 16x16 matrix transpose.

// Transposes a 16x16 bit matrix supplied as two 128-bit halves.
//
// Each input half is run through BitMatrixTranspose_16x8_SSE (defined in the
// earlier 16x8 post, not shown here), and the two results are then
// byte-interleaved.
//
//   aIn, bIn - the two 128-bit halves of the input matrix.
//   cOut     - receives the interleave of the low 8 bytes of both results.
//   dOut     - receives the interleave of the high 8 bytes of both results.
//
// NOTE(review): which rows/columns each half represents depends on the layout
// contract of BitMatrixTranspose_16x8_SSE - confirm against that helper.
void BitMatrixTranspose_16x16_SSE(const __m128i &aIn, const __m128i &bIn, __m128i & cOut, __m128i &dOut)
{
    // Both partial transposes are computed before either output is written,
    // so the outputs may safely alias the inputs.
    const __m128i transposedA = BitMatrixTranspose_16x8_SSE(aIn);
    const __m128i transposedB = BitMatrixTranspose_16x8_SSE(bIn);

    cOut = _mm_unpacklo_epi8(transposedA, transposedB);
    dOut = _mm_unpackhi_epi8(transposedA, transposedB);
}

Most new machines support AVX and AVX2 these days, so the whole 16x16 bit matrix can be held in a single 256-bit register and transposed there.

// Byte-interleaves the lower and upper 64-bit halves of each 128-bit lane.
//
// For a lane holding bytes b0..b15 the result lane is
// b0,b8,b1,b9,b2,b10,...,b7,b15. The two lanes are processed independently.
__m256i unpack8_AVX2(const __m256i&xIn)
{
    // Replicate each lane's upper 64 bits into its lower 64 bits so that
    // unpacklo can pair byte k with byte k+8.
    const __m256i upperHalves = _mm256_shuffle_epi32(xIn, _MM_SHUFFLE(3, 2, 3, 2));
    return _mm256_unpacklo_epi8(xIn, upperHalves);
}

__m256i repack8_AVX2(const __m256i&xIn)
{
return unpack8_SSE(unpack8_SSE(unpack8_SSE(xIn)));
}

// Transposes four independent 8x8 bit matrices, one per 64-bit element.
//
// Within an element, bit (8*r + c) is treated as row r, column c; the result
// has that bit moved to position (8*c + r). The transpose is done in three
// swap stages that exchange off-diagonal sub-blocks of size 1x1, 2x2 and 4x4:
// the "keep" mask selects bits that stay in place, the "swap" mask selects the
// bits that move, and they move by 7, 14 and 28 bit positions respectively.
__m256i BitMatrixTranspose_8x8_AVX2(const __m256i &xIn)
{
    // Stage 1: swap single bits inside every 2x2 block (distance 7).
    const __m256i keep1 = _mm256_set1_epi64x(0xAA55AA55AA55AA55LL);
    const __m256i swap1 = _mm256_set1_epi64x(0x00AA00AA00AA00AALL);
    const __m256i up1 = _mm256_slli_epi64(_mm256_and_si256(xIn, swap1), 7);
    const __m256i down1 = _mm256_and_si256(_mm256_srli_epi64(xIn, 7), swap1);
    const __m256i stage1 = _mm256_or_si256(_mm256_and_si256(xIn, keep1), _mm256_or_si256(up1, down1));

    // Stage 2: swap 2x2 blocks inside every 4x4 block (distance 14).
    const __m256i keep2 = _mm256_set1_epi64x(0xCCCC3333CCCC3333LL);
    const __m256i swap2 = _mm256_set1_epi64x(0x0000CCCC0000CCCCLL);
    const __m256i up2 = _mm256_slli_epi64(_mm256_and_si256(stage1, swap2), 14);
    const __m256i down2 = _mm256_and_si256(_mm256_srli_epi64(stage1, 14), swap2);
    const __m256i stage2 = _mm256_or_si256(_mm256_and_si256(stage1, keep2), _mm256_or_si256(up2, down2));

    // Stage 3: swap the two off-diagonal 4x4 blocks (distance 28).
    const __m256i keep3 = _mm256_set1_epi64x(0xF0F0F0F00F0F0F0FLL);
    const __m256i swap3 = _mm256_set1_epi64x(0x00000000F0F0F0F0LL);
    const __m256i up3 = _mm256_slli_epi64(_mm256_and_si256(stage2, swap3), 28);
    const __m256i down3 = _mm256_and_si256(_mm256_srli_epi64(stage2, 28), swap3);

    return _mm256_or_si256(_mm256_and_si256(stage2, keep3), _mm256_or_si256(up3, down3));
}

// Transposes a 16x16 bit matrix held in a single 256-bit register.
//
// Pipeline:
//   1. repack8_AVX2 regroups the bytes of each 128-bit lane so that each
//      64-bit element holds one 8x8 sub-matrix.
//   2. BitMatrixTranspose_8x8_AVX2 transposes each 64-bit element.
//   3. unpack8_across_lanes_AVX2 recombines the four transposed blocks.
//
// NOTE(review): unpack8_across_lanes_AVX2 is not defined in this post;
// presumably it byte-interleaves 64-bit elements taken from different lanes -
// confirm against its definition.
__m256i BitMatrixTranspose_16x16_AVX2(const __m256i &xIn)
{
    const __m256i blocks = repack8_AVX2(xIn);
    const __m256i transposedBlocks = BitMatrixTranspose_8x8_AVX2(blocks);
    return unpack8_across_lanes_AVX2(transposedBlocks);
}

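This follows the same shuffle pattern used in the SSE2 version. However, the repack step can be simplified to a single instruction using the arbitrary byte shuffle (_mm256_shuffle_epi8), which shuffles bytes independently within each 128-bit lane: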

// De-interleaves the bytes of each 128-bit lane with a single byte shuffle:
// the even-indexed bytes of the lane go to its low 8 bytes and the
// odd-indexed bytes go to its high 8 bytes.
__m256i repack8_AVX2(const __m256i&xIn)
{
    // Source byte index for every destination byte, lowest byte first.
    // _mm256_shuffle_epi8 indexes within each 128-bit lane, so both lanes use
    // the same 0..15 pattern.
    const __m256i evenThenOdd = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    return _mm256_shuffle_epi8(xIn, evenThenOdd);
}

I would normally declare the above 256-bit constant static, but it appears that MSVC doesn't handle static __m256i variables as well as it does static __m128i variables.

MicroPerf

Sunday, October 29, 2017

Transposing a 16x16 bit matrix using SSE2 and AVX2.

No comments:

Post a Comment