While comparing two rectangles for equality is a nice place to start, it doesn't really provide much benefit over a standard memcmp.
But let's look at a problem that can benefit from the parallel nature of SSE2.
IsRectEmpty is a simple but useful function.
In standard Ansi code it looks something like:
BOOL IsRectEmpty_Ansi(const RECT *pThis)
{
return (pThis->top >= pThis->bottom) || (pThis->left >= pThis->right);
}
But we can do a little better with SSE2:
__forceinline __m128i m128i_from_RECT_SSE2(__in_ecount(1) const RECT *prc)
{
return _mm_loadu_si128((__m128i*)prc);
}
BOOL IsRectEmpty_SSE2(const RECT *pThis)
{
__m128i rthis = m128i_from_RECT_SSE2(pThis);
// rthis=[pThis->left,pThis->top,pThis->right,pThis->bottom]
__m128i mTltl = _mm_shuffle_epi32(rthis, _MM_SHUFFLE(1,0,1,0));
// mTltl =[pThis->left,pThis->top,pThis->left,pThis->top]
__m128i mCompare = _mm_cmpgt_epi32(rthis, mTltl);
// mCompare=[0,0,(pThis->right>pThis->left),(pThis->bottom>pThis->top)]
int iCompare = _mm_movemask_epi32(mCompare);
// iCompare = 4*(pThis->right>pThis->left)+8*(pThis->bottom>pThis->top)
return 12 ^ iCompare;
}
This has a single large read from memory. It is branchless.
This same technique can be used for another useful rectangle function, PtInRect
__forceinline __m128i m128i_twice_from_point_SSE2(const POINT *ppt)
{
return _mm_castpd_si128(_mm_load1_pd((double*)ppt));
// return [ppt->x,ppt->y,ppt->x,ppt->y]
}
BOOL PtInRect_SSE2(const RECT *pThis, const POINT *ppt)
{
__m128i rthis = m128i_from_RECT_SSE2(pThis);
// rrc=[this->left,this->top,this->right,this->bottom]
__m128i ptpt = m128i_twice_from_point_SSE2(ppt);
// ptpt=[ppt->x,ppt->y,ppt->x,ppt->y]
__m128i allComparisons = _mm_cmplt_epi32(ptpt, rthis);
// looking for b0011
bool bRet = (12 == _mm_movemask_ps(_mm_castsi128_ps(allComparisons)));
// the _mm_movemask_ps takes the top bit (sign) of each of the four 32bit numbers
// (doesn't matter that the intrinsic is designed for floats)
// this returns a 4 bit number where the four bits will be:
// !(ppt->x >= pThis->left)
// !(ppt->y >= pThis->top)
// ppt->x < pThis->right
// ppt->y < pThis->bottom
// So we want the bottom two bits to be zero and the top two bits to be one, meaning that the four bit number
// should be b0011=12.
return bRet;
}
viewing the generated assembly for this makes me happy:
movdqu xmm0,xmmword ptr [rcx]
xor eax,eax
movsd xmm1,mmword ptr [rdx]
shufpd xmm1,xmm1,0
pcmpgtd xmm0,xmm1
movmskps ecx,xmm0
cmp ecx,0Ch
sete al
ret
No comments:
Post a Comment