Sunday, November 13, 2016

Let's do something interesting with a RECT

While comparing two rectangles for equality is a nice place to start, it doesn't really provide much benefit over a standard memcmp.

But let's look at a problem that can benefit from the parallel nature of SSE2.

IsRectEmpty is a simple but useful function.

In standard Ansi code it looks something like:

BOOL IsRectEmpty_Ansi(const RECT *pThis)
{
 return (pThis->top >= pThis->bottom) || (pThis->left >= pThis->right);
}


But we can do a little better with SSE2:

__forceinline __m128i m128i_from_RECT_SSE2(__in_ecount(1) const RECT *prc)
{
    return _mm_loadu_si128((__m128i*)prc);
}


BOOL IsRectEmpty_SSE2(const RECT *pThis)
{
 __m128i rthis = m128i_from_RECT_SSE2(pThis);
 // rthis=[pThis->left,pThis->top,pThis->right,pThis->bottom]
 __m128i mTltl = _mm_shuffle_epi32(rthis, _MM_SHUFFLE(1,0,1,0));
 // mTltl =[pThis->left,pThis->top,pThis->left,pThis->top]
 __m128i mCompare = _mm_cmpgt_epi32(rthis, mTltl);
 // mCompare=[0,0,(pThis->right>pThis->left),(pThis->bottom>pThis->top)]
 int iCompare = _mm_movemask_epi32(mCompare);
 // iCompare = 4*(pThis->right>pThis->left)+8*(pThis->bottom>pThis->top)
 return 12 ^ iCompare;
}


This has a single large read from memory.  It is branchless.

This same technique can be used for another useful rectangle function, PtInRect

__forceinline __m128i m128i_twice_from_point_SSE2(const POINT *ppt)
{
    return _mm_castpd_si128(_mm_load1_pd((double*)ppt));
    // return [ppt->x,ppt->y,ppt->x,ppt->y]
}

BOOL PtInRect_SSE2(const RECT *pThis, const POINT *ppt)
{
    __m128i rthis = m128i_from_RECT_SSE2(pThis);
    // rrc=[this->left,this->top,this->right,this->bottom]


    __m128i ptpt = m128i_twice_from_point_SSE2(ppt);
 // ptpt=[ppt->x,ppt->y,ppt->x,ppt->y]


    __m128i allComparisons = _mm_cmplt_epi32(ptpt, rthis);
// looking for b0011
    bool bRet = (12 == _mm_movemask_ps(_mm_castsi128_ps(allComparisons)));
    // the _mm_movemask_ps takes the top bit (sign) of each of the four 32bit numbers
    // (doesn't matter that the intrinsic is designed for floats)
    // this returns a 4 bit number where the four bits will be:
    // !(ppt->x >= pThis->left)
    // !(ppt->y >= pThis->top)
    //   ppt->x < pThis->right
    //   ppt->y < pThis->bottom
    // So we want the bottom two bits to be zero and the top two bits to be one, meaning that the four bit number
    // should be b0011=12.

    return bRet;
}

 viewing the generated assembly for this makes me happy:

movdqu      xmm0,xmmword ptr [rcx] 
xor         eax,eax 
movsd       xmm1,mmword ptr [rdx] 
shufpd      xmm1,xmm1,0 
pcmpgtd     xmm0,xmm1 
movmskps    ecx,xmm0 
cmp         ecx,0Ch 
sete        al 
ret 

No comments:

Post a Comment