MicroPerf: Gauss-Jordan elimination

Gauss-Jordan elimination is taught in high school algebra class. There's an implementation of it in Numerical Recipes, that is used as part of the general linear regression.

While the book identifies that this isn't the greatest method for solving equations, it is instructive since we are familiar with the code.

The first chunk identifies the largest magnitude element that hasn't already been processed:
        float big = 0;
        int irow = 0, icol = 0;
        for (int j = 0; j < n; j++)
        {
            if (ipiv[j] == 0)
            {
                for (int k = 0; k < n; k++)
                {
                    if (ipiv[k] == 0)
                    {
                        if (fabs(a[j][k]) >= big)
                        {
                            big = fabsf(a[j][k]);
                            irow = j;
                            icol = k;
                        }
                    }
                }
            }
        }
        ipiv[icol] ++;

If we combine the fabs with the conditional this code looks a lot like the code for finding the maximum integer in an unsorted list.

One pattern that was mentioned before is masking away the sign bit to take the absolute value. We can combine this with masking away disallowed values.

const float * findLargestMaskedFloat_Ansi(const int *pMask, const float * const pA, const UINT cAOrig)
{
    if (cAOrig <= 0)
    {
        return nullptr;
    }
    UINT cARemaining = cAOrig;
    cARemaining--;
    float fBestVal;
    int &iBestVal = *reinterpret_cast<int *>(&fBestVal);
    float fOnVal;
    int &iOnVal = *reinterpret_cast<int *>(&fOnVal);
    iBestVal = pMask[cARemaining] & *reinterpret_cast<const int *>(&pA+ cARemaining);
    const float *pBest = pA + cARemaining;
    while (cARemaining > 0)
    {
        cARemaining--;
        iOnVal = pMask[cARemaining] & *reinterpret_cast<const int *>(&pA + cARemaining);
        if (fOnVal > fBestVal)
        {
            pBest = pA + cARemaining;
            iBestVal = iOnVal;
        }
    }
    return pBest;
}
The SSE code will look a lot like the maximum value code from a previous post:

const float *findLargestMaskedFloat_SSE2(const int *pMask, const float *pA, UINT cAOrig)
{
    if (cAOrig >= 4)
    {
        UINT uiA = cAOrig;
        __m128 mBestValues = _mm_and_ps(_mm_loadu_ps((const float *)pMask), _mm_loadu_ps(pA));
        __m128i mBestIndexBase = _mm_setzero_si128();
        while (uiA > 4)
        {
            uiA -= 4;
            __m128 mValues = _mm_and_ps(_mm_loadu_ps((const float *)&pMask[uiA]), _mm_loadu_ps(&pA[uiA]));
            __m128 mCompare = _mm_cmpgt_ps(mValues, mBestValues);
            if (_mm_movemask_ps(mCompare))
            {
                __m128i mIndexBase = _mm_set1_epi32(uiA);
                mBestValues = _mm_blend_ps_SSE2(mValues, mBestValues, mCompare);
                mBestIndexBase = _mm_blend_si128_SSE2(mIndexBase, mBestIndexBase, _mm_castps_si128(mCompare));
            }
        }
        __m128i mBestIndex = _mm_add_epi32(mBestIndexBase, _mm_setr_epi32(0, 1, 2, 3));
        __m128 mValues;
        __m128 mCompare;
        mValues = _mm_castsi128_ps( _mm_shuffle_epi32(_mm_castps_si128(mBestValues), _MM_SHUFFLE(2, 3, 2, 3)));
        mCompare = _mm_cmpgt_ps(mValues, mBestValues);
        mBestValues = _mm_blend_ps_SSE2(mValues, mBestValues, mCompare);
        mBestIndex = _mm_blend_si128_SSE2(_mm_shuffle_epi32(mBestIndex, _MM_SHUFFLE(2, 3, 2, 3)), mBestIndex, _mm_castps_si128(mCompare));
        mValues = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(mBestValues), _MM_SHUFFLE(1, 1, 1, 1)));
        mCompare = _mm_cmpgt_ps(mValues, mBestValues);
        // mBestValues = _mm_blend_si128_SSE2(mValues, mBestValues, mCompare);
        mBestIndex = _mm_blend_si128_SSE2(_mm_shuffle_epi32(mBestIndex, _MM_SHUFFLE(1, 1, 1, 1)), mBestIndex, _mm_castps_si128(mCompare));
        return pA + _mm_cvtsi128_si32(mBestIndex);
    }
    return findLargestMaskedFloat_Ansi(pMask, pA, cAOrig);
}
This will require a little more initialization.
    for (int i = n; i-- > 0;)
    {
        ipiv[i] = 0x7fffffff;
    }
...
    float big = 0;
    int irow = 0, icol = 0;
    for (int j = 0; j < n; j++)
    {
        if (ipiv[j] )
        {
            const float *pfBestInRow = findLargestMaskedFloat_SSE2(ipiv, a[j], n);
            float fBestValInRow = fabsf(*pfBestInRow);
            if (fBestValInRow > big)
            {
                big = fBestValInRow;
                irow = j;
                icol = pfBestInRow-a[j];
            }
        }
    }
    ipiv[icol] =0;

The initialization code looks like it could be made SSE tighter. But let's tackle the next chunks in a future post.

MicroPerf

Sunday, January 1, 2017

Gauss-Jordan elimination - Part1

No comments:

Post a Comment