Even after the last post, there's still a bunch of code necessary to get Gauss-Jordan elimination working. Most of it is pretty mundane nuts and bolts. But let's be thorough.
We need to multiply a vector in place by a runtime float:
// Scale every element of pA[0..cAOrig) by fac, in place, using SSE2.
// Unaligned loads/stores are used, so pA needs no particular alignment.
void mul_ps_SSE2(float * const pA, size_t cAOrig, const float fac)
{
    size_t remaining = cAOrig;

    // Scalar peel: shave elements off the end until the remaining count
    // is an exact multiple of 4.
    while (remaining & 3)
    {
        --remaining;
        pA[remaining] *= fac;
    }

    // Vector body: four floats per iteration, walking down toward zero.
    const __m128 mFac = _mm_set1_ps(fac);
    while (remaining)
    {
        remaining -= 4;
        const __m128 mVals = _mm_loadu_ps(pA + remaining);
        _mm_storeu_ps(pA + remaining, _mm_mul_ps(mVals, mFac));
    }
}
We have a need to add a vector multiplied by a runtime float to another vector:
// pDest[i] += pProd[i] * dum for every i in [0, cAOrig), SSE2-accelerated.
// ("dum" follows the Numerical Recipes naming for the elimination factor.)
void addVectorTimesConstant_ps_SSE2(float * const pDest, const float * const pProd, size_t cAOrig, const float dum)
{
    size_t idx = cAOrig;

    // Scalar peel from the tail until the count is a multiple of 4.
    while (idx & 3)
    {
        --idx;
        pDest[idx] += pProd[idx] * dum;
    }

    // Vector body: multiply-accumulate four lanes at a time.
    const __m128 mScale = _mm_set1_ps(dum);
    while (idx)
    {
        idx -= 4;
        const __m128 mSum = _mm_add_ps(_mm_loadu_ps(pDest + idx),
                                       _mm_mul_ps(_mm_loadu_ps(pProd + idx), mScale));
        _mm_storeu_ps(pDest + idx, mSum);
    }
}
While it's not huge, we can also SSE the assignment of a runtime int to a vector. Casting would allow this to be used for single precision floats.
// Fill pDest[0..cAOrig) with iVal, using SSE2 stores for the bulk.
//
// BUG FIX: the original computed the store address as
// "(__m128i *)pDest + uiA", which scales uiA by sizeof(__m128i) (16 bytes)
// instead of sizeof(int) (4 bytes), writing far outside the array.  The
// offset must be applied to the int pointer BEFORE casting.
void setVector_SSE2(int * const pDest, const size_t cAOrig, int iVal)
{
    size_t uiA = cAOrig;
    // Scalar tail until the remaining count is a multiple of 4.
    while (uiA & 3)
    {
        uiA--;
        pDest[uiA] = iVal;
    }
    if (uiA)
    {
        const __m128i mVal = _mm_set1_epi32(iVal);
        do {
            uiA -= 4;
            // Offset in int units first, then reinterpret for the store.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + uiA), mVal);
        } while (uiA > 0);
    }
}
Swapping the contents of two vectors isn't too difficult:
// Minimal generic exchange of two values (local stand-in for std::swap).
template <class T>
void swap(T &a, T&b)
{
    T held = a;  // copy-out, then cross-assign
    a = b;
    b = held;
}
// Exchange the contents of two int arrays of length cAOrig, using SSE2
// for the bulk of the work.
//
// BUG FIX: the original handled the 2-element remainder with a 64-bit
// swap but passed (pA + uiA) for BOTH operands — a self-swap no-op that
// left those two elements unswapped.  The tail is now handled with plain
// scalar swaps (which also avoids the MSVC-only __int64 aliasing cast).
void swapArray_SSE2(int * const pA, int * const pB, const size_t cAOrig)
{
    size_t uiA = cAOrig;
    // Scalar tail: exchange up to 3 trailing elements one at a time.
    while (uiA & 3)
    {
        uiA--;
        const int temp = pA[uiA];
        pA[uiA] = pB[uiA];
        pB[uiA] = temp;
    }
    // Bulk: four ints per iteration via unaligned SSE2 loads/stores.
    while (uiA > 0)
    {
        uiA -= 4;
        const __m128i mA = _mm_loadu_si128(reinterpret_cast<__m128i*>(pA + uiA));
        const __m128i mB = _mm_loadu_si128(reinterpret_cast<__m128i*>(pB + uiA));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pA + uiA), mB);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pB + uiA), mA);
    }
}
void swapArray_SSE2(float * const pA, float * const pB, const size_t cAOrig)
{
swapArray_SSE2(reinterpret_cast<int*>(pA), reinterpret_cast<int*>(pB), cAOrig);
}
With these additional helper methods, it isn't too difficult to translate the implementation of gaussj from Numerical Recipes.
// Gauss-Jordan elimination with full pivoting (Numerical Recipes "gaussj",
// SSE2-accelerated).  a is an n x n matrix of row pointers; on exit it is
// replaced by its own inverse.  b holds m right-hand-side columns (n rows);
// on exit it holds the corresponding solution vectors.
void gaussj_SSE2(float **a, const UINT n, float **b, const UINT m)
{
    // ipiv doubles as pivot bookkeeping AND as the mask consumed by
    // findLargestMaskedFloat_SSE2: 0x7fffffff marks a column still
    // available as a pivot, 0 marks one already used.
    int *ipiv = (int*)malloc(n * sizeof(int));
    UINT *indxc = (UINT*)malloc(n * sizeof(UINT));  // column of the i-th pivot
    UINT *indxr = (UINT*)malloc(n * sizeof(UINT));  // row of the i-th pivot
    if (!ipiv || !indxc || !indxr)
    {
        // Allocation failure: release whatever succeeded and bail.
        free(indxc);
        free(indxr);
        free(ipiv);
        return;
    }
    setVector_SSE2(ipiv, n, 0x7fffffff);
    // BUG FIX: the original read "for (size_t i = n; i-->0; i++)" — the
    // i++ undid the decrement performed inside the condition, so the loop
    // never terminated.
    for (size_t i = n; i-- > 0; )
    {
        // Find the largest-magnitude element among the not-yet-pivoted
        // columns: that becomes this iteration's pivot.
        float big = 0;
        size_t irow = 0, icol = 0;
        for (size_t j = n; j-- > 0; )
        {
            if (ipiv[j])
            {
                const float *pfBestInRow = findLargestMaskedFloat_SSE2(ipiv, a[j], (UINT)n);
                float fBestValInRow = fabsf(*pfBestInRow);
                if (fBestValInRow > big)
                {
                    big = fBestValInRow;
                    irow = j;
                    icol = (size_t)(pfBestInRow - a[j]);
                }
            }
        }
        ipiv[icol] = 0;  // column consumed: mask it out of future searches
        // Move the pivot onto the diagonal by swapping rows; record the
        // swap so the implied column permutation can be undone at the end.
        if (irow != icol)
        {
            swapArray_SSE2(a[irow], a[icol], n);
            swapArray_SSE2(b[irow], b[icol], m);
        }
        indxr[i] = (UINT)irow;
        indxc[i] = (UINT)icol;
        if (0 == a[icol][icol])
        {
            assert(0 && "gaussj : Singular Matrix");
            // BUG FIX: the original leaked all three buffers on this path.
            free(indxc);
            free(indxr);
            free(ipiv);
            return;
        }
        // Normalize the pivot row so the pivot element becomes 1.
        float pivinv = 1.0f / a[icol][icol];
        a[icol][icol] = 1;
        mul_ps_SSE2(a[icol], n, pivinv);
        mul_ps_SSE2(b[icol], m, pivinv);
        // Eliminate the pivot column from every other row.
        for (size_t ll = n; ll-- > 0; )
        {
            if (ll != icol)
            {
                float dum = -a[ll][icol];
                a[ll][icol] = 0;
                addVectorTimesConstant_ps_SSE2(a[ll], a[icol], n, dum);
                addVectorTimesConstant_ps_SSE2(b[ll], b[icol], m, dum);
            }
        }
    }
    // Unscramble the inverse: undo the column permutation implied by the
    // row swaps above, in reverse order of pivoting.
    for (size_t l = n ; l-- > 0; )
    {
        if (indxr[l] != indxc[l])
        {
            for (size_t k = 0; k < n; k++)
            {
                swap(a[k][indxr[l]], a[k][indxc[l]]);
            }
        }
    }
    free(indxc);
    free(indxr);
    free(ipiv);
}
No comments:
Post a Comment