Even after the last post, there's still a bunch of code necessary to get Gauss-Jordan elimination working. Most of it is pretty mundane nuts and bolts. But let's be thorough.
We need to multiply a vector in place by a runtime float:
// Scale every element of pA[0..cAOrig) by fac, in place, using SSE2.
// Unaligned loads/stores are used, so pA needs no particular alignment.
void mul_ps_SSE2(float * const pA, size_t cAOrig, const float fac)
{
    size_t remaining = cAOrig;

    // Scalar peel: shave elements off the end until the remaining count
    // is an exact multiple of 4.
    while (remaining & 3)
    {
        --remaining;
        pA[remaining] *= fac;
    }

    // Vector body: four floats per iteration, walking down toward zero.
    const __m128 mFac = _mm_set1_ps(fac);
    while (remaining)
    {
        remaining -= 4;
        const __m128 mVals = _mm_loadu_ps(pA + remaining);
        _mm_storeu_ps(pA + remaining, _mm_mul_ps(mVals, mFac));
    }
}
We have a need to add a vector multiplied by a runtime float to another vector:
// pDest[i] += pProd[i] * dum for every i in [0, cAOrig), SSE2-accelerated.
// ("dum" follows the Numerical Recipes naming for the elimination factor.)
void addVectorTimesConstant_ps_SSE2(float * const pDest, const float * const pProd, size_t cAOrig, const float dum)
{
    size_t idx = cAOrig;

    // Scalar peel from the tail until the count is a multiple of 4.
    while (idx & 3)
    {
        --idx;
        pDest[idx] += pProd[idx] * dum;
    }

    // Vector body: multiply-accumulate four lanes at a time.
    const __m128 mScale = _mm_set1_ps(dum);
    while (idx)
    {
        idx -= 4;
        const __m128 mSum = _mm_add_ps(_mm_loadu_ps(pDest + idx),
                                       _mm_mul_ps(_mm_loadu_ps(pProd + idx), mScale));
        _mm_storeu_ps(pDest + idx, mSum);
    }
}
While it's not huge, we can also SSE the assignment of a runtime int to a vector. Casting would allow this to be used for single precision floats.
// Fill pDest[0..cAOrig) with iVal, using SSE2 stores for the bulk.
//
// BUG FIX: the original computed the store address as
// "(__m128i *)pDest + uiA", which scales uiA by sizeof(__m128i) (16 bytes)
// instead of sizeof(int) (4 bytes), writing far outside the array.  The
// offset must be applied to the int pointer BEFORE casting.
void setVector_SSE2(int * const pDest, const size_t cAOrig, int iVal)
{
    size_t uiA = cAOrig;
    // Scalar tail until the remaining count is a multiple of 4.
    while (uiA & 3)
    {
        uiA--;
        pDest[uiA] = iVal;
    }
    if (uiA)
    {
        const __m128i mVal = _mm_set1_epi32(iVal);
        do {
            uiA -= 4;
            // Offset in int units first, then reinterpret for the store.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(pDest + uiA), mVal);
        } while (uiA > 0);
    }
}
Swapping the contents of two vectors isn't too difficult:
// Minimal generic exchange of two values (local stand-in for std::swap).
template <class T>
void swap(T &a, T&b)
{
    T held = a;  // copy-out, then cross-assign
    a = b;
    b = held;
}
// Exchange the contents of two int arrays of length cAOrig, using SSE2
// for the bulk of the work.
//
// BUG FIX: the original handled the 2-element remainder with a 64-bit
// swap but passed (pA + uiA) for BOTH operands — a self-swap no-op that
// left those two elements unswapped.  The tail is now handled with plain
// scalar swaps (which also avoids the MSVC-only __int64 aliasing cast).
void swapArray_SSE2(int * const pA, int * const pB, const size_t cAOrig)
{
    size_t uiA = cAOrig;
    // Scalar tail: exchange up to 3 trailing elements one at a time.
    while (uiA & 3)
    {
        uiA--;
        const int temp = pA[uiA];
        pA[uiA] = pB[uiA];
        pB[uiA] = temp;
    }
    // Bulk: four ints per iteration via unaligned SSE2 loads/stores.
    while (uiA > 0)
    {
        uiA -= 4;
        const __m128i mA = _mm_loadu_si128(reinterpret_cast<__m128i*>(pA + uiA));
        const __m128i mB = _mm_loadu_si128(reinterpret_cast<__m128i*>(pB + uiA));
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pA + uiA), mB);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(pB + uiA), mA);
    }
}
void swapArray_SSE2(float * const pA, float * const pB, const size_t cAOrig)
{
swapArray_SSE2(reinterpret_cast<int*>(pA), reinterpret_cast<int*>(pB), cAOrig);
}
With these additional helper methods, it isn't too difficult to translate the implementation of gaussj from Numerical Recipes.
// Gauss-Jordan elimination with full pivoting (Numerical Recipes "gaussj",
// SSE2-accelerated).  a is an n x n matrix of row pointers; on exit it is
// replaced by its own inverse.  b holds m right-hand-side columns (n rows);
// on exit it holds the corresponding solution vectors.
void gaussj_SSE2(float **a, const UINT n, float **b, const UINT m)
{
    // ipiv doubles as pivot bookkeeping AND as the mask consumed by
    // findLargestMaskedFloat_SSE2: 0x7fffffff marks a column still
    // available as a pivot, 0 marks one already used.
    int *ipiv = (int*)malloc(n * sizeof(int));
    UINT *indxc = (UINT*)malloc(n * sizeof(UINT));  // column of the i-th pivot
    UINT *indxr = (UINT*)malloc(n * sizeof(UINT));  // row of the i-th pivot
    if (!ipiv || !indxc || !indxr)
    {
        // Allocation failure: release whatever succeeded and bail.
        free(indxc);
        free(indxr);
        free(ipiv);
        return;
    }
    setVector_SSE2(ipiv, n, 0x7fffffff);
    // BUG FIX: the original read "for (size_t i = n; i-->0; i++)" — the
    // i++ undid the decrement performed inside the condition, so the loop
    // never terminated.
    for (size_t i = n; i-- > 0; )
    {
        // Find the largest-magnitude element among the not-yet-pivoted
        // columns: that becomes this iteration's pivot.
        float big = 0;
        size_t irow = 0, icol = 0;
        for (size_t j = n; j-- > 0; )
        {
            if (ipiv[j])
            {
                const float *pfBestInRow = findLargestMaskedFloat_SSE2(ipiv, a[j], (UINT)n);
                float fBestValInRow = fabsf(*pfBestInRow);
                if (fBestValInRow > big)
                {
                    big = fBestValInRow;
                    irow = j;
                    icol = (size_t)(pfBestInRow - a[j]);
                }
            }
        }
        ipiv[icol] = 0;  // column consumed: mask it out of future searches
        // Move the pivot onto the diagonal by swapping rows; record the
        // swap so the implied column permutation can be undone at the end.
        if (irow != icol)
        {
            swapArray_SSE2(a[irow], a[icol], n);
            swapArray_SSE2(b[irow], b[icol], m);
        }
        indxr[i] = (UINT)irow;
        indxc[i] = (UINT)icol;
        if (0 == a[icol][icol])
        {
            assert(0 && "gaussj : Singular Matrix");
            // BUG FIX: the original leaked all three buffers on this path.
            free(indxc);
            free(indxr);
            free(ipiv);
            return;
        }
        // Normalize the pivot row so the pivot element becomes 1.
        float pivinv = 1.0f / a[icol][icol];
        a[icol][icol] = 1;
        mul_ps_SSE2(a[icol], n, pivinv);
        mul_ps_SSE2(b[icol], m, pivinv);
        // Eliminate the pivot column from every other row.
        for (size_t ll = n; ll-- > 0; )
        {
            if (ll != icol)
            {
                float dum = -a[ll][icol];
                a[ll][icol] = 0;
                addVectorTimesConstant_ps_SSE2(a[ll], a[icol], n, dum);
                addVectorTimesConstant_ps_SSE2(b[ll], b[icol], m, dum);
            }
        }
    }
    // Unscramble the inverse: undo the column permutation implied by the
    // row swaps above, in reverse order of pivoting.
    for (size_t l = n ; l-- > 0; )
    {
        if (indxr[l] != indxc[l])
        {
            for (size_t k = 0; k < n; k++)
            {
                swap(a[k][indxr[l]], a[k][indxc[l]]);
            }
        }
    }
    free(indxc);
    free(indxr);
    free(ipiv);
}
No comments:
Post a Comment