MicroPerf: Levenshtein distance using SSSE3 and SSE41

Levenshtein distance is a measure of how close two strings are to each other. The distance is the count of additions, deletions, and substitutions necessary to transform one string into another.

The following is a fairly faithful translation from the code in the Wikipedia article into C++.

template <typename T>
void swap(T & a, T& b)
{
    T temp = a;
    a = b;
    b = temp;
}

size_t Levenshtein_distance_ANSI(LPCWSTR s, LPCWSTR t)
{
    size_t lenS = wcslen(s);
    size_t lenT = wcslen(t);
    if ((0 == lenS) || (0== lenT) || (0==wcscmp(s,t)))
    {
        return 0;
    }
    // create two work vectors of integer distances
    int *v0 = new int[lenT+1];
    int *v1 = new int[lenT+1];
    // initialize v0 (the previous row of distances)
    // this row is A[0][i]: edit distance for an empty s
    // the distance is just the number of characters to delete from t
    for (size_t i = 0; i <= lenT; i++)
    {
        v0[i] = i;
    }
    for (size_t i = 0; i <lenS; i++)
    {
        // calculate v1 (current row distances) from the previous row v0
        // first element of v1 is A[i+1][0]
        //   edit distance is delete (i+1) chars from s to match empty t
        v1[0] = i + 1;
        // use formula to fill in the rest of the row
        for (int j = 0; j <lenT; j++)
        {
            int cost = (s[i] != t[j]);
            int right = v1[j] + 1;
            int down = v0[j + 1] + 1;
            int diag = v0[j] + cost;
            v1[j + 1] = min(min(right, down), diag);
        }
        // swap v1 (current row) to v0 (previous row) for next iteration
        swap(v0, v1);
    }
    size_t retVal = v0[lenT];
    delete[] v0;
    delete[] v1;
    return retVal;
}

I have made one minor change. The code above swaps the pointers instead of recopying from the destination row back into the source row.

size_t Levenshtein_distance_SSE42(LPCWSTR s, LPCWSTR t)
{
    size_t lenS = wcslen(s);
    size_t lenT = wcslen(t);
    if ((0 == lenS) || (0 == lenT) || (0 == wcscmp(s, t)))
    {
        return 0;
    }
    // create two work vectors of integer distances
    int *v0 = new int[lenT + 1];
    int *v1 = new int[lenT + 1];
    // initialize v0 (the previous row of distances)
    // this row is A[0][i]: edit distance for an empty s
    // the distance is just the number of characters to delete from t
    for (size_t i = 0; i <= lenT; i++)
    {
        v0[i] = i;
    }
    const __m128i mOne = _mm_set1_epi32(1);
    const __m128i mTwo = _mm_set1_epi32(2);
    const __m128i mFour = _mm_set1_epi32(4);
    for (size_t i = 0; i <lenS; i++)
    {
        // calculate v1 (current row distances) from the previous row v0
        // first element of v1 is A[i+1][0]
        //   edit distance is delete (i+1) chars from s to match empty t
        v1[0] = i + 1;
        // use formula to fill in the rest of the
        // first handle the first elements so that the remainder is a multiple of 4
        size_t j = 0;
        while ((lenT - j) &3)
        {
            int cost = (s[i] != t[j]);
            int right = v1[j] + 1;
            int diag = v0[j] + cost;
            j++;
            int down = v0[j] + 1;
            v1[j] = min(min(right, down), diag);
        }
        // Any left over at this point can be handled in blocks of four elements.
        if (j<lenT)
        {
            __m128i msi = _mm_set1_epi16(s[i]);
            __m128i mPrevBest = _mm_set1_epi32(v1[j]);
            do {
                __m128i v0lo = _mm_loadu_si128((__m128i *)&v0[j]);
                __m128i v0loOffset = _mm_loadu_si128((__m128i *)&v0[j + 1]);
                __m128i mtj = _mm_set1_epi64x(*(__int64 *)&t[j]);
                __m128i mstCmp = _mm_cmpeq_epi16(msi, mtj);
                __m128i mCost = _mm_add_epi32(mOne, _mm_unpacklo_epi16(mstCmp, mstCmp));
                __m128i mDiag = _mm_add_epi32(v0lo, mCost);
                __m128i mDown = _mm_add_epi32(mOne, v0loOffset);
                __m128i mMinDiagDown = _mm_min_epi32(mDiag, mDown);
                __m128i mRightOne = _mm_min_epi32(mMinDiagDown, _mm_add_epi32(mOne, _mm_alignr_epi8(mMinDiagDown, mPrevBest, 12)));
                __m128i mRightTwo = _mm_add_epi32(mTwo, _mm_alignr_epi8(mRightOne, mPrevBest, 8));
                __m128i mRightFour = _mm_add_epi32(mFour, mPrevBest);
                __m128i mBest = _mm_min_epi32(mRightOne, _mm_min_epi32(mRightFour, mRightTwo));
                mPrevBest = mBest;
                _mm_storeu_si128((__m128i *)&v1[j + 1], mBest);
                j += 4;
            } while (j<lenT);
        }
        // swap v1 (current row) to v0 (previous row) for next iteration
        swap(v0, v1);
    }
    size_t retVal = v0[lenT];
    delete[] v0;
    delete[] v1;
    return retVal;
}

This is a pretty faithful reimplementation of the first version. It takes about 1/3 the wall-clock time of the ansi implementation. Note that there are two reads from memory per inner loop iteration. This can be addressed by reading one group of four ahead.

The horizontal minimums are performed by doing a min against the block shifted left one element, then by two elements, then by four elements, each plus a constant.

Other implementation I see on the internet use a different algorithm based on calculation along the diagonal. A bit harder to understand. But we will revisit that later.

MicroPerf

Sunday, May 7, 2017

Levenshtein distance using SSSE3 and SSE41

No comments:

Post a Comment