A long time ago I promised to talk about a better way of calculating Levenshtein distance (edit distance). A way that is more conducive to vectorization.
The trick is to calculate the same edit distance matrix as the standard two row method, but instead fill in the values in a diagonal manner.
This ends up having each diagonal depending on the previous two diagonals. But importantly each diagonal does not depend on any other element of itself.
size_t Levenshtein_distance_diagonal_ANSI(const wchar_t* const s, const wchar_t* const t)
{
const unsigned int lenS = wcslen(s);
const unsigned int lenT = wcslen(t);
if (0 == lenS)
{
return lenT;
}
if (0 == lenT)
{
return lenS;
}
if (0 == wcscmp(s, t))
{
return 0;
}
// create work vectors of integer distances
unsigned int *v0 = new unsigned int[lenT + 1];
unsigned int *v1 = new unsigned int[lenT + 1];
unsigned int *v2 = new unsigned int[lenT + 1];
v1[0] = 0;
for (unsigned int j =1; j <= (lenS+lenT); j++)
{
unsigned int iMin;
unsigned int iMax;
v2[0] = j * 1;
iMin = (unsigned int)max(1, (int)(j - lenS));
if (j <= lenT)
{
v2[j] = j * 1;
iMax = j-1;
}
else
{
iMax = lenT;
}
for (unsigned int i = iMin; i <= iMax; i++)
{
unsigned int substituteCost = v0[i-1]+(s[j-i-1] != t[i-1]);
unsigned int insertionCost = v1[i-1]+1;
unsigned int deletionCost = v1[i]+1;
v2[i] = min(min(substituteCost, insertionCost), deletionCost);
}
unsigned int * const vTemp = v0;
v0 = v1;
v1 = v2;
v2 = vTemp;
}
unsigned int retVal = v1[lenT];
delete[] v0;
delete[] v1;
delete[] v2;
return retVal;
}
While this implementation is not any better than the two row implementation, we can see where this is going.
This also can work with storage only for two diagonals, overwriting a diagonal just after its value is read.
No comments:
Post a Comment