Levenshtein(莱文斯坦)编辑距离算法实现
1.C++版本(含三个benchmark)
#include <iostream>
#include <string>
#include <vector>
using namespace std;
int minimum(int first, int second, int third)
{
int former = ;
if (first < second)
{
former = first;
}
else
{
former = second;
}
if (former < third)
{
return former;
}
else
{
return third;
}
}
int getLevenshteinDistance(string firstStr, string secondStr)
{
int firstStrLen = firstStr.length();
int secondStrLen = secondStr.length();
vector<vector<int>> levenshteinDistanceTable;
for (int firstIndex = ; firstIndex <= firstStrLen; firstIndex++)
{
vector<int> levenshteinRow(secondStrLen+);
levenshteinDistanceTable.push_back(levenshteinRow);
levenshteinDistanceTable[firstIndex][] = firstIndex;
}
for (int secondIndex = ; secondIndex <= secondStrLen; secondIndex++)
{
levenshteinDistanceTable[][secondIndex] = secondIndex;
}
int lastCharCost = ;
for (int firstIndex = ; firstIndex <= firstStrLen; firstIndex++)
{
for (int secondIndex = ; secondIndex <= secondStrLen; secondIndex++)
{
if (firstStr[firstIndex-] == secondStr[secondIndex-])
{
lastCharCost = ;
}
else
{
lastCharCost = ;
}
levenshteinDistanceTable[firstIndex][secondIndex] = minimum(levenshteinDistanceTable[firstIndex - ][secondIndex] + , levenshteinDistanceTable[firstIndex][secondIndex - ] + , levenshteinDistanceTable[firstIndex - ][secondIndex - ] + lastCharCost);
}
}
return levenshteinDistanceTable[firstStrLen][secondStrLen];
}
int main(int argc, char * * argv, char * * env)
{
string firstStr1 = "sitting";
string secondStr1 = "kitten";
cout << "levenstein distance of " << firstStr1 << " and " << secondStr1 << " is:" << getLevenshteinDistance(firstStr1, secondStr1) << endl;
string firstStr2 = "Saturday";
string secondStr2 = "Sunday";
cout << "levenstein distance of " << firstStr2 << " and " << secondStr2 << " is:" << getLevenshteinDistance(firstStr2, secondStr2) << endl;
string firstStr3 = "levenshtein";
string secondStr3 = "meilenstein";
cout << "levenstein distance of " << firstStr3 << " and " << secondStr3 << " is:" << getLevenshteinDistance(firstStr3, secondStr3) << endl;
char ch;
cin >> ch;
return ;
}
2.awk版本1(表格法)
function minimum(first, second, third)
{
if(first < second)
{
former = first;
}
else
{
former = second;
}
if(former < third)
{
return former;
}
else
{
return third;
}
}
function getLevenshteinDistance(firstStr, secondStr)
{
firstStrLen = length(firstStr);
secondStrLen = length(secondStr);
for(secondIndex = ; secondIndex <= secondStrLen; secondIndex++)
{
levenshteinDistanceTable[, secondIndex] = secondIndex;
}
for(firstIndex = ; firstIndex <= firstStrLen; firstIndex++)
{
levenshteinDistanceTable[firstIndex, ] = firstIndex;
}
for(firstIndex = ; firstIndex <= firstStrLen; firstIndex++)
{
for(secondIndex = ; secondIndex <= secondStrLen; secondIndex++)
{
if(match(substr(firstStr, firstIndex, ), substr(secondStr, secondIndex, )) > )
{
lastCharCost = ;
}
else
{
lastCharCost = ;
}
levenshteinDistanceTable[firstIndex, secondIndex] = minimum(levenshteinDistanceTable[firstIndex-, secondIndex] + , levenshteinDistanceTable[firstIndex, secondIndex-] + , levenshteinDistanceTable[firstIndex-, secondIndex-] + lastCharCost);
}
}
return levenshteinDistanceTable[firstStrLen, secondStrLen];
}
2.awk版本1(递归法,存在重叠子问题重复计算问题,性能较低)
function minimum(first, second, third)
{
if(first < second)
{
former = first;
}
else
{
former = second;
}
if(former < third)
{
return former;
}
else
{
return third;
}
}
function getLevenshteinDistance(firstStr, firstLen, secondStr, secondLen)
{
lastCost = ;
if(firstLen == )
{
return secondLen;
}
if(secondLen == )
{
return firstLen;
}
if(match(substr(firstStr, firstLen, ), substr(secondStr, secondLen, )) > )
{
lastCost = ;
}
else
{
lastCost = ;
}
first = getLevenshteinDistance(firstStr, firstLen-, secondStr, secondLen) + ;
second = getLevenshteinDistance(firstStr, firstLen, secondStr, secondLen-) + ;
third = getLevenshteinDistance(firstStr, firstLen-, secondStr, secondLen-) + lastCost;
return minimum(first, second, third);
}