levenshtein.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. """
  2. Contains functions implementing edit distance algorithms.
  3. """
  4. from whoosh.compat import xrange
  5. def levenshtein(seq1, seq2, limit=None):
  6. """Returns the Levenshtein edit distance between two strings.
  7. """
  8. oneago = None
  9. thisrow = list(range(1, len(seq2) + 1)) + [0]
  10. for x in xrange(len(seq1)):
  11. # Python lists wrap around for negative indices, so put the
  12. # leftmost column at the *end* of the list. This matches with
  13. # the zero-indexed strings and saves extra calculation.
  14. oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1]
  15. for y in xrange(len(seq2)):
  16. delcost = oneago[y] + 1
  17. addcost = thisrow[y - 1] + 1
  18. subcost = oneago[y - 1] + (seq1[x] != seq2[y])
  19. thisrow[y] = min(delcost, addcost, subcost)
  20. if limit and x > limit and min(thisrow) > limit:
  21. return limit + 1
  22. return thisrow[len(seq2) - 1]
  23. def damerau_levenshtein(seq1, seq2, limit=None):
  24. """Returns the Damerau-Levenshtein edit distance between two strings.
  25. """
  26. oneago = None
  27. thisrow = list(range(1, len(seq2) + 1)) + [0]
  28. for x in xrange(len(seq1)):
  29. # Python lists wrap around for negative indices, so put the
  30. # leftmost column at the *end* of the list. This matches with
  31. # the zero-indexed strings and saves extra calculation.
  32. twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
  33. for y in xrange(len(seq2)):
  34. delcost = oneago[y] + 1
  35. addcost = thisrow[y - 1] + 1
  36. subcost = oneago[y - 1] + (seq1[x] != seq2[y])
  37. thisrow[y] = min(delcost, addcost, subcost)
  38. # This block deals with transpositions
  39. if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
  40. and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
  41. thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
  42. if limit and x > limit and min(thisrow) > limit:
  43. return limit + 1
  44. return thisrow[len(seq2) - 1]
  45. def relative(a, b):
  46. """Returns the relative distance between two strings, in the range
  47. [0-1] where 1 means total equality.
  48. """
  49. d = distance(a, b)
  50. longer = float(max((len(a), len(b))))
  51. shorter = float(min((len(a), len(b))))
  52. r = ((longer - d) / longer) * (shorter / longer)
  53. return r
  54. distance = damerau_levenshtein