Source

whoosh / src / whoosh / support / levenshtein.py

"""
Contains functions implementing edit distance algorithms.
"""

from whoosh.compat import xrange


def levenshtein(seq1, seq2, limit=None):
    """
    Returns the Levenshtein edit distance between two strings.
    """

    oneago = None
    thisrow = list(xrange(1, len(seq2) + 1)) + [0]
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        oneago, thisrow = thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)

        if limit and x > limit and min(thisrow) > limit:
            return limit + 1

    return thisrow[len(seq2) - 1]


def damerau_levenshtein(seq1, seq2, limit=2, prefix=0):
    """
    Returns the Damerau-Levenshtein edit distance between two strings.
    """

    if prefix and seq1[:prefix] != seq2[:prefix]:
        return 0, limit + 1

    oneago = None
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    x = 0
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)

        lowest = min(thisrow)
        if lowest > limit:
            return limit + 1

    return thisrow[len(seq2) - 1]


def relative(a, b):
    """
    Returns the relative distance between two strings, in the range
    [0-1] where 1 means total equality.
    """

    d = distance(a, b)
    longer = float(max((len(a), len(b))))
    shorter = float(min((len(a), len(b))))
    r = ((longer - d) / longer) * (shorter / longer)
    return r


distance = damerau_levenshtein