# dayuan / manyi

"""This module implements the Lovins stemming algorithm. Use the ``stem()``
function::

    stemmed_word = stem(word)
"""

from collections import defaultdict


# Conditions

def A(base):
    """Condition A: no restrictions on the stem, so removal is always allowed."""
    return True


def B(base):
    """Condition B: the stem must be at least 3 characters long."""
    return len(base) >= 3


def C(base):
    """Condition C: the stem must be at least 4 characters long."""
    return len(base) >= 4


def D(base):
    """Condition D: the stem must be at least 5 characters long."""
    return len(base) >= 5


def E(base):
    """Condition E: do not remove the ending when the stem ends in ``e``."""
    final = base[-1]
    return not final == "e"


def F(base):
    """Condition F: stem of at least 3 characters that does not end in ``e``."""
    if len(base) < 3:
        return False
    return base[-1] != "e"


def G(base):
    """Condition G: stem of at least 3 characters that ends in ``f``."""
    if len(base) < 3:
        return False
    return base[-1] == "f"


def H(base):
    """Condition H: remove the ending only after ``t`` or ``ll``."""
    before_last, last = base[-2:]
    if last == "t":
        return True
    return before_last == "l" and last == "l"


def I(base):
    """Condition I: do not remove the ending after ``o`` or ``e``."""
    return base[-1] not in ("o", "e")


def J(base):
    """Condition J: do not remove the ending after ``a`` or ``e``."""
    return base[-1] not in ("a", "e")


def K(base):
    """Condition K: minimum stem length 3, and remove the ending only after
    ``l``, ``i`` or ``u*e`` (``u``, any one letter, then ``e``).

    The length check runs first so that stems shorter than 3 characters are
    rejected instead of raising IndexError when the third-from-last character
    is inspected.
    """
    if len(base) < 3:
        return False
    last = base[-1]
    return last in ("l", "i") or (last == "e" and base[-3] == "u")


def L(base):
    """Condition L: do not remove the ending after ``u``, ``x`` or ``s``,
    unless the ``s`` follows ``o``.
    """
    before_last, last = base[-2:]
    if last in ("u", "x"):
        return False
    return last != "s" or before_last == "o"


def M(base):
    """Condition M: do not remove the ending after ``a``, ``c``, ``e`` or ``m``."""
    return base[-1] not in ("a", "c", "e", "m")


def N(base):
    """Condition N: minimum stem length 4 after ``s**``, elsewhere 3.

    ``s**`` means an ``s`` followed by any two letters, i.e. the stem's
    third-from-last character is ``s``. Only a stem of exactly 3 characters
    can fail the stricter limit, so that is the one case where the
    third-from-last character matters.
    """
    if len(base) < 3:
        return False
    if base[-3] == "s":
        return len(base) >= 4
    return True


def O(base):
    """Condition O: remove the ending only after ``l`` or ``i``."""
    return base[-1] in ("l", "i")


def P(base):
    """Condition P: do not remove the ending after ``c``."""
    final = base[-1]
    return not final == "c"


def Q(base):
    """Condition Q: stem of at least 3 characters that does not end in ``l``
    or ``n``.
    """
    if len(base) <= 2:
        return False
    return base[-1] not in ("l", "n")


def R(base):
    """Condition R: remove the ending only after ``n`` or ``r``."""
    return base[-1] in ("n", "r")


def S(base):
    """Condition S: remove the ending only after ``dr`` or ``t``, unless the
    ``t`` follows another ``t``.

    The last two characters are taken as a slice so they can be compared
    against the two-letter patterns ``dr`` and ``tt``.
    """
    tail = base[-2:]
    return tail == "dr" or (tail.endswith("t") and tail != "tt")


def T(base):
    """Condition T: remove the ending only after ``s`` or ``t``, unless the
    ``t`` follows ``o``.
    """
    before_last, last = base[-2:]
    if last == "s":
        return True
    return last == "t" and before_last != "o"


def U(base):
    """Condition U: remove the ending only after ``l``, ``m``, ``n`` or ``r``."""
    return base[-1] in ("l", "m", "n", "r")


def V(base):
    """Condition V: remove the ending only after ``c``."""
    final = base[-1]
    return final == "c"


def W(base):
    """Condition W: do not remove the ending after ``s`` or ``u``."""
    return base[-1] not in ("s", "u")


def X(base):
    """Condition X: remove the ending only after ``l``, ``i`` or ``u*e``
    (``u``, any one letter, then ``e``).

    The ``u*e`` pattern needs at least 3 characters, so the length is checked
    before the third-from-last character is read. Shorter stems raised
    IndexError before this guard was added.
    """
    if base.endswith(("l", "i")):
        return True
    return len(base) >= 3 and base[-1] == "e" and base[-3] == "u"


def Y(base):
    """Condition Y: remove the ending only after ``in``."""
    return base.endswith("in")


def Z(base):
    """Condition Z: do not remove the ending after ``f``."""
    final = base[-1]
    return not final == "f"


def a(base):
    """Condition a: remove the ending only after ``d``, ``f``, ``ph``, ``th``,
    ``l``, ``er``, ``or``, ``es`` or ``t``.
    """
    if base[-1] in ("d", "f", "l", "t"):
        return True
    return base[-2:] in ("ph", "th", "er", "or", "es")


def b(base):
    """Condition b: stem of at least 3 characters that does not end in
    ``met`` or ``ryst``.
    """
    if len(base) < 3:
        return False
    return not base.endswith(("met", "ryst"))


def c(base):
    """Condition c: remove the ending only after ``l``."""
    final = base[-1]
    return final == "l"


# Endings

# ``m[n]`` maps every removable ending of length ``n`` to the condition
# function that the remaining stem must satisfy. Index 0 is unused.
m = [None] * 12

m[11] = {
    "alistically": B,
    "arizability": A,
    "izationally": B,
}
m[10] = {
    "antialness": A,
    "arisations": A,
    "arizations": A,
    "entialness": A,
}
m[9] = {
    "allically": C,
    "antaneous": A,
    "antiality": A,
    "arisation": A,
    "arization": A,
    "ationally": B,
    "ativeness": A,
    "eableness": E,
    "entations": A,
    "entiality": A,
    "entialize": A,
    "entiation": A,
    "ionalness": A,
    "istically": A,
    "itousness": A,
    "izability": A,
    "izational": A,
}
m[8] = {
    "ableness": A,
    "arizable": A,
    "entation": A,
    "entially": A,
    "eousness": A,
    "ibleness": A,
    "icalness": A,
    "ionalism": A,
    "ionality": A,
    "ionalize": A,
    "iousness": A,
    "izations": A,
    "lessness": A,
}
m[7] = {
    "ability": A,
    "aically": A,
    "alistic": B,
    "alities": A,
    "ariness": E,
    "aristic": A,
    "arizing": A,
    "ateness": A,
    "atingly": A,
    "ational": B,
    "atively": A,
    "ativism": A,
    "elihood": E,
    "encible": A,
    "entally": A,
    "entials": A,
    "entiate": A,
    "entness": A,
    "fulness": A,
    "ibility": A,
    "icalism": A,
    "icalist": A,
    "icality": A,
    "icalize": A,
    "ication": G,
    "icianry": A,
    "ination": A,
    "ingness": A,
    "ionally": A,
    "isation": A,
    "ishness": A,
    "istical": A,
    "iteness": A,
    "iveness": A,
    "ivistic": A,
    "ivities": A,
    "ization": F,
    "izement": A,
    "oidally": A,
    "ousness": A,
}
m[6] = {
    "aceous": A,
    "acious": B,
    "action": G,
    "alness": A,
    "ancial": A,
    "ancies": A,
    "ancing": B,
    "ariser": A,
    "arized": A,
    "arizer": A,
    "atable": A,
    "ations": B,
    "atives": A,
    "eature": Z,
    "efully": A,
    "encies": A,
    "encing": A,
    "ential": A,
    "enting": C,
    "entist": A,
    "eously": A,
    "ialist": A,
    "iality": A,
    "ialize": A,
    "ically": A,
    "icance": A,
    "icians": A,
    "icists": A,
    "ifully": A,
    "ionals": A,
    "ionate": D,
    "ioning": A,
    "ionist": A,
    "iously": A,
    "istics": A,
    "izable": E,
    "lessly": A,
    "nesses": A,
    "oidism": A,
}
m[5] = {
    "acies": A,
    "acity": A,
    "aging": B,
    "aical": A,
    "alist": A,
    "alism": B,
    "ality": A,
    "alize": A,
    "allic": b,
    "anced": B,
    "ances": B,
    "antic": C,
    "arial": A,
    "aries": A,
    "arily": A,
    "arity": B,
    "arize": A,
    "aroid": A,
    "ately": A,
    "ating": I,
    "ation": B,
    "ative": A,
    "ators": A,
    "atory": A,
    "ature": E,
    "early": Y,
    "ehood": A,
    "eless": A,
    "elily": A,
    "ement": A,
    "enced": A,
    "ences": A,
    "eness": E,
    "ening": E,
    "ental": A,
    "ented": C,
    "ently": A,
    "fully": A,
    "ially": A,
    "icant": A,
    "ician": A,
    "icide": A,
    "icism": A,
    "icist": A,
    "icity": A,
    "idine": I,
    "iedly": A,
    "ihood": A,
    "inate": A,
    "iness": A,
    "ingly": B,
    "inism": J,
    "inity": c,
    "ional": A,
    "ioned": A,
    "ished": A,
    "istic": A,
    "ities": A,
    "itous": A,
    "ively": A,
    "ivity": A,
    "izers": F,
    "izing": F,
    "oidal": A,
    "oides": A,
    "otide": A,
    "ously": A,
}
m[4] = {
    "able": A,
    "ably": A,
    "ages": B,
    "ally": B,
    "ance": B,
    "ancy": B,
    "ants": B,
    "aric": A,
    "arly": K,
    "ated": I,
    "ates": A,
    "atic": B,
    "ator": A,
    "ealy": Y,
    "edly": E,
    "eful": A,
    "eity": A,
    "ence": A,
    "ency": A,
    "ened": E,
    "enly": E,
    "eous": A,
    "hood": A,
    "ials": A,
    "ians": A,
    "ible": A,
    "ibly": A,
    "ical": A,
    "ides": L,
    "iers": A,
    "iful": A,
    "ines": M,
    "ings": N,
    "ions": B,
    "ious": A,
    "isms": B,
    "ists": A,
    "itic": H,
    "ized": F,
    "izer": F,
    "less": A,
    "lily": A,
    "ness": A,
    "ogen": A,
    "ward": A,
    "wise": A,
    "ying": B,
    "yish": A,
}
m[3] = {
    "acy": A,
    "age": B,
    "aic": A,
    "als": b,
    "ant": B,
    "ars": O,
    "ary": F,
    "ata": A,
    "ate": A,
    "eal": Y,
    "ear": Y,
    "ely": E,
    "ene": E,
    "ent": C,
    "ery": E,
    "ese": A,
    "ful": A,
    "ial": A,
    "ian": A,
    "ics": A,
    "ide": L,
    "ied": A,
    "ier": A,
    "ies": P,
    "ily": A,
    "ine": M,
    "ing": N,
    "ion": Q,
    "ish": C,
    "ism": B,
    "ist": A,
    "ite": a,
    "ity": A,
    "ium": A,
    "ive": A,
    "ize": F,
    "oid": A,
    "one": R,
    "ous": A,
}
m[2] = {
    "ae": A,
    "al": b,
    "ar": X,
    "as": B,
    "ed": E,
    "en": F,
    "es": E,
    "ia": A,
    "ic": A,
    "is": A,
    "ly": B,
    "on": S,
    "or": T,
    "um": U,
    "us": V,
    "yl": R,
    "s'": A,
    "'s": A,
}
m[1] = {
    "a": A,
    "e": A,
    "i": A,
    "o": A,
    "s": W,
    "y": B,
}


def remove_ending(word):
    """Strip the longest known ending from ``word`` and return the stem.

    Endings are tried from 11 characters down to 1. An ending is only
    considered when more than one character would remain, and it is removed
    only if the condition registered for it in ``m`` accepts the remaining
    stem. If no ending qualifies, ``word`` is returned unchanged.
    """
    word_len = len(word)
    for size in range(11, 0, -1):
        stem_len = word_len - size
        if stem_len <= 1:
            # Removing this ending would leave fewer than 2 characters.
            continue
        condition = m[size].get(word[stem_len:])
        if not condition:
            continue
        candidate = word[:stem_len]
        if condition(candidate):
            return candidate
    return word


# Recoding rules applied to a stem after its ending has been removed.
# Each rule is ``(target, replacement)`` or
# ``(target, replacement, exceptafter)``: a stem ending in ``target`` has that
# ending replaced by ``replacement``, unless the character immediately before
# ``target`` is one of the characters in ``exceptafter``.
_endings = (("iev", "ief"),
            ("uct", "uc"),
            ("umpt", "um"),
            ("rpt", "rb"),
            ("urs", "ur"),
            ("istr", "ister"),
            ("metr", "meter"),
            ("olv", "olut"),
            ("ul", "l", "aoi"),
            ("bex", "bic"),
            ("dex", "dic"),
            ("pex", "pic"),
            ("tex", "tic"),
            ("ax", "ac"),
            ("ex", "ec"),
            ("ix", "ic"),
            ("lux", "luc"),
            ("uad", "uas"),
            ("vad", "vas"),
            ("cid", "cis"),
            ("lid", "lis"),
            ("erid", "eris"),
            ("pand", "pans"),
            ("end", "ens", "s"),
            ("ond", "ons"),
            ("lud", "lus"),
            ("rud", "rus"),
            ("her", "hes", "pt"),
            ("mit", "mis"),
            ("ent", "ens", "m"),
            ("ert", "ers"),
            ("et", "es", "n"),
            ("yt", "ys"),
            ("yz", "ys"))


# Hash the ending rules by the last letter of the target ending
_endingrules = defaultdict(list)
for rule in _endings:
    _endingrules[rule[0][-1]].append(rule)

_doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt"))


def fix_ending(word):
    """Apply the recoding rules to a stem and return the result.

    A trailing doubled consonant listed in ``_doubles`` is first reduced to a
    single letter. Then the first rule in ``_endings`` whose target matches
    the end of the stem is applied. If that rule has an exception list and
    the character before the target is in it, the stem is returned as it
    stands. An empty ``word`` is returned unchanged.
    """
    if not word:
        return word

    if word[-2:] in _doubles:
        word = word[:-1]

    # .get() rather than indexing: indexing the defaultdict would insert an
    # empty list for every last letter that has no rules.
    for endingrule in _endingrules.get(word[-1], ()):
        target, newend = endingrule[:2]
        if not word.endswith(target):
            continue

        # The exception can only apply when a character precedes the target;
        # when the stem is exactly the target there is nothing to inspect.
        if len(endingrule) > 2 and len(word) > len(target):
            exceptafter = endingrule[2]
            if word[-(len(target) + 1)] in exceptafter:
                return word

        return word[:-len(target)] + newend

    return word


def stem(word):
    """Return the Lovins stem of ``word``.

    The ending is stripped with ``remove_ending()`` and the remaining stem is
    then recoded with ``fix_ending()``.
    """
    stripped = remove_ending(word)
    return fix_ending(stripped)