123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465 |
- from .bases import _StandardStemmer
- from whoosh.compat import u
class EnglishStemmer(_StandardStemmer):
    """
    The English Snowball stemmer.

    All class attributes are name-mangled (double leading underscore) and
    are therefore private to this class.

    :cvar __vowels: The English vowels.
    :type __vowels: unicode
    :cvar __double_consonants: The English double consonants.
    :type __double_consonants: tuple
    :cvar __li_ending: Letters that may directly appear before a word
        final 'li'.
    :type __li_ending: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the
        algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the
        algorithm.
    :type __step1a_suffixes: tuple
    :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the
        algorithm.
    :type __step1b_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the
        algorithm.
    :type __step2_suffixes: tuple
    :cvar __step2_rewrites: The step 2 suffixes that are rewritten instead
        of merely shortened, mapped to ``(replacement, r2_fallback)``.
    :type __step2_rewrites: dict
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the
        algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the
        algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the
        algorithm.
    :type __step5_suffixes: tuple
    :cvar __special_words: A dictionary containing words
        which have to be stemmed specially.
    :type __special_words: dict
    :note: A detailed description of the English
        stemming algorithm can be found under
        http://snowball.tartarus.org/algorithms/english/stemmer.html
    """

    __vowels = "aeiouy"
    __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn",
                           "pp", "rr", "tt")
    __li_ending = "cdeghkmnrt"

    # Every suffix tuple is ordered so that a longer suffix is always tried
    # before any shorter suffix it ends with; stem() stops at the first
    # match in each step.
    __step0_suffixes = ("'s'", "'s", "'")
    __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
    __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
    __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness',
                        'iveness', 'tional', 'biliti', 'lessli',
                        'entli', 'ation', 'alism', 'aliti', 'ousli',
                        'iviti', 'fulli', 'enci', 'anci', 'abli',
                        'izer', 'ator', 'alli', 'bli', 'ogi', 'li')
    # suffix -> (replacement, value r2 takes when it is shorter than the
    # suffix).  The "e" fallbacks cover the case where only the final "e"
    # of the new "ate"/"ive" ending is left inside R2.
    __step2_rewrites = {"izer": ("ize", ""),
                        "ization": ("ize", ""),
                        "ational": ("ate", "e"),
                        "ation": ("ate", "e"),
                        "ator": ("ate", "e"),
                        "alism": ("al", ""),
                        "aliti": ("al", ""),
                        "alli": ("al", ""),
                        "ousli": ("ous", ""),
                        "ousness": ("ous", ""),
                        "iveness": ("ive", "e"),
                        "iviti": ("ive", "e"),
                        "biliti": ("ble", ""),
                        "bli": ("ble", "")}
    __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti',
                        'ative', 'ical', 'ness', 'ful')
    __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment',
                        'ant', 'ent', 'ism', 'ate', 'iti', 'ous',
                        'ive', 'ize', 'ion', 'al', 'er', 'ic')
    # Not referenced by stem(): step 5 tests for "e" and "l" directly.
    __step5_suffixes = ("e", "l")
    __special_words = {"skis": "ski",
                       "skies": "sky",
                       "dying": "die",
                       "lying": "lie",
                       "tying": "tie",
                       "idly": "idl",
                       "gently": "gentl",
                       "ugly": "ugli",
                       "early": "earli",
                       "only": "onli",
                       "singly": "singl",
                       "sky": "sky",
                       "news": "news",
                       "howe": "howe",
                       "atlas": "atlas",
                       "cosmos": "cosmos",
                       "bias": "bias",
                       "andes": "andes",
                       "inning": "inning",
                       "innings": "inning",
                       "outing": "outing",
                       "outings": "outing",
                       "canning": "canning",
                       "cannings": "canning",
                       "herring": "herring",
                       "herrings": "herring",
                       "earring": "earring",
                       "earrings": "earring",
                       "proceed": "proceed",
                       "proceeds": "proceed",
                       "proceeded": "proceed",
                       "proceeding": "proceed",
                       "exceed": "exceed",
                       "exceeds": "exceed",
                       "exceeded": "exceed",
                       "exceeding": "exceed",
                       "succeed": "succeed",
                       "succeeds": "succeed",
                       "succeeded": "succeed",
                       "succeeding": "succeed"}

    @staticmethod
    def __replace_tail(region, drop, tail, fallback=""):
        """
        Swap the last ``drop`` characters of ``region`` for ``tail``.

        :param region: The string (R1 or R2) to rewrite.
        :param drop: Number of trailing characters to remove.
        :param tail: Text appended after the removal.
        :param fallback: Returned unchanged when ``region`` has fewer than
            ``drop`` characters, i.e. the suffix is not fully inside it.
        :return: The rewritten region.
        """
        if len(region) >= drop:
            return region[:-drop] + tail
        return fallback

    def __ends_in_short_syllable(self, word):
        """
        Tell whether ``word`` ends in a short syllable.

        That is either non-vowel + vowel + non-vowel other than w, x or Y
        at the end of a word of three or more letters, or vowel + non-vowel
        making up a complete two-letter word.

        :param word: The (partially stemmed) word to inspect.
        :return: True if the ending is a short syllable.
        :rtype: bool
        """
        vowels = self.__vowels
        if len(word) >= 3:
            return (word[-1] not in vowels and
                    word[-1] not in "wxY" and
                    word[-2] in vowels and
                    word[-3] not in vowels)
        if len(word) == 2:
            return word[0] in vowels and word[1] not in vowels
        return False

    def stem(self, word):
        """
        Stem an English word and return the stemmed form.

        The word is lower-cased first.  Entries of the special-word table
        are answered directly and words of one or two letters are returned
        as they are; everything else runs through steps 0 to 5 of the
        algorithm, each of which acts on at most one (the longest matching)
        suffix.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        vowels = self.__vowels
        replace_tail = self.__replace_tail

        word = word.lower()

        if word in self.__special_words:
            return self.__special_words[word]

        # The Snowball description of the algorithm leaves words of two
        # letters or less exactly as they are (apostrophes included).
        if len(word) <= 2:
            return word

        # Map the different apostrophe characters to a single consistent one
        apostrophe = u("\x27")
        for variant in (u("\u2019"), u("\u2018"), u("\u201B")):
            word = word.replace(variant, apostrophe)

        if word.startswith(apostrophe):
            word = word[1:]

        # An initial y, or a y directly after a vowel, is written "Y" while
        # the algorithm runs: "Y" is not part of the vowel string, so such a
        # y is treated as a consonant.  It is lower-cased again at the end.
        if word.startswith("y"):
            word = "Y" + word[1:]
        for i in range(1, len(word)):
            if word[i] == "y" and word[i - 1] in vowels:
                word = word[:i] + "Y" + word[i + 1:]

        # R1/R2 are the trailing regions in which the later steps are
        # allowed to remove suffixes.  They are kept in step with ``word``
        # by applying every edit to all three strings.
        if word.startswith(("gener", "commun", "arsen")):
            # For these prefixes R1 starts right behind the prefix.
            r1 = word[6:] if word.startswith("commun") else word[5:]
            r2 = ""
            # R2 starts after the first non-vowel following a vowel in R1.
            for i in range(1, len(r1)):
                if r1[i] not in vowels and r1[i - 1] in vowels:
                    r2 = r1[i + 1:]
                    break
        else:
            r1, r2 = self._r1r2_standard(word, vowels)

        # STEP 0: strip a trailing possessive
        for suffix in self.__step0_suffixes:
            if word.endswith(suffix):
                cut = len(suffix)
                word, r1, r2 = word[:-cut], r1[:-cut], r2[:-cut]
                break

        # STEP 1a: plural-like endings
        for suffix in self.__step1a_suffixes:
            if not word.endswith(suffix):
                continue
            if suffix == "sses":
                cut = 2
            elif suffix in ("ied", "ies"):
                # Keep "ie" when only one letter precedes the suffix
                # ("ties" -> "tie"), otherwise keep just "i".
                cut = 2 if len(word) - len(suffix) > 1 else 1
            elif suffix == "s" and any(ch in vowels for ch in word[:-2]):
                # A vowel must occur somewhere before the letter that
                # directly precedes the "s".
                cut = 1
            else:
                # "us", "ss", or an "s" that may not be removed
                cut = 0
            if cut:
                word, r1, r2 = word[:-cut], r1[:-cut], r2[:-cut]
            break

        # STEP 1b: "eed"/"ed"/"ing" endings and their "-ly" forms
        for suffix in self.__step1b_suffixes:
            if not word.endswith(suffix):
                continue
            cut = len(suffix)
            if suffix in ("eed", "eedly"):
                if r1.endswith(suffix):
                    word = word[:-cut] + "ee"
                    r1 = replace_tail(r1, cut, "ee")
                    r2 = replace_tail(r2, cut, "ee")
            elif any(ch in vowels for ch in word[:-cut]):
                word, r1, r2 = word[:-cut], r1[:-cut], r2[:-cut]
                if word.endswith(("at", "bl", "iz")):
                    word += "e"
                    r1 += "e"
                    # Only extend r2 when the word is long enough for the
                    # new "e" to fall inside R2.
                    if len(word) > 5 or len(r1) >= 3:
                        r2 += "e"
                elif word.endswith(self.__double_consonants):
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
                elif r1 == "" and self.__ends_in_short_syllable(word):
                    # Short word: restore the final "e".  r1 is empty
                    # here, so only r2 could need the extra letter.
                    word += "e"
                    if r2:
                        r2 += "e"
            break

        # STEP 1c: final y/Y after a non-vowel becomes i
        if (len(word) > 2 and
                word[-1] in "yY" and
                word[-2] not in vowels):
            word = word[:-1] + "i"
            r1 = replace_tail(r1, 1, "i")
            r2 = replace_tail(r2, 1, "i")

        # STEP 2: the suffix has to lie inside R1
        for suffix in self.__step2_suffixes:
            if not word.endswith(suffix):
                continue
            if r1.endswith(suffix):
                cut = len(suffix)
                if suffix in ("tional", "entli", "fulli", "lessli"):
                    # drop the final "al" / "li"
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
                elif suffix in ("enci", "anci", "abli"):
                    word = word[:-1] + "e"
                    r1 = replace_tail(r1, 1, "e")
                    r2 = replace_tail(r2, 1, "e")
                elif suffix in self.__step2_rewrites:
                    ending, r2_fallback = self.__step2_rewrites[suffix]
                    word = word[:-cut] + ending
                    r1 = replace_tail(r1, cut, ending)
                    r2 = replace_tail(r2, cut, ending, r2_fallback)
                elif suffix == "fulness":
                    # keep "ful"
                    word, r1, r2 = word[:-4], r1[:-4], r2[:-4]
                elif suffix == "ogi" and word[-4] == "l":
                    word, r1, r2 = word[:-1], r1[:-1], r2[:-1]
                elif suffix == "li" and word[-3] in self.__li_ending:
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
            break

        # STEP 3: the suffix has to lie inside R1
        for suffix in self.__step3_suffixes:
            if not word.endswith(suffix):
                continue
            if r1.endswith(suffix):
                cut = len(suffix)
                if suffix == "tional":
                    word, r1, r2 = word[:-2], r1[:-2], r2[:-2]
                elif suffix == "ational":
                    word = word[:-cut] + "ate"
                    r1 = replace_tail(r1, cut, "ate")
                    r2 = replace_tail(r2, cut, "ate")
                elif suffix == "alize":
                    # keep "al"
                    word, r1, r2 = word[:-3], r1[:-3], r2[:-3]
                elif suffix in ("icate", "iciti", "ical"):
                    word = word[:-cut] + "ic"
                    r1 = replace_tail(r1, cut, "ic")
                    r2 = replace_tail(r2, cut, "ic")
                elif suffix in ("ful", "ness"):
                    word, r1, r2 = word[:-cut], r1[:-cut], r2[:-cut]
                elif suffix == "ative" and r2.endswith(suffix):
                    # "ative" is only removed when it is inside R2 too
                    word, r1, r2 = word[:-5], r1[:-5], r2[:-5]
            break

        # STEP 4: the suffix has to lie inside R2
        for suffix in self.__step4_suffixes:
            if not word.endswith(suffix):
                continue
            if r2.endswith(suffix):
                # "ion" is only removed when preceded by "s" or "t"
                if suffix != "ion" or word[-4] in "st":
                    cut = len(suffix)
                    word, r1, r2 = word[:-cut], r1[:-cut], r2[:-cut]
            break

        # STEP 5: drop a final "l" (after another "l", inside R2) or "e"
        if r2.endswith("l") and word[-2] == "l":
            word = word[:-1]
        elif r2.endswith("e"):
            word = word[:-1]
        elif r1.endswith("e"):
            # Inside R1 only, the "e" goes unless the letters in front of
            # it form a short syllable.
            if len(word) >= 4 and (word[-2] in vowels or
                                   word[-2] in "wxY" or
                                   word[-3] not in vowels or
                                   word[-4] in vowels):
                word = word[:-1]

        # Undo the temporary consonant marking of y.
        return word.replace("Y", "y")
|