english.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. from .bases import _StandardStemmer
  2. from whoosh.compat import u
  3. class EnglishStemmer(_StandardStemmer):
  4. """
  5. The English Snowball stemmer.
  6. :cvar __vowels: The English vowels.
  7. :type __vowels: unicode
  8. :cvar __double_consonants: The English double consonants.
  9. :type __double_consonants: tuple
  10. :cvar __li_ending: Letters that may directly appear before a word final 'li'.
  11. :type __li_ending: unicode
  12. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  13. :type __step0_suffixes: tuple
  14. :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm.
  15. :type __step1a_suffixes: tuple
  16. :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm.
  17. :type __step1b_suffixes: tuple
  18. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  19. :type __step2_suffixes: tuple
  20. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  21. :type __step3_suffixes: tuple
  22. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  23. :type __step4_suffixes: tuple
  24. :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
  25. :type __step5_suffixes: tuple
  26. :cvar __special_words: A dictionary containing words
  27. which have to be stemmed specially.
  28. :type __special_words: dict
  29. :note: A detailed description of the English
  30. stemming algorithm can be found under
  31. http://snowball.tartarus.org/algorithms/english/stemmer.html
  32. """
  33. __vowels = "aeiouy"
  34. __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn",
  35. "pp", "rr", "tt")
  36. __li_ending = "cdeghkmnrt"
  37. __step0_suffixes = ("'s'", "'s", "'")
  38. __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s")
  39. __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed")
  40. __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness',
  41. 'iveness', 'tional', 'biliti', 'lessli',
  42. 'entli', 'ation', 'alism', 'aliti', 'ousli',
  43. 'iviti', 'fulli', 'enci', 'anci', 'abli',
  44. 'izer', 'ator', 'alli', 'bli', 'ogi', 'li')
  45. __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti',
  46. 'ative', 'ical', 'ness', 'ful')
  47. __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment',
  48. 'ant', 'ent', 'ism', 'ate', 'iti', 'ous',
  49. 'ive', 'ize', 'ion', 'al', 'er', 'ic')
  50. __step5_suffixes = ("e", "l")
  51. __special_words = {"skis": "ski",
  52. "skies": "sky",
  53. "dying": "die",
  54. "lying": "lie",
  55. "tying": "tie",
  56. "idly": "idl",
  57. "gently": "gentl",
  58. "ugly": "ugli",
  59. "early": "earli",
  60. "only": "onli",
  61. "singly": "singl",
  62. "sky": "sky",
  63. "news": "news",
  64. "howe": "howe",
  65. "atlas": "atlas",
  66. "cosmos": "cosmos",
  67. "bias": "bias",
  68. "andes": "andes",
  69. "inning": "inning",
  70. "innings": "inning",
  71. "outing": "outing",
  72. "outings": "outing",
  73. "canning": "canning",
  74. "cannings": "canning",
  75. "herring": "herring",
  76. "herrings": "herring",
  77. "earring": "earring",
  78. "earrings": "earring",
  79. "proceed": "proceed",
  80. "proceeds": "proceed",
  81. "proceeded": "proceed",
  82. "proceeding": "proceed",
  83. "exceed": "exceed",
  84. "exceeds": "exceed",
  85. "exceeded": "exceed",
  86. "exceeding": "exceed",
  87. "succeed": "succeed",
  88. "succeeds": "succeed",
  89. "succeeded": "succeed",
  90. "succeeding": "succeed"}
  91. def stem(self, word):
  92. """
  93. Stem an English word and return the stemmed form.
  94. :param word: The word that is stemmed.
  95. :type word: str or unicode
  96. :return: The stemmed form.
  97. :rtype: unicode
  98. """
  99. word = word.lower()
  100. if word in self.__special_words:
  101. return self.__special_words[word]
  102. # Map the different apostrophe characters to a single consistent one
  103. word = (word.replace(u("\u2019"), u("\x27"))
  104. .replace(u("\u2018"), u("\x27"))
  105. .replace(u("\u201B"), u("\x27")))
  106. if word.startswith(u("\x27")):
  107. word = word[1:]
  108. if word.startswith("y"):
  109. word = "".join(("Y", word[1:]))
  110. for i in range(1, len(word)):
  111. if word[i - 1] in self.__vowels and word[i] == "y":
  112. word = "".join((word[:i], "Y", word[i + 1:]))
  113. step1a_vowel_found = False
  114. step1b_vowel_found = False
  115. r1 = ""
  116. r2 = ""
  117. if word.startswith(("gener", "commun", "arsen")):
  118. if word.startswith(("gener", "arsen")):
  119. r1 = word[5:]
  120. else:
  121. r1 = word[6:]
  122. for i in range(1, len(r1)):
  123. if r1[i] not in self.__vowels and r1[i - 1] in self.__vowels:
  124. r2 = r1[i + 1:]
  125. break
  126. else:
  127. r1, r2 = self._r1r2_standard(word, self.__vowels)
  128. # STEP 0
  129. for suffix in self.__step0_suffixes:
  130. if word.endswith(suffix):
  131. word = word[:-len(suffix)]
  132. r1 = r1[:-len(suffix)]
  133. r2 = r2[:-len(suffix)]
  134. break
  135. # STEP 1a
  136. for suffix in self.__step1a_suffixes:
  137. if word.endswith(suffix):
  138. if suffix == "sses":
  139. word = word[:-2]
  140. r1 = r1[:-2]
  141. r2 = r2[:-2]
  142. elif suffix in ("ied", "ies"):
  143. if len(word[:-len(suffix)]) > 1:
  144. word = word[:-2]
  145. r1 = r1[:-2]
  146. r2 = r2[:-2]
  147. else:
  148. word = word[:-1]
  149. r1 = r1[:-1]
  150. r2 = r2[:-1]
  151. elif suffix == "s":
  152. for letter in word[:-2]:
  153. if letter in self.__vowels:
  154. step1a_vowel_found = True
  155. break
  156. if step1a_vowel_found:
  157. word = word[:-1]
  158. r1 = r1[:-1]
  159. r2 = r2[:-1]
  160. break
  161. # STEP 1b
  162. for suffix in self.__step1b_suffixes:
  163. if word.endswith(suffix):
  164. if suffix in ("eed", "eedly"):
  165. if r1.endswith(suffix):
  166. word = "".join((word[:-len(suffix)], "ee"))
  167. if len(r1) >= len(suffix):
  168. r1 = "".join((r1[:-len(suffix)], "ee"))
  169. else:
  170. r1 = ""
  171. if len(r2) >= len(suffix):
  172. r2 = "".join((r2[:-len(suffix)], "ee"))
  173. else:
  174. r2 = ""
  175. else:
  176. for letter in word[:-len(suffix)]:
  177. if letter in self.__vowels:
  178. step1b_vowel_found = True
  179. break
  180. if step1b_vowel_found:
  181. word = word[:-len(suffix)]
  182. r1 = r1[:-len(suffix)]
  183. r2 = r2[:-len(suffix)]
  184. if word.endswith(("at", "bl", "iz")):
  185. word = "".join((word, "e"))
  186. r1 = "".join((r1, "e"))
  187. if len(word) > 5 or len(r1) >= 3:
  188. r2 = "".join((r2, "e"))
  189. elif word.endswith(self.__double_consonants):
  190. word = word[:-1]
  191. r1 = r1[:-1]
  192. r2 = r2[:-1]
  193. elif ((r1 == "" and len(word) >= 3 and
  194. word[-1] not in self.__vowels and
  195. word[-1] not in "wxY" and
  196. word[-2] in self.__vowels and
  197. word[-3] not in self.__vowels)
  198. or
  199. (r1 == "" and len(word) == 2 and
  200. word[0] in self.__vowels and
  201. word[1] not in self.__vowels)):
  202. word = "".join((word, "e"))
  203. if len(r1) > 0:
  204. r1 = "".join((r1, "e"))
  205. if len(r2) > 0:
  206. r2 = "".join((r2, "e"))
  207. break
  208. # STEP 1c
  209. if (len(word) > 2
  210. and word[-1] in "yY"
  211. and word[-2] not in self.__vowels):
  212. word = "".join((word[:-1], "i"))
  213. if len(r1) >= 1:
  214. r1 = "".join((r1[:-1], "i"))
  215. else:
  216. r1 = ""
  217. if len(r2) >= 1:
  218. r2 = "".join((r2[:-1], "i"))
  219. else:
  220. r2 = ""
  221. # STEP 2
  222. for suffix in self.__step2_suffixes:
  223. if word.endswith(suffix):
  224. if r1.endswith(suffix):
  225. if suffix == "tional":
  226. word = word[:-2]
  227. r1 = r1[:-2]
  228. r2 = r2[:-2]
  229. elif suffix in ("enci", "anci", "abli"):
  230. word = "".join((word[:-1], "e"))
  231. if len(r1) >= 1:
  232. r1 = "".join((r1[:-1], "e"))
  233. else:
  234. r1 = ""
  235. if len(r2) >= 1:
  236. r2 = "".join((r2[:-1], "e"))
  237. else:
  238. r2 = ""
  239. elif suffix == "entli":
  240. word = word[:-2]
  241. r1 = r1[:-2]
  242. r2 = r2[:-2]
  243. elif suffix in ("izer", "ization"):
  244. word = "".join((word[:-len(suffix)], "ize"))
  245. if len(r1) >= len(suffix):
  246. r1 = "".join((r1[:-len(suffix)], "ize"))
  247. else:
  248. r1 = ""
  249. if len(r2) >= len(suffix):
  250. r2 = "".join((r2[:-len(suffix)], "ize"))
  251. else:
  252. r2 = ""
  253. elif suffix in ("ational", "ation", "ator"):
  254. word = "".join((word[:-len(suffix)], "ate"))
  255. if len(r1) >= len(suffix):
  256. r1 = "".join((r1[:-len(suffix)], "ate"))
  257. else:
  258. r1 = ""
  259. if len(r2) >= len(suffix):
  260. r2 = "".join((r2[:-len(suffix)], "ate"))
  261. else:
  262. r2 = "e"
  263. elif suffix in ("alism", "aliti", "alli"):
  264. word = "".join((word[:-len(suffix)], "al"))
  265. if len(r1) >= len(suffix):
  266. r1 = "".join((r1[:-len(suffix)], "al"))
  267. else:
  268. r1 = ""
  269. if len(r2) >= len(suffix):
  270. r2 = "".join((r2[:-len(suffix)], "al"))
  271. else:
  272. r2 = ""
  273. elif suffix == "fulness":
  274. word = word[:-4]
  275. r1 = r1[:-4]
  276. r2 = r2[:-4]
  277. elif suffix in ("ousli", "ousness"):
  278. word = "".join((word[:-len(suffix)], "ous"))
  279. if len(r1) >= len(suffix):
  280. r1 = "".join((r1[:-len(suffix)], "ous"))
  281. else:
  282. r1 = ""
  283. if len(r2) >= len(suffix):
  284. r2 = "".join((r2[:-len(suffix)], "ous"))
  285. else:
  286. r2 = ""
  287. elif suffix in ("iveness", "iviti"):
  288. word = "".join((word[:-len(suffix)], "ive"))
  289. if len(r1) >= len(suffix):
  290. r1 = "".join((r1[:-len(suffix)], "ive"))
  291. else:
  292. r1 = ""
  293. if len(r2) >= len(suffix):
  294. r2 = "".join((r2[:-len(suffix)], "ive"))
  295. else:
  296. r2 = "e"
  297. elif suffix in ("biliti", "bli"):
  298. word = "".join((word[:-len(suffix)], "ble"))
  299. if len(r1) >= len(suffix):
  300. r1 = "".join((r1[:-len(suffix)], "ble"))
  301. else:
  302. r1 = ""
  303. if len(r2) >= len(suffix):
  304. r2 = "".join((r2[:-len(suffix)], "ble"))
  305. else:
  306. r2 = ""
  307. elif suffix == "ogi" and word[-4] == "l":
  308. word = word[:-1]
  309. r1 = r1[:-1]
  310. r2 = r2[:-1]
  311. elif suffix in ("fulli", "lessli"):
  312. word = word[:-2]
  313. r1 = r1[:-2]
  314. r2 = r2[:-2]
  315. elif suffix == "li" and word[-3] in self.__li_ending:
  316. word = word[:-2]
  317. r1 = r1[:-2]
  318. r2 = r2[:-2]
  319. break
  320. # STEP 3
  321. for suffix in self.__step3_suffixes:
  322. if word.endswith(suffix):
  323. if r1.endswith(suffix):
  324. if suffix == "tional":
  325. word = word[:-2]
  326. r1 = r1[:-2]
  327. r2 = r2[:-2]
  328. elif suffix == "ational":
  329. word = "".join((word[:-len(suffix)], "ate"))
  330. if len(r1) >= len(suffix):
  331. r1 = "".join((r1[:-len(suffix)], "ate"))
  332. else:
  333. r1 = ""
  334. if len(r2) >= len(suffix):
  335. r2 = "".join((r2[:-len(suffix)], "ate"))
  336. else:
  337. r2 = ""
  338. elif suffix == "alize":
  339. word = word[:-3]
  340. r1 = r1[:-3]
  341. r2 = r2[:-3]
  342. elif suffix in ("icate", "iciti", "ical"):
  343. word = "".join((word[:-len(suffix)], "ic"))
  344. if len(r1) >= len(suffix):
  345. r1 = "".join((r1[:-len(suffix)], "ic"))
  346. else:
  347. r1 = ""
  348. if len(r2) >= len(suffix):
  349. r2 = "".join((r2[:-len(suffix)], "ic"))
  350. else:
  351. r2 = ""
  352. elif suffix in ("ful", "ness"):
  353. word = word[:-len(suffix)]
  354. r1 = r1[:-len(suffix)]
  355. r2 = r2[:-len(suffix)]
  356. elif suffix == "ative" and r2.endswith(suffix):
  357. word = word[:-5]
  358. r1 = r1[:-5]
  359. r2 = r2[:-5]
  360. break
  361. # STEP 4
  362. for suffix in self.__step4_suffixes:
  363. if word.endswith(suffix):
  364. if r2.endswith(suffix):
  365. if suffix == "ion":
  366. if word[-4] in "st":
  367. word = word[:-3]
  368. r1 = r1[:-3]
  369. r2 = r2[:-3]
  370. else:
  371. word = word[:-len(suffix)]
  372. r1 = r1[:-len(suffix)]
  373. r2 = r2[:-len(suffix)]
  374. break
  375. # STEP 5
  376. if r2.endswith("l") and word[-2] == "l":
  377. word = word[:-1]
  378. elif r2.endswith("e"):
  379. word = word[:-1]
  380. elif r1.endswith("e"):
  381. if len(word) >= 4 and (word[-2] in self.__vowels or
  382. word[-2] in "wxY" or
  383. word[-3] not in self.__vowels or
  384. word[-4] in self.__vowels):
  385. word = word[:-1]
  386. word = word.replace("Y", "y")
  387. return word