french.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. from .bases import _StandardStemmer
  2. from whoosh.compat import u
  3. class FrenchStemmer(_StandardStemmer):
  4. """
  5. The French Snowball stemmer.
  6. :cvar __vowels: The French vowels.
  7. :type __vowels: unicode
  8. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  9. :type __step1_suffixes: tuple
  10. :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
  11. :type __step2a_suffixes: tuple
  12. :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
  13. :type __step2b_suffixes: tuple
  14. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  15. :type __step4_suffixes: tuple
  16. :note: A detailed description of the French
  17. stemming algorithm can be found under
  18. http://snowball.tartarus.org/algorithms/french/stemmer.html
  19. """
  20. __vowels = u("aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9")
  21. __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice',
  22. 'ateurs', 'ations', 'logies', 'usions',
  23. 'utions', 'ements', 'amment', 'emment',
  24. 'ances', 'iqUes', 'ismes', 'ables', 'istes',
  25. 'ateur', 'ation', 'logie', 'usion', 'ution',
  26. 'ences', 'ement', 'euses', 'ments', 'ance',
  27. 'iqUe', 'isme', 'able', 'iste', 'ence',
  28. u('it\xE9s'), 'ives', 'eaux', 'euse', 'ment',
  29. 'eux', u('it\xE9'), 'ive', 'ifs', 'aux', 'if')
  30. __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante',
  31. 'issants', 'issions', 'irions', 'issais',
  32. 'issait', 'issant', 'issent', 'issiez', 'issons',
  33. 'irais', 'irait', 'irent', 'iriez', 'irons',
  34. 'iront', 'isses', 'issez', u('\xEEmes'),
  35. u('\xEEtes'), 'irai', 'iras', 'irez', 'isse',
  36. 'ies', 'ira', u('\xEEt'), 'ie', 'ir', 'is',
  37. 'it', 'i')
  38. __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent',
  39. 'assiez', u('\xE8rent'), 'erais', 'erait',
  40. 'eriez', 'erons', 'eront', 'aIent', 'antes',
  41. 'asses', 'ions', 'erai', 'eras', 'erez',
  42. u('\xE2mes'), u('\xE2tes'), 'ante', 'ants',
  43. 'asse', u('\xE9es'), 'era', 'iez', 'ais',
  44. 'ait', 'ant', u('\xE9e'), u('\xE9s'), 'er',
  45. 'ez', u('\xE2t'), 'ai', 'as', u('\xE9'), 'a')
  46. __step4_suffixes = (u('i\xE8re'), u('I\xE8re'), 'ion', 'ier', 'Ier',
  47. 'e', u('\xEB'))
  48. def stem(self, word):
  49. """
  50. Stem a French word and return the stemmed form.
  51. :param word: The word that is stemmed.
  52. :type word: str or unicode
  53. :return: The stemmed form.
  54. :rtype: unicode
  55. """
  56. word = word.lower()
  57. step1_success = False
  58. rv_ending_found = False
  59. step2a_success = False
  60. step2b_success = False
  61. # Every occurrence of 'u' after 'q' is put into upper case.
  62. for i in range(1, len(word)):
  63. if word[i - 1] == "q" and word[i] == "u":
  64. word = "".join((word[:i], "U", word[i + 1:]))
  65. # Every occurrence of 'u' and 'i'
  66. # between vowels is put into upper case.
  67. # Every occurrence of 'y' preceded or
  68. # followed by a vowel is also put into upper case.
  69. for i in range(1, len(word) - 1):
  70. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  71. if word[i] == "u":
  72. word = "".join((word[:i], "U", word[i + 1:]))
  73. elif word[i] == "i":
  74. word = "".join((word[:i], "I", word[i + 1:]))
  75. if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels:
  76. if word[i] == "y":
  77. word = "".join((word[:i], "Y", word[i + 1:]))
  78. r1, r2 = self._r1r2_standard(word, self.__vowels)
  79. rv = self.__rv_french(word, self.__vowels)
  80. # STEP 1: Standard suffix removal
  81. for suffix in self.__step1_suffixes:
  82. if word.endswith(suffix):
  83. if suffix == "eaux":
  84. word = word[:-1]
  85. step1_success = True
  86. elif suffix in ("euse", "euses"):
  87. if suffix in r2:
  88. word = word[:-len(suffix)]
  89. step1_success = True
  90. elif suffix in r1:
  91. word = "".join((word[:-len(suffix)], "eux"))
  92. step1_success = True
  93. elif suffix in ("ement", "ements") and suffix in rv:
  94. word = word[:-len(suffix)]
  95. step1_success = True
  96. if word[-2:] == "iv" and "iv" in r2:
  97. word = word[:-2]
  98. if word[-2:] == "at" and "at" in r2:
  99. word = word[:-2]
  100. elif word[-3:] == "eus":
  101. if "eus" in r2:
  102. word = word[:-3]
  103. elif "eus" in r1:
  104. word = "".join((word[:-1], "x"))
  105. elif word[-3:] in ("abl", "iqU"):
  106. if "abl" in r2 or "iqU" in r2:
  107. word = word[:-3]
  108. elif word[-3:] in (u("i\xE8r"), u("I\xE8r")):
  109. if u("i\xE8r") in rv or u("I\xE8r") in rv:
  110. word = "".join((word[:-3], "i"))
  111. elif suffix == "amment" and suffix in rv:
  112. word = "".join((word[:-6], "ant"))
  113. rv = "".join((rv[:-6], "ant"))
  114. rv_ending_found = True
  115. elif suffix == "emment" and suffix in rv:
  116. word = "".join((word[:-6], "ent"))
  117. rv_ending_found = True
  118. elif (suffix in ("ment", "ments") and suffix in rv and
  119. not rv.startswith(suffix) and
  120. rv[rv.rindex(suffix) - 1] in self.__vowels):
  121. word = word[:-len(suffix)]
  122. rv = rv[:-len(suffix)]
  123. rv_ending_found = True
  124. elif suffix == "aux" and suffix in r1:
  125. word = "".join((word[:-2], "l"))
  126. step1_success = True
  127. elif (suffix in ("issement", "issements") and suffix in r1
  128. and word[-len(suffix) - 1] not in self.__vowels):
  129. word = word[:-len(suffix)]
  130. step1_success = True
  131. elif suffix in ("ance", "iqUe", "isme", "able", "iste",
  132. "eux", "ances", "iqUes", "ismes",
  133. "ables", "istes") and suffix in r2:
  134. word = word[:-len(suffix)]
  135. step1_success = True
  136. elif suffix in ("atrice", "ateur", "ation", "atrices",
  137. "ateurs", "ations") and suffix in r2:
  138. word = word[:-len(suffix)]
  139. step1_success = True
  140. if word[-2:] == "ic":
  141. if "ic" in r2:
  142. word = word[:-2]
  143. else:
  144. word = "".join((word[:-2], "iqU"))
  145. elif suffix in ("logie", "logies") and suffix in r2:
  146. word = "".join((word[:-len(suffix)], "log"))
  147. step1_success = True
  148. elif (suffix in ("usion", "ution", "usions", "utions") and
  149. suffix in r2):
  150. word = "".join((word[:-len(suffix)], "u"))
  151. step1_success = True
  152. elif suffix in ("ence", "ences") and suffix in r2:
  153. word = "".join((word[:-len(suffix)], "ent"))
  154. step1_success = True
  155. elif suffix in (u("it\xE9"), u("it\xE9s")) and suffix in r2:
  156. word = word[:-len(suffix)]
  157. step1_success = True
  158. if word[-4:] == "abil":
  159. if "abil" in r2:
  160. word = word[:-4]
  161. else:
  162. word = "".join((word[:-2], "l"))
  163. elif word[-2:] == "ic":
  164. if "ic" in r2:
  165. word = word[:-2]
  166. else:
  167. word = "".join((word[:-2], "iqU"))
  168. elif word[-2:] == "iv":
  169. if "iv" in r2:
  170. word = word[:-2]
  171. elif (suffix in ("if", "ive", "ifs", "ives") and
  172. suffix in r2):
  173. word = word[:-len(suffix)]
  174. step1_success = True
  175. if word[-2:] == "at" and "at" in r2:
  176. word = word[:-2]
  177. if word[-2:] == "ic":
  178. if "ic" in r2:
  179. word = word[:-2]
  180. else:
  181. word = "".join((word[:-2], "iqU"))
  182. break
  183. # STEP 2a: Verb suffixes beginning 'i'
  184. if not step1_success or rv_ending_found:
  185. for suffix in self.__step2a_suffixes:
  186. if word.endswith(suffix):
  187. if (suffix in rv and len(rv) > len(suffix) and
  188. rv[rv.rindex(suffix) - 1] not in self.__vowels):
  189. word = word[:-len(suffix)]
  190. step2a_success = True
  191. break
  192. # STEP 2b: Other verb suffixes
  193. if not step2a_success:
  194. for suffix in self.__step2b_suffixes:
  195. if rv.endswith(suffix):
  196. if suffix == "ions" and "ions" in r2:
  197. word = word[:-4]
  198. step2b_success = True
  199. elif suffix in ('eraIent', 'erions', u('\xE8rent'),
  200. 'erais', 'erait', 'eriez',
  201. 'erons', 'eront', 'erai', 'eras',
  202. 'erez', u('\xE9es'), 'era', 'iez',
  203. u('\xE9e'), u('\xE9s'), 'er', 'ez',
  204. u('\xE9')):
  205. word = word[:-len(suffix)]
  206. step2b_success = True
  207. elif suffix in ('assions', 'assent', 'assiez',
  208. 'aIent', 'antes', 'asses',
  209. u('\xE2mes'), u('\xE2tes'), 'ante',
  210. 'ants', 'asse', 'ais', 'ait',
  211. 'ant', u('\xE2t'), 'ai', 'as',
  212. 'a'):
  213. word = word[:-len(suffix)]
  214. rv = rv[:-len(suffix)]
  215. step2b_success = True
  216. if rv.endswith("e"):
  217. word = word[:-1]
  218. break
  219. # STEP 3
  220. if step1_success or step2a_success or step2b_success:
  221. if word[-1] == "Y":
  222. word = "".join((word[:-1], "i"))
  223. elif word[-1] == u("\xE7"):
  224. word = "".join((word[:-1], "c"))
  225. # STEP 4: Residual suffixes
  226. else:
  227. if (len(word) >= 2 and word[-1] == "s" and
  228. word[-2] not in u("aiou\xE8s")):
  229. word = word[:-1]
  230. for suffix in self.__step4_suffixes:
  231. if word.endswith(suffix):
  232. if suffix in rv:
  233. if (suffix == "ion" and suffix in r2 and
  234. rv[-4] in "st"):
  235. word = word[:-3]
  236. elif suffix in ("ier", u("i\xE8re"), "Ier",
  237. u("I\xE8re")):
  238. word = "".join((word[:-len(suffix)], "i"))
  239. elif suffix == "e":
  240. word = word[:-1]
  241. elif suffix == u("\xEB") and word[-3:-1] == "gu":
  242. word = word[:-1]
  243. break
  244. # STEP 5: Undouble
  245. if word.endswith(("enn", "onn", "ett", "ell", "eill")):
  246. word = word[:-1]
  247. # STEP 6: Un-accent
  248. for i in range(1, len(word)):
  249. if word[-i] not in self.__vowels:
  250. i += 1
  251. else:
  252. if i != 1 and word[-i] in (u("\xE9"), u("\xE8")):
  253. word = "".join((word[:-i], "e", word[-i + 1:]))
  254. break
  255. word = (word.replace("I", "i")
  256. .replace("U", "u")
  257. .replace("Y", "y"))
  258. return word
  259. def __rv_french(self, word, vowels):
  260. """
  261. Return the region RV that is used by the French stemmer.
  262. If the word begins with two vowels, RV is the region after
  263. the third letter. Otherwise, it is the region after the first
  264. vowel not at the beginning of the word, or the end of the word
  265. if these positions cannot be found. (Exceptionally, u'par',
  266. u'col' or u'tap' at the beginning of a word is also taken to
  267. define RV as the region to their right.)
  268. :param word: The French word whose region RV is determined.
  269. :type word: str or unicode
  270. :param vowels: The French vowels that are used to determine
  271. the region RV.
  272. :type vowels: unicode
  273. :return: the region RV for the respective French word.
  274. :rtype: unicode
  275. :note: This helper method is invoked by the stem method of
  276. the subclass FrenchStemmer. It is not to be invoked directly!
  277. """
  278. rv = ""
  279. if len(word) >= 2:
  280. if (word.startswith(("par", "col", "tap")) or
  281. (word[0] in vowels and word[1] in vowels)):
  282. rv = word[3:]
  283. else:
  284. for i in range(1, len(word)):
  285. if word[i] in vowels:
  286. rv = word[i + 1:]
  287. break
  288. return rv