# finnish.py (9.8 KB)


from whoosh.compat import u

from .bases import _StandardStemmer
  3. class FinnishStemmer(_StandardStemmer):
  4. """
  5. The Finnish Snowball stemmer.
  6. :cvar __vowels: The Finnish vowels.
  7. :type __vowels: unicode
  8. :cvar __restricted_vowels: A subset of the Finnish vowels.
  9. :type __restricted_vowels: unicode
  10. :cvar __long_vowels: The Finnish vowels in their long forms.
  11. :type __long_vowels: tuple
  12. :cvar __consonants: The Finnish consonants.
  13. :type __consonants: unicode
  14. :cvar __double_consonants: The Finnish double consonants.
  15. :type __double_consonants: tuple
  16. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  17. :type __step1_suffixes: tuple
  18. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  19. :type __step2_suffixes: tuple
  20. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  21. :type __step3_suffixes: tuple
  22. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  23. :type __step4_suffixes: tuple
  24. :note: A detailed description of the Finnish
  25. stemming algorithm can be found under
  26. http://snowball.tartarus.org/algorithms/finnish/stemmer.html
  27. """
  28. __vowels = u("aeiouy\xE4\xF6")
  29. __restricted_vowels = u("aeiou\xE4\xF6")
  30. __long_vowels = ("aa", "ee", "ii", "oo", "uu", u("\xE4\xE4"),
  31. u("\xF6\xF6"))
  32. __consonants = "bcdfghjklmnpqrstvwxz"
  33. __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
  34. "kk", "ll", "mm", "nn", "pp", "qq", "rr",
  35. "ss", "tt", "vv", "ww", "xx", "zz")
  36. __step1_suffixes = ('kaan', u('k\xE4\xE4n'), 'sti', 'kin', 'han',
  37. u('h\xE4n'), 'ko', u('k\xF6'), 'pa', u('p\xE4'))
  38. __step2_suffixes = ('nsa', u('ns\xE4'), 'mme', 'nne', 'si', 'ni',
  39. 'an', u('\xE4n'), 'en')
  40. __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin',
  41. 'hon', u('h\xE4n'), u('h\xF6n'), 'den', 'tta',
  42. u('tt\xE4'), 'ssa', u('ss\xE4'), 'sta',
  43. u('st\xE4'), 'lla', u('ll\xE4'), 'lta',
  44. u('lt\xE4'), 'lle', 'ksi', 'ine', 'ta',
  45. u('t\xE4'), 'na', u('n\xE4'), 'a', u('\xE4'),
  46. 'n')
  47. __step4_suffixes = ('impi', 'impa', u('imp\xE4'), 'immi', 'imma',
  48. u('imm\xE4'), 'mpi', 'mpa', u('mp\xE4'), 'mmi',
  49. 'mma', u('mm\xE4'), 'eja', u('ej\xE4'))
  50. def stem(self, word):
  51. """
  52. Stem a Finnish word and return the stemmed form.
  53. :param word: The word that is stemmed.
  54. :type word: str or unicode
  55. :return: The stemmed form.
  56. :rtype: unicode
  57. """
  58. word = word.lower()
  59. step3_success = False
  60. r1, r2 = self._r1r2_standard(word, self.__vowels)
  61. # STEP 1: Particles etc.
  62. for suffix in self.__step1_suffixes:
  63. if r1.endswith(suffix):
  64. if suffix == "sti":
  65. if suffix in r2:
  66. word = word[:-3]
  67. r1 = r1[:-3]
  68. r2 = r2[:-3]
  69. else:
  70. if word[-len(suffix) - 1] in u("ntaeiouy\xE4\xF6"):
  71. word = word[:-len(suffix)]
  72. r1 = r1[:-len(suffix)]
  73. r2 = r2[:-len(suffix)]
  74. break
  75. # STEP 2: Possessives
  76. for suffix in self.__step2_suffixes:
  77. if r1.endswith(suffix):
  78. if suffix == "si":
  79. if word[-3] != "k":
  80. word = word[:-2]
  81. r1 = r1[:-2]
  82. r2 = r2[:-2]
  83. elif suffix == "ni":
  84. word = word[:-2]
  85. r1 = r1[:-2]
  86. r2 = r2[:-2]
  87. if word.endswith("kse"):
  88. word = "".join((word[:-3], "ksi"))
  89. if r1.endswith("kse"):
  90. r1 = "".join((r1[:-3], "ksi"))
  91. if r2.endswith("kse"):
  92. r2 = "".join((r2[:-3], "ksi"))
  93. elif suffix == "an":
  94. if (word[-4:-2] in ("ta", "na") or
  95. word[-5:-2] in ("ssa", "sta", "lla", "lta")):
  96. word = word[:-2]
  97. r1 = r1[:-2]
  98. r2 = r2[:-2]
  99. elif suffix == u("\xE4n"):
  100. if (word[-4:-2] in (u("t\xE4"), u("n\xE4")) or
  101. word[-5:-2] in (u("ss\xE4"), u("st\xE4"),
  102. u("ll\xE4"), u("lt\xE4"))):
  103. word = word[:-2]
  104. r1 = r1[:-2]
  105. r2 = r2[:-2]
  106. elif suffix == "en":
  107. if word[-5:-2] in ("lle", "ine"):
  108. word = word[:-2]
  109. r1 = r1[:-2]
  110. r2 = r2[:-2]
  111. else:
  112. word = word[:-3]
  113. r1 = r1[:-3]
  114. r2 = r2[:-3]
  115. break
  116. # STEP 3: Cases
  117. for suffix in self.__step3_suffixes:
  118. if r1.endswith(suffix):
  119. if suffix in ("han", "hen", "hin", "hon", u("h\xE4n"),
  120. u("h\xF6n")):
  121. if ((suffix == "han" and word[-4] == "a") or
  122. (suffix == "hen" and word[-4] == "e") or
  123. (suffix == "hin" and word[-4] == "i") or
  124. (suffix == "hon" and word[-4] == "o") or
  125. (suffix == u("h\xE4n") and word[-4] == u("\xE4")) or
  126. (suffix == u("h\xF6n") and word[-4] == u("\xF6"))):
  127. word = word[:-3]
  128. r1 = r1[:-3]
  129. r2 = r2[:-3]
  130. step3_success = True
  131. elif suffix in ("siin", "den", "tten"):
  132. if (word[-len(suffix) - 1] == "i" and
  133. word[-len(suffix) - 2] in self.__restricted_vowels):
  134. word = word[:-len(suffix)]
  135. r1 = r1[:-len(suffix)]
  136. r2 = r2[:-len(suffix)]
  137. step3_success = True
  138. else:
  139. continue
  140. elif suffix == "seen":
  141. if word[-6:-4] in self.__long_vowels:
  142. word = word[:-4]
  143. r1 = r1[:-4]
  144. r2 = r2[:-4]
  145. step3_success = True
  146. else:
  147. continue
  148. elif suffix in ("a", u("\xE4")):
  149. if word[-2] in self.__vowels and word[-3] in self.__consonants:
  150. word = word[:-1]
  151. r1 = r1[:-1]
  152. r2 = r2[:-1]
  153. step3_success = True
  154. elif suffix in ("tta", u("tt\xE4")):
  155. if word[-4] == "e":
  156. word = word[:-3]
  157. r1 = r1[:-3]
  158. r2 = r2[:-3]
  159. step3_success = True
  160. elif suffix == "n":
  161. word = word[:-1]
  162. r1 = r1[:-1]
  163. r2 = r2[:-1]
  164. step3_success = True
  165. if word[-2:] == "ie" or word[-2:] in self.__long_vowels:
  166. word = word[:-1]
  167. r1 = r1[:-1]
  168. r2 = r2[:-1]
  169. else:
  170. word = word[:-len(suffix)]
  171. r1 = r1[:-len(suffix)]
  172. r2 = r2[:-len(suffix)]
  173. step3_success = True
  174. break
  175. # STEP 4: Other endings
  176. for suffix in self.__step4_suffixes:
  177. if r2.endswith(suffix):
  178. if suffix in ("mpi", "mpa", u("mp\xE4"), "mmi", "mma",
  179. u("mm\xE4")):
  180. if word[-5:-3] != "po":
  181. word = word[:-3]
  182. r1 = r1[:-3]
  183. r2 = r2[:-3]
  184. else:
  185. word = word[:-len(suffix)]
  186. r1 = r1[:-len(suffix)]
  187. r2 = r2[:-len(suffix)]
  188. break
  189. # STEP 5: Plurals
  190. if step3_success and len(r1) >= 1 and r1[-1] in "ij":
  191. word = word[:-1]
  192. r1 = r1[:-1]
  193. elif (not step3_success and len(r1) >= 2 and
  194. r1[-1] == "t" and r1[-2] in self.__vowels):
  195. word = word[:-1]
  196. r1 = r1[:-1]
  197. r2 = r2[:-1]
  198. if r2.endswith("imma"):
  199. word = word[:-4]
  200. r1 = r1[:-4]
  201. elif r2.endswith("mma") and r2[-5:-3] != "po":
  202. word = word[:-3]
  203. r1 = r1[:-3]
  204. # STEP 6: Tidying up
  205. if r1[-2:] in self.__long_vowels:
  206. word = word[:-1]
  207. r1 = r1[:-1]
  208. if (len(r1) >= 2 and r1[-2] in self.__consonants and
  209. r1[-1] in u("a\xE4ei")):
  210. word = word[:-1]
  211. r1 = r1[:-1]
  212. if r1.endswith(("oj", "uj")):
  213. word = word[:-1]
  214. r1 = r1[:-1]
  215. if r1.endswith("jo"):
  216. word = word[:-1]
  217. r1 = r1[:-1]
  218. # If the word ends with a double consonant
  219. # followed by zero or more vowels, the last consonant is removed.
  220. for i in range(1, len(word)):
  221. if word[-i] in self.__vowels:
  222. continue
  223. else:
  224. if i == 1:
  225. if word[-i - 1:] in self.__double_consonants:
  226. word = word[:-1]
  227. else:
  228. if word[-i - 1:-i + 1] in self.__double_consonants:
  229. word = "".join((word[:-i], word[-i + 1:]))
  230. break
  231. return word