hungarian.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. from whoosh.compat import u
  2. class HungarianStemmer(object):
  3. """
  4. The Hungarian Snowball stemmer.
  5. :cvar __vowels: The Hungarian vowels.
  6. :type __vowels: unicode
  7. :cvar __digraphs: The Hungarian digraphs.
  8. :type __digraphs: tuple
  9. :cvar __double_consonants: The Hungarian double consonants.
  10. :type __double_consonants: tuple
  11. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  12. :type __step1_suffixes: tuple
  13. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  14. :type __step2_suffixes: tuple
  15. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  16. :type __step3_suffixes: tuple
  17. :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
  18. :type __step4_suffixes: tuple
  19. :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
  20. :type __step5_suffixes: tuple
  21. :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
  22. :type __step6_suffixes: tuple
  23. :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
  24. :type __step7_suffixes: tuple
  25. :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
  26. :type __step8_suffixes: tuple
  27. :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
  28. :type __step9_suffixes: tuple
  29. :note: A detailed description of the Hungarian
  30. stemming algorithm can be found under
  31. http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
  32. """
  33. __vowels = u("aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB")
  34. __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
  35. __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg",
  36. "ggy", "jj", "kk", "ll", "lly", "mm",
  37. "nn", "nny", "pp", "rr", "ss", "ssz",
  38. "tt", "tty", "vv", "zz", "zzs")
  39. __step1_suffixes = ("al", "el")
  40. __step2_suffixes = (u('k\xE9ppen'), u('onk\xE9nt'), u('enk\xE9nt'),
  41. u('ank\xE9nt'), u('k\xE9pp'), u('k\xE9nt'), 'ban',
  42. 'ben', 'nak', 'nek', 'val', 'vel', u('t\xF3l'),
  43. u('t\xF5l'), u('r\xF3l'), u('r\xF5l'), u('b\xF3l'),
  44. u('b\xF5l'), 'hoz', 'hez', u('h\xF6z'),
  45. u('n\xE1l'), u('n\xE9l'), u('\xE9rt'), 'kor',
  46. 'ba', 'be', 'ra', 're', 'ig', 'at', 'et',
  47. 'ot', u('\xF6t'), 'ul', u('\xFCl'), u('v\xE1'),
  48. u('v\xE9'), 'en', 'on', 'an', u('\xF6n'),
  49. 'n', 't')
  50. __step3_suffixes = (u("\xE1nk\xE9nt"), u("\xE1n"), u("\xE9n"))
  51. __step4_suffixes = ('astul', u('est\xFCl'), u('\xE1stul'),
  52. u('\xE9st\xFCl'), 'stul', u('st\xFCl'))
  53. __step5_suffixes = (u("\xE1"), u("\xE9"))
  54. __step6_suffixes = (u('ok\xE9'), u('\xF6k\xE9'), u('ak\xE9'),
  55. u('ek\xE9'), u('\xE1k\xE9'), u('\xE1\xE9i'),
  56. u('\xE9k\xE9'), u('\xE9\xE9i'), u('k\xE9'),
  57. u('\xE9i'), u('\xE9\xE9'), u('\xE9'))
  58. __step7_suffixes = (u('\xE1juk'), u('\xE9j\xFCk'), u('\xFCnk'),
  59. 'unk', 'juk', u('j\xFCk'), u('\xE1nk'),
  60. u('\xE9nk'), 'nk', 'uk', u('\xFCk'), 'em',
  61. 'om', 'am', 'od', 'ed', 'ad', u('\xF6d'),
  62. 'ja', 'je', u('\xE1m'), u('\xE1d'), u('\xE9m'),
  63. u('\xE9d'), 'm', 'd', 'a', 'e', 'o',
  64. u('\xE1'), u('\xE9'))
  65. __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok',
  66. 'eitek', u('\xE1itok'), u('\xE9itek'), 'jaim',
  67. 'jeim', 'jaid', 'jeid', 'eink', 'aink',
  68. 'itek', 'jeik', 'jaik', u('\xE1ink'),
  69. u('\xE9ink'), 'aim', 'eim', 'aid', 'eid',
  70. 'jai', 'jei', 'ink', 'aik', 'eik',
  71. u('\xE1im'), u('\xE1id'), u('\xE1ik'), u('\xE9im'),
  72. u('\xE9id'), u('\xE9ik'), 'im', 'id', 'ai',
  73. 'ei', 'ik', u('\xE1i'), u('\xE9i'), 'i')
  74. __step9_suffixes = (u("\xE1k"), u("\xE9k"), u("\xF6k"), "ok",
  75. "ek", "ak", "k")
  76. def stem(self, word):
  77. """
  78. Stem an Hungarian word and return the stemmed form.
  79. :param word: The word that is stemmed.
  80. :type word: str or unicode
  81. :return: The stemmed form.
  82. :rtype: unicode
  83. """
  84. word = word.lower()
  85. r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)
  86. # STEP 1: Remove instrumental case
  87. if r1.endswith(self.__step1_suffixes):
  88. for double_cons in self.__double_consonants:
  89. if word[-2 - len(double_cons):-2] == double_cons:
  90. word = "".join((word[:-4], word[-3]))
  91. if r1[-2 - len(double_cons):-2] == double_cons:
  92. r1 = "".join((r1[:-4], r1[-3]))
  93. break
  94. # STEP 2: Remove frequent cases
  95. for suffix in self.__step2_suffixes:
  96. if word.endswith(suffix):
  97. if r1.endswith(suffix):
  98. word = word[:-len(suffix)]
  99. r1 = r1[:-len(suffix)]
  100. if r1.endswith(u("\xE1")):
  101. word = "".join((word[:-1], "a"))
  102. r1 = "".join((r1[:-1], "a"))
  103. elif r1.endswith(u("\xE9")):
  104. word = "".join((word[:-1], "e"))
  105. r1 = "".join((r1[:-1], "e"))
  106. break
  107. # STEP 3: Remove special cases
  108. for suffix in self.__step3_suffixes:
  109. if r1.endswith(suffix):
  110. if suffix == u("\xE9n"):
  111. word = "".join((word[:-2], "e"))
  112. r1 = "".join((r1[:-2], "e"))
  113. else:
  114. word = "".join((word[:-len(suffix)], "a"))
  115. r1 = "".join((r1[:-len(suffix)], "a"))
  116. break
  117. # STEP 4: Remove other cases
  118. for suffix in self.__step4_suffixes:
  119. if r1.endswith(suffix):
  120. if suffix == u("\xE1stul"):
  121. word = "".join((word[:-5], "a"))
  122. r1 = "".join((r1[:-5], "a"))
  123. elif suffix == u("\xE9st\xFCl"):
  124. word = "".join((word[:-5], "e"))
  125. r1 = "".join((r1[:-5], "e"))
  126. else:
  127. word = word[:-len(suffix)]
  128. r1 = r1[:-len(suffix)]
  129. break
  130. # STEP 5: Remove factive case
  131. for suffix in self.__step5_suffixes:
  132. if r1.endswith(suffix):
  133. for double_cons in self.__double_consonants:
  134. if word[-1 - len(double_cons):-1] == double_cons:
  135. word = "".join((word[:-3], word[-2]))
  136. if r1[-1 - len(double_cons):-1] == double_cons:
  137. r1 = "".join((r1[:-3], r1[-2]))
  138. break
  139. # STEP 6: Remove owned
  140. for suffix in self.__step6_suffixes:
  141. if r1.endswith(suffix):
  142. if suffix in (u("\xE1k\xE9"), u("\xE1\xE9i")):
  143. word = "".join((word[:-3], "a"))
  144. r1 = "".join((r1[:-3], "a"))
  145. elif suffix in (u("\xE9k\xE9"), u("\xE9\xE9i"),
  146. u("\xE9\xE9")):
  147. word = "".join((word[:-len(suffix)], "e"))
  148. r1 = "".join((r1[:-len(suffix)], "e"))
  149. else:
  150. word = word[:-len(suffix)]
  151. r1 = r1[:-len(suffix)]
  152. break
  153. # STEP 7: Remove singular owner suffixes
  154. for suffix in self.__step7_suffixes:
  155. if word.endswith(suffix):
  156. if r1.endswith(suffix):
  157. if suffix in (u("\xE1nk"), u("\xE1juk"), u("\xE1m"),
  158. u("\xE1d"), u("\xE1")):
  159. word = "".join((word[:-len(suffix)], "a"))
  160. r1 = "".join((r1[:-len(suffix)], "a"))
  161. elif suffix in (u("\xE9nk"), u("\xE9j\xFCk"),
  162. u("\xE9m"), u("\xE9d"), u("\xE9")):
  163. word = "".join((word[:-len(suffix)], "e"))
  164. r1 = "".join((r1[:-len(suffix)], "e"))
  165. else:
  166. word = word[:-len(suffix)]
  167. r1 = r1[:-len(suffix)]
  168. break
  169. # STEP 8: Remove plural owner suffixes
  170. for suffix in self.__step8_suffixes:
  171. if word.endswith(suffix):
  172. if r1.endswith(suffix):
  173. if suffix in (u("\xE1im"), u("\xE1id"), u("\xE1i"),
  174. u("\xE1ink"), u("\xE1itok"), u("\xE1ik")):
  175. word = "".join((word[:-len(suffix)], "a"))
  176. r1 = "".join((r1[:-len(suffix)], "a"))
  177. elif suffix in (u("\xE9im"), u("\xE9id"), u("\xE9i"),
  178. u("\xE9ink"), u("\xE9itek"), u("\xE9ik")):
  179. word = "".join((word[:-len(suffix)], "e"))
  180. r1 = "".join((r1[:-len(suffix)], "e"))
  181. else:
  182. word = word[:-len(suffix)]
  183. r1 = r1[:-len(suffix)]
  184. break
  185. # STEP 9: Remove plural suffixes
  186. for suffix in self.__step9_suffixes:
  187. if word.endswith(suffix):
  188. if r1.endswith(suffix):
  189. if suffix == u("\xE1k"):
  190. word = "".join((word[:-2], "a"))
  191. elif suffix == u("\xE9k"):
  192. word = "".join((word[:-2], "e"))
  193. else:
  194. word = word[:-len(suffix)]
  195. break
  196. return word
  197. def __r1_hungarian(self, word, vowels, digraphs):
  198. """
  199. Return the region R1 that is used by the Hungarian stemmer.
  200. If the word begins with a vowel, R1 is defined as the region
  201. after the first consonant or digraph (= two letters stand for
  202. one phoneme) in the word. If the word begins with a consonant,
  203. it is defined as the region after the first vowel in the word.
  204. If the word does not contain both a vowel and consonant, R1
  205. is the null region at the end of the word.
  206. :param word: The Hungarian word whose region R1 is determined.
  207. :type word: str or unicode
  208. :param vowels: The Hungarian vowels that are used to determine
  209. the region R1.
  210. :type vowels: unicode
  211. :param digraphs: The digraphs that are used to determine the
  212. region R1.
  213. :type digraphs: tuple
  214. :return: the region R1 for the respective word.
  215. :rtype: unicode
  216. :note: This helper method is invoked by the stem method of the subclass
  217. HungarianStemmer. It is not to be invoked directly!
  218. """
  219. r1 = ""
  220. if word[0] in vowels:
  221. for digraph in digraphs:
  222. if digraph in word[1:]:
  223. r1 = word[word.index(digraph[-1]) + 1:]
  224. return r1
  225. for i in range(1, len(word)):
  226. if word[i] not in vowels:
  227. r1 = word[i + 1:]
  228. break
  229. else:
  230. for i in range(1, len(word)):
  231. if word[i] in vowels:
  232. r1 = word[i + 1:]
  233. break
  234. return r1