# russian.py
from whoosh.compat import u
  2. class RussianStemmer(object):
  3. """
  4. The Russian Snowball stemmer.
  5. :cvar __perfective_gerund_suffixes: Suffixes to be deleted.
  6. :type __perfective_gerund_suffixes: tuple
  7. :cvar __adjectival_suffixes: Suffixes to be deleted.
  8. :type __adjectival_suffixes: tuple
  9. :cvar __reflexive_suffixes: Suffixes to be deleted.
  10. :type __reflexive_suffixes: tuple
  11. :cvar __verb_suffixes: Suffixes to be deleted.
  12. :type __verb_suffixes: tuple
  13. :cvar __noun_suffixes: Suffixes to be deleted.
  14. :type __noun_suffixes: tuple
  15. :cvar __superlative_suffixes: Suffixes to be deleted.
  16. :type __superlative_suffixes: tuple
  17. :cvar __derivational_suffixes: Suffixes to be deleted.
  18. :type __derivational_suffixes: tuple
  19. :note: A detailed description of the Russian
  20. stemming algorithm can be found under
  21. http://snowball.tartarus.org/algorithms/russian/stemmer.html
  22. """
  23. __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'",
  24. "ivshi", "yvshi", "vshi", "iv",
  25. "yv", "v")
  26. __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a',
  27. 'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego',
  28. 'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu',
  29. 'ui^ushchikh', 'ui^ushchykh',
  30. 'ui^ushchui^u', 'ui^ushchaia',
  31. 'ui^ushchoi^u', 'ui^ushchei^u',
  32. 'i^ushchi^ui^u', 'i^ushchi^ai^a',
  33. 'ui^ushchee', 'ui^ushchie',
  34. 'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`',
  35. 'ui^ushchii`', 'ui^ushchyi`',
  36. 'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim',
  37. 'ui^ushchym', 'ui^ushchom', 'i^ushchimi',
  38. 'i^ushchymi', 'i^ushchego', 'i^ushchogo',
  39. 'i^ushchemu', 'i^ushchomu', 'i^ushchikh',
  40. 'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a',
  41. 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee',
  42. 'i^ushchie', 'i^ushchye', 'i^ushchoe',
  43. 'i^ushchei`', 'i^ushchii`',
  44. 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem',
  45. 'i^ushchim', 'i^ushchym', 'i^ushchom',
  46. 'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u',
  47. 'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a',
  48. 'shchimi', 'shchymi', 'shchego', 'shchogo',
  49. 'shchemu', 'shchomu', 'shchikh', 'shchykh',
  50. 'shchui^u', 'shchai^a', 'shchoi^u',
  51. 'shchei^u', 'ivshimi', 'ivshymi',
  52. 'ivshego', 'ivshogo', 'ivshemu', 'ivshomu',
  53. 'ivshikh', 'ivshykh', 'ivshui^u',
  54. 'ivshai^a', 'ivshoi^u', 'ivshei^u',
  55. 'yvshimi', 'yvshymi', 'yvshego', 'yvshogo',
  56. 'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh',
  57. 'yvshui^u', 'yvshai^a', 'yvshoi^u',
  58. 'yvshei^u', 'vshi^ui^u', 'vshi^ai^a',
  59. 'shchee', 'shchie', 'shchye', 'shchoe',
  60. 'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
  61. 'shchem', 'shchim', 'shchym', 'shchom',
  62. 'ivshee', 'ivshie', 'ivshye', 'ivshoe',
  63. 'ivshei`', 'ivshii`', 'ivshyi`',
  64. 'ivshoi`', 'ivshem', 'ivshim', 'ivshym',
  65. 'ivshom', 'yvshee', 'yvshie', 'yvshye',
  66. 'yvshoe', 'yvshei`', 'yvshii`',
  67. 'yvshyi`', 'yvshoi`', 'yvshem',
  68. 'yvshim', 'yvshym', 'yvshom', 'vshimi',
  69. 'vshymi', 'vshego', 'vshogo', 'vshemu',
  70. 'vshomu', 'vshikh', 'vshykh', 'vshui^u',
  71. 'vshai^a', 'vshoi^u', 'vshei^u',
  72. 'emi^ui^u', 'emi^ai^a', 'nni^ui^u',
  73. 'nni^ai^a', 'vshee',
  74. 'vshie', 'vshye', 'vshoe', 'vshei`',
  75. 'vshii`', 'vshyi`', 'vshoi`',
  76. 'vshem', 'vshim', 'vshym', 'vshom',
  77. 'emimi', 'emymi', 'emego', 'emogo',
  78. 'ememu', 'emomu', 'emikh', 'emykh',
  79. 'emui^u', 'emai^a', 'emoi^u', 'emei^u',
  80. 'nnimi', 'nnymi', 'nnego', 'nnogo',
  81. 'nnemu', 'nnomu', 'nnikh', 'nnykh',
  82. 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
  83. 'emee', 'emie', 'emye', 'emoe',
  84. 'emei`', 'emii`', 'emyi`',
  85. 'emoi`', 'emem', 'emim', 'emym',
  86. 'emom', 'nnee', 'nnie', 'nnye', 'nnoe',
  87. 'nnei`', 'nnii`', 'nnyi`',
  88. 'nnoi`', 'nnem', 'nnim', 'nnym',
  89. 'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi',
  90. 'ego', 'ogo', 'emu', 'omu', 'ikh',
  91. 'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u',
  92. 'ee', 'ie', 'ye', 'oe', 'ei`',
  93. 'ii`', 'yi`', 'oi`', 'em',
  94. 'im', 'ym', 'om')
  95. __reflexive_suffixes = ("si^a", "s'")
  96. __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut',
  97. "ish'", 'ete', 'i`te', 'i^ut', 'nno',
  98. 'ila', 'yla', 'ena', 'ite', 'ili', 'yli',
  99. 'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny',
  100. "it'", "yt'", 'ui^u', 'la', 'na', 'li',
  101. 'em', 'lo', 'no', 'et', 'ny', "t'",
  102. 'ei`', 'ui`', 'il', 'yl', 'im',
  103. 'ym', 'en', 'it', 'yt', 'i^u', 'i`',
  104. 'l', 'n')
  105. __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh',
  106. 'ami', 'iei`', 'i^am', 'iem', 'akh',
  107. 'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov',
  108. 'ie', "'e", 'ei', 'ii', 'ei`',
  109. 'oi`', 'ii`', 'em', 'am', 'om',
  110. 'i^u', 'i^a', 'a', 'e', 'i', 'i`',
  111. 'o', 'u', 'y', "'")
  112. __superlative_suffixes = ("ei`she", "ei`sh")
  113. __derivational_suffixes = ("ost'", "ost")
  114. def stem(self, word):
  115. """
  116. Stem a Russian word and return the stemmed form.
  117. :param word: The word that is stemmed.
  118. :type word: str or unicode
  119. :return: The stemmed form.
  120. :rtype: unicode
  121. """
  122. chr_exceeded = False
  123. for i in range(len(word)):
  124. if ord(word[i]) > 255:
  125. chr_exceeded = True
  126. break
  127. if chr_exceeded:
  128. word = self.__cyrillic_to_roman(word)
  129. step1_success = False
  130. adjectival_removed = False
  131. verb_removed = False
  132. undouble_success = False
  133. superlative_removed = False
  134. rv, r2 = self.__regions_russian(word)
  135. # Step 1
  136. for suffix in self.__perfective_gerund_suffixes:
  137. if rv.endswith(suffix):
  138. if suffix in ("v", "vshi", "vshis'"):
  139. if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or
  140. rv[-len(suffix) - 1:-len(suffix)] == "a"):
  141. word = word[:-len(suffix)]
  142. r2 = r2[:-len(suffix)]
  143. rv = rv[:-len(suffix)]
  144. step1_success = True
  145. break
  146. else:
  147. word = word[:-len(suffix)]
  148. r2 = r2[:-len(suffix)]
  149. rv = rv[:-len(suffix)]
  150. step1_success = True
  151. break
  152. if not step1_success:
  153. for suffix in self.__reflexive_suffixes:
  154. if rv.endswith(suffix):
  155. word = word[:-len(suffix)]
  156. r2 = r2[:-len(suffix)]
  157. rv = rv[:-len(suffix)]
  158. break
  159. for suffix in self.__adjectival_suffixes:
  160. if rv.endswith(suffix):
  161. if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a',
  162. 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u',
  163. 'i^ushchei^u', 'i^ushchimi', 'i^ushchymi',
  164. 'i^ushchego', 'i^ushchogo', 'i^ushchemu',
  165. 'i^ushchomu', 'i^ushchikh', 'i^ushchykh',
  166. 'shchi^ui^u', 'shchi^ai^a', 'i^ushchee',
  167. 'i^ushchie', 'i^ushchye', 'i^ushchoe',
  168. 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`',
  169. 'i^ushchoi`', 'i^ushchem', 'i^ushchim',
  170. 'i^ushchym', 'i^ushchom', 'vshi^ui^u',
  171. 'vshi^ai^a', 'shchui^u', 'shchai^a',
  172. 'shchoi^u', 'shchei^u', 'emi^ui^u',
  173. 'emi^ai^a', 'nni^ui^u', 'nni^ai^a',
  174. 'shchimi', 'shchymi', 'shchego', 'shchogo',
  175. 'shchemu', 'shchomu', 'shchikh', 'shchykh',
  176. 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u',
  177. 'shchee', 'shchie', 'shchye', 'shchoe',
  178. 'shchei`', 'shchii`', 'shchyi`', 'shchoi`',
  179. 'shchem', 'shchim', 'shchym', 'shchom',
  180. 'vshimi', 'vshymi', 'vshego', 'vshogo',
  181. 'vshemu', 'vshomu', 'vshikh', 'vshykh',
  182. 'emui^u', 'emai^a', 'emoi^u', 'emei^u',
  183. 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u',
  184. 'vshee', 'vshie', 'vshye', 'vshoe',
  185. 'vshei`', 'vshii`', 'vshyi`', 'vshoi`',
  186. 'vshem', 'vshim', 'vshym', 'vshom',
  187. 'emimi', 'emymi', 'emego', 'emogo',
  188. 'ememu', 'emomu', 'emikh', 'emykh',
  189. 'nnimi', 'nnymi', 'nnego', 'nnogo',
  190. 'nnemu', 'nnomu', 'nnikh', 'nnykh',
  191. 'emee', 'emie', 'emye', 'emoe', 'emei`',
  192. 'emii`', 'emyi`', 'emoi`', 'emem', 'emim',
  193. 'emym', 'emom', 'nnee', 'nnie', 'nnye',
  194. 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`',
  195. 'nnem', 'nnim', 'nnym', 'nnom'):
  196. if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or
  197. rv[-len(suffix) - 1:-len(suffix)] == "a"):
  198. word = word[:-len(suffix)]
  199. r2 = r2[:-len(suffix)]
  200. rv = rv[:-len(suffix)]
  201. adjectival_removed = True
  202. break
  203. else:
  204. word = word[:-len(suffix)]
  205. r2 = r2[:-len(suffix)]
  206. rv = rv[:-len(suffix)]
  207. adjectival_removed = True
  208. break
  209. if not adjectival_removed:
  210. for suffix in self.__verb_suffixes:
  211. if rv.endswith(suffix):
  212. if suffix in ("la", "na", "ete", "i`te", "li",
  213. "i`", "l", "em", "n", "lo", "no",
  214. "et", "i^ut", "ny", "t'", "esh'",
  215. "nno"):
  216. if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or
  217. rv[-len(suffix) - 1:-len(suffix)] == "a"):
  218. word = word[:-len(suffix)]
  219. r2 = r2[:-len(suffix)]
  220. rv = rv[:-len(suffix)]
  221. verb_removed = True
  222. break
  223. else:
  224. word = word[:-len(suffix)]
  225. r2 = r2[:-len(suffix)]
  226. rv = rv[:-len(suffix)]
  227. verb_removed = True
  228. break
  229. if not adjectival_removed and not verb_removed:
  230. for suffix in self.__noun_suffixes:
  231. if rv.endswith(suffix):
  232. word = word[:-len(suffix)]
  233. r2 = r2[:-len(suffix)]
  234. rv = rv[:-len(suffix)]
  235. break
  236. # Step 2
  237. if rv.endswith("i"):
  238. word = word[:-1]
  239. r2 = r2[:-1]
  240. # Step 3
  241. for suffix in self.__derivational_suffixes:
  242. if r2.endswith(suffix):
  243. word = word[:-len(suffix)]
  244. break
  245. # Step 4
  246. if word.endswith("nn"):
  247. word = word[:-1]
  248. undouble_success = True
  249. if not undouble_success:
  250. for suffix in self.__superlative_suffixes:
  251. if word.endswith(suffix):
  252. word = word[:-len(suffix)]
  253. superlative_removed = True
  254. break
  255. if word.endswith("nn"):
  256. word = word[:-1]
  257. if not undouble_success and not superlative_removed:
  258. if word.endswith("'"):
  259. word = word[:-1]
  260. if chr_exceeded:
  261. word = self.__roman_to_cyrillic(word)
  262. return word
  263. def __regions_russian(self, word):
  264. """
  265. Return the regions RV and R2 which are used by the Russian stemmer.
  266. In any word, RV is the region after the first vowel,
  267. or the end of the word if it contains no vowel.
  268. R2 is the region after the first non-vowel following
  269. a vowel in R1, or the end of the word if there is no such non-vowel.
  270. R1 is the region after the first non-vowel following a vowel,
  271. or the end of the word if there is no such non-vowel.
  272. :param word: The Russian word whose regions RV and R2 are determined.
  273. :type word: str or unicode
  274. :return: the regions RV and R2 for the respective Russian word.
  275. :rtype: tuple
  276. :note: This helper method is invoked by the stem method of the subclass
  277. RussianStemmer. It is not to be invoked directly!
  278. """
  279. r1 = ""
  280. r2 = ""
  281. rv = ""
  282. vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y")
  283. word = (word.replace("i^a", "A")
  284. .replace("i^u", "U")
  285. .replace("e`", "E"))
  286. for i in range(1, len(word)):
  287. if word[i] not in vowels and word[i - 1] in vowels:
  288. r1 = word[i + 1:]
  289. break
  290. for i in range(1, len(r1)):
  291. if r1[i] not in vowels and r1[i - 1] in vowels:
  292. r2 = r1[i + 1:]
  293. break
  294. for i in range(len(word)):
  295. if word[i] in vowels:
  296. rv = word[i + 1:]
  297. break
  298. r2 = (r2.replace("A", "i^a")
  299. .replace("U", "i^u")
  300. .replace("E", "e`"))
  301. rv = (rv.replace("A", "i^a")
  302. .replace("U", "i^u")
  303. .replace("E", "e`"))
  304. return (rv, r2)
  305. def __cyrillic_to_roman(self, word):
  306. """
  307. Transliterate a Russian word into the Roman alphabet.
  308. A Russian word whose letters consist of the Cyrillic
  309. alphabet are transliterated into the Roman alphabet
  310. in order to ease the forthcoming stemming process.
  311. :param word: The word that is transliterated.
  312. :type word: unicode
  313. :return: the transliterated word.
  314. :rtype: unicode
  315. :note: This helper method is invoked by the stem method of the subclass
  316. RussianStemmer. It is not to be invoked directly!
  317. """
  318. word = (word.replace(u("\u0410"), "a").replace(u("\u0430"), "a")
  319. .replace(u("\u0411"), "b").replace(u("\u0431"), "b")
  320. .replace(u("\u0412"), "v").replace(u("\u0432"), "v")
  321. .replace(u("\u0413"), "g").replace(u("\u0433"), "g")
  322. .replace(u("\u0414"), "d").replace(u("\u0434"), "d")
  323. .replace(u("\u0415"), "e").replace(u("\u0435"), "e")
  324. .replace(u("\u0401"), "e").replace(u("\u0451"), "e")
  325. .replace(u("\u0416"), "zh").replace(u("\u0436"), "zh")
  326. .replace(u("\u0417"), "z").replace(u("\u0437"), "z")
  327. .replace(u("\u0418"), "i").replace(u("\u0438"), "i")
  328. .replace(u("\u0419"), "i`").replace(u("\u0439"), "i`")
  329. .replace(u("\u041A"), "k").replace(u("\u043A"), "k")
  330. .replace(u("\u041B"), "l").replace(u("\u043B"), "l")
  331. .replace(u("\u041C"), "m").replace(u("\u043C"), "m")
  332. .replace(u("\u041D"), "n").replace(u("\u043D"), "n")
  333. .replace(u("\u041E"), "o").replace(u("\u043E"), "o")
  334. .replace(u("\u041F"), "p").replace(u("\u043F"), "p")
  335. .replace(u("\u0420"), "r").replace(u("\u0440"), "r")
  336. .replace(u("\u0421"), "s").replace(u("\u0441"), "s")
  337. .replace(u("\u0422"), "t").replace(u("\u0442"), "t")
  338. .replace(u("\u0423"), "u").replace(u("\u0443"), "u")
  339. .replace(u("\u0424"), "f").replace(u("\u0444"), "f")
  340. .replace(u("\u0425"), "kh").replace(u("\u0445"), "kh")
  341. .replace(u("\u0426"), "t^s").replace(u("\u0446"), "t^s")
  342. .replace(u("\u0427"), "ch").replace(u("\u0447"), "ch")
  343. .replace(u("\u0428"), "sh").replace(u("\u0448"), "sh")
  344. .replace(u("\u0429"), "shch").replace(u("\u0449"), "shch")
  345. .replace(u("\u042A"), "''").replace(u("\u044A"), "''")
  346. .replace(u("\u042B"), "y").replace(u("\u044B"), "y")
  347. .replace(u("\u042C"), "'").replace(u("\u044C"), "'")
  348. .replace(u("\u042D"), "e`").replace(u("\u044D"), "e`")
  349. .replace(u("\u042E"), "i^u").replace(u("\u044E"), "i^u")
  350. .replace(u("\u042F"), "i^a").replace(u("\u044F"), "i^a"))
  351. return word
  352. def __roman_to_cyrillic(self, word):
  353. """
  354. Transliterate a Russian word back into the Cyrillic alphabet.
  355. A Russian word formerly transliterated into the Roman alphabet
  356. in order to ease the stemming process, is transliterated back
  357. into the Cyrillic alphabet, its original form.
  358. :param word: The word that is transliterated.
  359. :type word: str or unicode
  360. :return: word, the transliterated word.
  361. :rtype: unicode
  362. :note: This helper method is invoked by the stem method of the subclass
  363. RussianStemmer. It is not to be invoked directly!
  364. """
  365. word = (word.replace("i^u", u("\u044E")).replace("i^a", u("\u044F"))
  366. .replace("shch", u("\u0449")).replace("kh", u("\u0445"))
  367. .replace("t^s", u("\u0446")).replace("ch", u("\u0447"))
  368. .replace("e`", u("\u044D")).replace("i`", u("\u0439"))
  369. .replace("sh", u("\u0448")).replace("k", u("\u043A"))
  370. .replace("e", u("\u0435")).replace("zh", u("\u0436"))
  371. .replace("a", u("\u0430")).replace("b", u("\u0431"))
  372. .replace("v", u("\u0432")).replace("g", u("\u0433"))
  373. .replace("d", u("\u0434")).replace("e", u("\u0435"))
  374. .replace("z", u("\u0437")).replace("i", u("\u0438"))
  375. .replace("l", u("\u043B")).replace("m", u("\u043C"))
  376. .replace("n", u("\u043D")).replace("o", u("\u043E"))
  377. .replace("p", u("\u043F")).replace("r", u("\u0440"))
  378. .replace("s", u("\u0441")).replace("t", u("\u0442"))
  379. .replace("u", u("\u0443")).replace("f", u("\u0444"))
  380. .replace("''", u("\u044A")).replace("y", u("\u044B"))
  381. .replace("'", u("\u044C")))
  382. return word