italian.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. from .bases import _StandardStemmer
  2. from whoosh.compat import u
  3. class ItalianStemmer(_StandardStemmer):
  4. """
  5. The Italian Snowball stemmer.
  6. :cvar __vowels: The Italian vowels.
  7. :type __vowels: unicode
  8. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  9. :type __step0_suffixes: tuple
  10. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  11. :type __step1_suffixes: tuple
  12. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  13. :type __step2_suffixes: tuple
  14. :note: A detailed description of the Italian
  15. stemming algorithm can be found under
  16. http://snowball.tartarus.org/algorithms/italian/stemmer.html
  17. """
  18. __vowels = u("aeiou\xE0\xE8\xEC\xF2\xF9")
  19. __step0_suffixes = ('gliela', 'gliele', 'glieli', 'glielo',
  20. 'gliene', 'sene', 'mela', 'mele', 'meli',
  21. 'melo', 'mene', 'tela', 'tele', 'teli',
  22. 'telo', 'tene', 'cela', 'cele', 'celi',
  23. 'celo', 'cene', 'vela', 'vele', 'veli',
  24. 'velo', 'vene', 'gli', 'ci', 'la', 'le',
  25. 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi')
  26. __step1_suffixes = ('atrice', 'atrici', 'azione', 'azioni',
  27. 'uzione', 'uzioni', 'usione', 'usioni',
  28. 'amento', 'amenti', 'imento', 'imenti',
  29. 'amente', 'abile', 'abili', 'ibile', 'ibili',
  30. 'mente', 'atore', 'atori', 'logia', 'logie',
  31. 'anza', 'anze', 'iche', 'ichi', 'ismo',
  32. 'ismi', 'ista', 'iste', 'isti', u('ist\xE0'),
  33. u('ist\xE8'), u('ist\xEC'), 'ante', 'anti',
  34. 'enza', 'enze', 'ico', 'ici', 'ica', 'ice',
  35. 'oso', 'osi', 'osa', 'ose', u('it\xE0'),
  36. 'ivo', 'ivi', 'iva', 'ive')
  37. __step2_suffixes = ('erebbero', 'irebbero', 'assero', 'assimo',
  38. 'eranno', 'erebbe', 'eremmo', 'ereste',
  39. 'eresti', 'essero', 'iranno', 'irebbe',
  40. 'iremmo', 'ireste', 'iresti', 'iscano',
  41. 'iscono', 'issero', 'arono', 'avamo', 'avano',
  42. 'avate', 'eremo', 'erete', 'erono', 'evamo',
  43. 'evano', 'evate', 'iremo', 'irete', 'irono',
  44. 'ivamo', 'ivano', 'ivate', 'ammo', 'ando',
  45. 'asse', 'assi', 'emmo', 'enda', 'ende',
  46. 'endi', 'endo', 'erai', 'erei', 'Yamo',
  47. 'iamo', 'immo', 'irai', 'irei', 'isca',
  48. 'isce', 'isci', 'isco', 'ano', 'are', 'ata',
  49. 'ate', 'ati', 'ato', 'ava', 'avi', 'avo',
  50. u('er\xE0'), 'ere', u('er\xF2'), 'ete', 'eva',
  51. 'evi', 'evo', u('ir\xE0'), 'ire', u('ir\xF2'),
  52. 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi',
  53. 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto',
  54. 'ar', 'ir')
  55. def stem(self, word):
  56. """
  57. Stem an Italian word and return the stemmed form.
  58. :param word: The word that is stemmed.
  59. :type word: str or unicode
  60. :return: The stemmed form.
  61. :rtype: unicode
  62. """
  63. word = word.lower()
  64. step1_success = False
  65. # All acute accents are replaced by grave accents.
  66. word = (word.replace(u("\xE1"), u("\xE0"))
  67. .replace(u("\xE9"), u("\xE8"))
  68. .replace(u("\xED"), u("\xEC"))
  69. .replace(u("\xF3"), u("\xF2"))
  70. .replace(u("\xFA"), u("\xF9")))
  71. # Every occurrence of 'u' after 'q'
  72. # is put into upper case.
  73. for i in range(1, len(word)):
  74. if word[i - 1] == "q" and word[i] == "u":
  75. word = "".join((word[:i], "U", word[i + 1:]))
  76. # Every occurrence of 'u' and 'i'
  77. # between vowels is put into upper case.
  78. for i in range(1, len(word) - 1):
  79. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  80. if word[i] == "u":
  81. word = "".join((word[:i], "U", word[i + 1:]))
  82. elif word[i] == "i":
  83. word = "".join((word[:i], "I", word[i + 1:]))
  84. r1, r2 = self._r1r2_standard(word, self.__vowels)
  85. rv = self._rv_standard(word, self.__vowels)
  86. # STEP 0: Attached pronoun
  87. for suffix in self.__step0_suffixes:
  88. if rv.endswith(suffix):
  89. if rv[-len(suffix) - 4:-len(suffix)] in ("ando", "endo"):
  90. word = word[:-len(suffix)]
  91. r1 = r1[:-len(suffix)]
  92. r2 = r2[:-len(suffix)]
  93. rv = rv[:-len(suffix)]
  94. elif (rv[-len(suffix) - 2:-len(suffix)] in
  95. ("ar", "er", "ir")):
  96. word = "".join((word[:-len(suffix)], "e"))
  97. r1 = "".join((r1[:-len(suffix)], "e"))
  98. r2 = "".join((r2[:-len(suffix)], "e"))
  99. rv = "".join((rv[:-len(suffix)], "e"))
  100. break
  101. # STEP 1: Standard suffix removal
  102. for suffix in self.__step1_suffixes:
  103. if word.endswith(suffix):
  104. if suffix == "amente" and r1.endswith(suffix):
  105. step1_success = True
  106. word = word[:-6]
  107. r2 = r2[:-6]
  108. rv = rv[:-6]
  109. if r2.endswith("iv"):
  110. word = word[:-2]
  111. r2 = r2[:-2]
  112. rv = rv[:-2]
  113. if r2.endswith("at"):
  114. word = word[:-2]
  115. rv = rv[:-2]
  116. elif r2.endswith(("os", "ic")):
  117. word = word[:-2]
  118. rv = rv[:-2]
  119. elif r2 .endswith("abil"):
  120. word = word[:-4]
  121. rv = rv[:-4]
  122. elif (suffix in ("amento", "amenti",
  123. "imento", "imenti") and
  124. rv.endswith(suffix)):
  125. step1_success = True
  126. word = word[:-6]
  127. rv = rv[:-6]
  128. elif r2.endswith(suffix):
  129. step1_success = True
  130. if suffix in ("azione", "azioni", "atore", "atori"):
  131. word = word[:-len(suffix)]
  132. r2 = r2[:-len(suffix)]
  133. rv = rv[:-len(suffix)]
  134. if r2.endswith("ic"):
  135. word = word[:-2]
  136. rv = rv[:-2]
  137. elif suffix in ("logia", "logie"):
  138. word = word[:-2]
  139. rv = word[:-2]
  140. elif suffix in ("uzione", "uzioni",
  141. "usione", "usioni"):
  142. word = word[:-5]
  143. rv = rv[:-5]
  144. elif suffix in ("enza", "enze"):
  145. word = "".join((word[:-2], "te"))
  146. rv = "".join((rv[:-2], "te"))
  147. elif suffix == u("it\xE0"):
  148. word = word[:-3]
  149. r2 = r2[:-3]
  150. rv = rv[:-3]
  151. if r2.endswith(("ic", "iv")):
  152. word = word[:-2]
  153. rv = rv[:-2]
  154. elif r2.endswith("abil"):
  155. word = word[:-4]
  156. rv = rv[:-4]
  157. elif suffix in ("ivo", "ivi", "iva", "ive"):
  158. word = word[:-3]
  159. r2 = r2[:-3]
  160. rv = rv[:-3]
  161. if r2.endswith("at"):
  162. word = word[:-2]
  163. r2 = r2[:-2]
  164. rv = rv[:-2]
  165. if r2.endswith("ic"):
  166. word = word[:-2]
  167. rv = rv[:-2]
  168. else:
  169. word = word[:-len(suffix)]
  170. rv = rv[:-len(suffix)]
  171. break
  172. # STEP 2: Verb suffixes
  173. if not step1_success:
  174. for suffix in self.__step2_suffixes:
  175. if rv.endswith(suffix):
  176. word = word[:-len(suffix)]
  177. rv = rv[:-len(suffix)]
  178. break
  179. # STEP 3a
  180. if rv.endswith(("a", "e", "i", "o", u("\xE0"), u("\xE8"),
  181. u("\xEC"), u("\xF2"))):
  182. word = word[:-1]
  183. rv = rv[:-1]
  184. if rv.endswith("i"):
  185. word = word[:-1]
  186. rv = rv[:-1]
  187. # STEP 3b
  188. if rv.endswith(("ch", "gh")):
  189. word = word[:-1]
  190. word = word.replace("I", "i").replace("U", "u")
  191. return word