spanish.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. from .bases import _StandardStemmer
  2. from whoosh.compat import u
  3. class SpanishStemmer(_StandardStemmer):
  4. """
  5. The Spanish Snowball stemmer.
  6. :cvar __vowels: The Spanish vowels.
  7. :type __vowels: unicode
  8. :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
  9. :type __step0_suffixes: tuple
  10. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  11. :type __step1_suffixes: tuple
  12. :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
  13. :type __step2a_suffixes: tuple
  14. :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
  15. :type __step2b_suffixes: tuple
  16. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  17. :type __step3_suffixes: tuple
  18. :note: A detailed description of the Spanish
  19. stemming algorithm can be found under
  20. http://snowball.tartarus.org/algorithms/spanish/stemmer.html
  21. """
  22. __vowels = u("aeiou\xE1\xE9\xED\xF3\xFA\xFC")
  23. __step0_suffixes = ("selas", "selos", "sela", "selo", "las",
  24. "les", "los", "nos", "me", "se", "la", "le",
  25. "lo")
  26. __step1_suffixes = ('amientos', 'imientos', 'amiento', 'imiento',
  27. 'aciones', 'uciones', 'adoras', 'adores',
  28. 'ancias', u('log\xEDas'), 'encias', 'amente',
  29. 'idades', 'anzas', 'ismos', 'ables', 'ibles',
  30. 'istas', 'adora', u('aci\xF3n'), 'antes',
  31. 'ancia', u('log\xEDa'), u('uci\xf3n'), 'encia',
  32. 'mente', 'anza', 'icos', 'icas', 'ismo',
  33. 'able', 'ible', 'ista', 'osos', 'osas',
  34. 'ador', 'ante', 'idad', 'ivas', 'ivos',
  35. 'ico',
  36. 'ica', 'oso', 'osa', 'iva', 'ivo')
  37. __step2a_suffixes = ('yeron', 'yendo', 'yamos', 'yais', 'yan',
  38. 'yen', 'yas', 'yes', 'ya', 'ye', 'yo',
  39. u('y\xF3'))
  40. __step2b_suffixes = (u('ar\xEDamos'), u('er\xEDamos'), u('ir\xEDamos'),
  41. u('i\xE9ramos'), u('i\xE9semos'), u('ar\xEDais'),
  42. 'aremos', u('er\xEDais'), 'eremos',
  43. u('ir\xEDais'), 'iremos', 'ierais', 'ieseis',
  44. 'asteis', 'isteis', u('\xE1bamos'),
  45. u('\xE1ramos'), u('\xE1semos'), u('ar\xEDan'),
  46. u('ar\xEDas'), u('ar\xE9is'), u('er\xEDan'),
  47. u('er\xEDas'), u('er\xE9is'), u('ir\xEDan'),
  48. u('ir\xEDas'), u('ir\xE9is'),
  49. 'ieran', 'iesen', 'ieron', 'iendo', 'ieras',
  50. 'ieses', 'abais', 'arais', 'aseis',
  51. u('\xE9amos'), u('ar\xE1n'), u('ar\xE1s'),
  52. u('ar\xEDa'), u('er\xE1n'), u('er\xE1s'),
  53. u('er\xEDa'), u('ir\xE1n'), u('ir\xE1s'),
  54. u('ir\xEDa'), 'iera', 'iese', 'aste', 'iste',
  55. 'aban', 'aran', 'asen', 'aron', 'ando',
  56. 'abas', 'adas', 'idas', 'aras', 'ases',
  57. u('\xEDais'), 'ados', 'idos', 'amos', 'imos',
  58. 'emos', u('ar\xE1'), u('ar\xE9'), u('er\xE1'),
  59. u('er\xE9'), u('ir\xE1'), u('ir\xE9'), 'aba',
  60. 'ada', 'ida', 'ara', 'ase', u('\xEDan'),
  61. 'ado', 'ido', u('\xEDas'), u('\xE1is'),
  62. u('\xE9is'), u('\xEDa'), 'ad', 'ed', 'id',
  63. 'an', u('i\xF3'), 'ar', 'er', 'ir', 'as',
  64. u('\xEDs'), 'en', 'es')
  65. __step3_suffixes = ("os", "a", "e", "o", u("\xE1"),
  66. u("\xE9"), u("\xED"), u("\xF3"))
  67. def stem(self, word):
  68. """
  69. Stem a Spanish word and return the stemmed form.
  70. :param word: The word that is stemmed.
  71. :type word: str or unicode
  72. :return: The stemmed form.
  73. :rtype: unicode
  74. """
  75. word = word.lower()
  76. step1_success = False
  77. r1, r2 = self._r1r2_standard(word, self.__vowels)
  78. rv = self._rv_standard(word, self.__vowels)
  79. # STEP 0: Attached pronoun
  80. for suffix in self.__step0_suffixes:
  81. if word.endswith(suffix):
  82. if rv.endswith(suffix):
  83. if rv[:-len(suffix)].endswith((u("i\xE9ndo"),
  84. u("\xE1ndo"),
  85. u("\xE1r"), u("\xE9r"),
  86. u("\xEDr"))):
  87. word = (word[:-len(suffix)].replace(u("\xE1"), "a")
  88. .replace(u("\xE9"), "e")
  89. .replace(u("\xED"), "i"))
  90. r1 = (r1[:-len(suffix)].replace(u("\xE1"), "a")
  91. .replace(u("\xE9"), "e")
  92. .replace(u("\xED"), "i"))
  93. r2 = (r2[:-len(suffix)].replace(u("\xE1"), "a")
  94. .replace(u("\xE9"), "e")
  95. .replace(u("\xED"), "i"))
  96. rv = (rv[:-len(suffix)].replace(u("\xE1"), "a")
  97. .replace(u("\xE9"), "e")
  98. .replace(u("\xED"), "i"))
  99. elif rv[:-len(suffix)].endswith(("ando", "iendo",
  100. "ar", "er", "ir")):
  101. word = word[:-len(suffix)]
  102. r1 = r1[:-len(suffix)]
  103. r2 = r2[:-len(suffix)]
  104. rv = rv[:-len(suffix)]
  105. elif (rv[:-len(suffix)].endswith("yendo") and
  106. word[:-len(suffix)].endswith("uyendo")):
  107. word = word[:-len(suffix)]
  108. r1 = r1[:-len(suffix)]
  109. r2 = r2[:-len(suffix)]
  110. rv = rv[:-len(suffix)]
  111. break
  112. # STEP 1: Standard suffix removal
  113. for suffix in self.__step1_suffixes:
  114. if word.endswith(suffix):
  115. if suffix == "amente" and r1.endswith(suffix):
  116. step1_success = True
  117. word = word[:-6]
  118. r2 = r2[:-6]
  119. rv = rv[:-6]
  120. if r2.endswith("iv"):
  121. word = word[:-2]
  122. r2 = r2[:-2]
  123. rv = rv[:-2]
  124. if r2.endswith("at"):
  125. word = word[:-2]
  126. rv = rv[:-2]
  127. elif r2.endswith(("os", "ic", "ad")):
  128. word = word[:-2]
  129. rv = rv[:-2]
  130. elif r2.endswith(suffix):
  131. step1_success = True
  132. if suffix in ("adora", "ador", u("aci\xF3n"), "adoras",
  133. "adores", "aciones", "ante", "antes",
  134. "ancia", "ancias"):
  135. word = word[:-len(suffix)]
  136. r2 = r2[:-len(suffix)]
  137. rv = rv[:-len(suffix)]
  138. if r2.endswith("ic"):
  139. word = word[:-2]
  140. rv = rv[:-2]
  141. elif suffix in (u("log\xEDa"), u("log\xEDas")):
  142. word = word.replace(suffix, "log")
  143. rv = rv.replace(suffix, "log")
  144. elif suffix in (u("uci\xF3n"), "uciones"):
  145. word = word.replace(suffix, "u")
  146. rv = rv.replace(suffix, "u")
  147. elif suffix in ("encia", "encias"):
  148. word = word.replace(suffix, "ente")
  149. rv = rv.replace(suffix, "ente")
  150. elif suffix == "mente":
  151. word = word[:-5]
  152. r2 = r2[:-5]
  153. rv = rv[:-5]
  154. if r2.endswith(("ante", "able", "ible")):
  155. word = word[:-4]
  156. rv = rv[:-4]
  157. elif suffix in ("idad", "idades"):
  158. word = word[:-len(suffix)]
  159. r2 = r2[:-len(suffix)]
  160. rv = rv[:-len(suffix)]
  161. for pre_suff in ("abil", "ic", "iv"):
  162. if r2.endswith(pre_suff):
  163. word = word[:-len(pre_suff)]
  164. rv = rv[:-len(pre_suff)]
  165. elif suffix in ("ivo", "iva", "ivos", "ivas"):
  166. word = word[:-len(suffix)]
  167. r2 = r2[:-len(suffix)]
  168. rv = rv[:-len(suffix)]
  169. if r2.endswith("at"):
  170. word = word[:-2]
  171. rv = rv[:-2]
  172. else:
  173. word = word[:-len(suffix)]
  174. rv = rv[:-len(suffix)]
  175. break
  176. # STEP 2a: Verb suffixes beginning 'y'
  177. if not step1_success:
  178. for suffix in self.__step2a_suffixes:
  179. if (rv.endswith(suffix) and
  180. word[-len(suffix) - 1:-len(suffix)] == "u"):
  181. word = word[:-len(suffix)]
  182. rv = rv[:-len(suffix)]
  183. break
  184. # STEP 2b: Other verb suffixes
  185. for suffix in self.__step2b_suffixes:
  186. if rv.endswith(suffix):
  187. if suffix in ("en", "es", u("\xE9is"), "emos"):
  188. word = word[:-len(suffix)]
  189. rv = rv[:-len(suffix)]
  190. if word.endswith("gu"):
  191. word = word[:-1]
  192. if rv.endswith("gu"):
  193. rv = rv[:-1]
  194. else:
  195. word = word[:-len(suffix)]
  196. rv = rv[:-len(suffix)]
  197. break
  198. # STEP 3: Residual suffix
  199. for suffix in self.__step3_suffixes:
  200. if rv.endswith(suffix):
  201. if suffix in ("e", u("\xE9")):
  202. word = word[:-len(suffix)]
  203. rv = rv[:-len(suffix)]
  204. if len(word) >= 2 and word[-2:] == "gu" and rv[-1] == "u":
  205. word = word[:-1]
  206. else:
  207. word = word[:-len(suffix)]
  208. break
  209. word = (word.replace(u("\xE1"), "a").replace(u("\xE9"), "e")
  210. .replace(u("\xED"), "i").replace(u("\xF3"), "o")
  211. .replace(u("\xFA"), "u"))
  212. return word