swedish.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. from .bases import _ScandinavianStemmer
  2. from whoosh.compat import u
  3. class SwedishStemmer(_ScandinavianStemmer):
  4. """
  5. The Swedish Snowball stemmer.
  6. :cvar __vowels: The Swedish vowels.
  7. :type __vowels: unicode
  8. :cvar __s_ending: Letters that may directly appear before a word final 's'.
  9. :type __s_ending: unicode
  10. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  11. :type __step1_suffixes: tuple
  12. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  13. :type __step2_suffixes: tuple
  14. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  15. :type __step3_suffixes: tuple
  16. :note: A detailed description of the Swedish
  17. stemming algorithm can be found under
  18. http://snowball.tartarus.org/algorithms/swedish/stemmer.html
  19. """
  20. __vowels = u("aeiouy\xE4\xE5\xF6")
  21. __s_ending = "bcdfghjklmnoprtvy"
  22. __step1_suffixes = ("heterna", "hetens", "heter", "heten",
  23. "anden", "arnas", "ernas", "ornas", "andes",
  24. "andet", "arens", "arna", "erna", "orna",
  25. "ande", "arne", "aste", "aren", "ades",
  26. "erns", "ade", "are", "ern", "ens", "het",
  27. "ast", "ad", "en", "ar", "er", "or", "as",
  28. "es", "at", "a", "e", "s")
  29. __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
  30. __step3_suffixes = ("fullt", u("l\xF6st"), "els", "lig", "ig")
  31. def stem(self, word):
  32. """
  33. Stem a Swedish word and return the stemmed form.
  34. :param word: The word that is stemmed.
  35. :type word: str or unicode
  36. :return: The stemmed form.
  37. :rtype: unicode
  38. """
  39. word = word.lower()
  40. r1 = self._r1_scandinavian(word, self.__vowels)
  41. # STEP 1
  42. for suffix in self.__step1_suffixes:
  43. if r1.endswith(suffix):
  44. if suffix == "s":
  45. if word[-2] in self.__s_ending:
  46. word = word[:-1]
  47. r1 = r1[:-1]
  48. else:
  49. word = word[:-len(suffix)]
  50. r1 = r1[:-len(suffix)]
  51. break
  52. # STEP 2
  53. for suffix in self.__step2_suffixes:
  54. if r1.endswith(suffix):
  55. word = word[:-1]
  56. r1 = r1[:-1]
  57. break
  58. # STEP 3
  59. for suffix in self.__step3_suffixes:
  60. if r1.endswith(suffix):
  61. if suffix in ("els", "lig", "ig"):
  62. word = word[:-len(suffix)]
  63. elif suffix in ("fullt", u("l\xF6st")):
  64. word = word[:-1]
  65. break
  66. return word