norwegian.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. from .bases import _ScandinavianStemmer
  2. from whoosh.compat import u
  3. class NorwegianStemmer(_ScandinavianStemmer):
  4. """
  5. The Norwegian Snowball stemmer.
  6. :cvar __vowels: The Norwegian vowels.
  7. :type __vowels: unicode
  8. :cvar __s_ending: Letters that may directly appear before a word final 's'.
  9. :type __s_ending: unicode
  10. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  11. :type __step1_suffixes: tuple
  12. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  13. :type __step2_suffixes: tuple
  14. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  15. :type __step3_suffixes: tuple
  16. :note: A detailed description of the Norwegian
  17. stemming algorithm can be found under
  18. http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
  19. """
  20. __vowels = u("aeiouy\xE6\xE5\xF8")
  21. __s_ending = "bcdfghjlmnoprtvyz"
  22. __step1_suffixes = ("hetenes", "hetene", "hetens", "heter",
  23. "heten", "endes", "ande", "ende", "edes",
  24. "enes", "erte", "ede", "ane", "ene", "ens",
  25. "ers", "ets", "het", "ast", "ert", "en",
  26. "ar", "er", "as", "es", "et", "a", "e", "s")
  27. __step2_suffixes = ("dt", "vt")
  28. __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov",
  29. "leg", "eig", "lig", "els", "lov", "ig")
  30. def stem(self, word):
  31. """
  32. Stem a Norwegian word and return the stemmed form.
  33. :param word: The word that is stemmed.
  34. :type word: str or unicode
  35. :return: The stemmed form.
  36. :rtype: unicode
  37. """
  38. word = word.lower()
  39. r1 = self._r1_scandinavian(word, self.__vowels)
  40. # STEP 1
  41. for suffix in self.__step1_suffixes:
  42. if r1.endswith(suffix):
  43. if suffix in ("erte", "ert"):
  44. word = "".join((word[:-len(suffix)], "er"))
  45. r1 = "".join((r1[:-len(suffix)], "er"))
  46. elif suffix == "s":
  47. if (word[-2] in self.__s_ending or
  48. (word[-2] == "k" and word[-3] not in self.__vowels)):
  49. word = word[:-1]
  50. r1 = r1[:-1]
  51. else:
  52. word = word[:-len(suffix)]
  53. r1 = r1[:-len(suffix)]
  54. break
  55. # STEP 2
  56. for suffix in self.__step2_suffixes:
  57. if r1.endswith(suffix):
  58. word = word[:-1]
  59. r1 = r1[:-1]
  60. break
  61. # STEP 3
  62. for suffix in self.__step3_suffixes:
  63. if r1.endswith(suffix):
  64. word = word[:-len(suffix)]
  65. break
  66. return word