danish.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. from .bases import _ScandinavianStemmer
  2. from whoosh.compat import u
  3. class DanishStemmer(_ScandinavianStemmer):
  4. """
  5. The Danish Snowball stemmer.
  6. :cvar __vowels: The Danish vowels.
  7. :type __vowels: unicode
  8. :cvar __consonants: The Danish consonants.
  9. :type __consonants: unicode
  10. :cvar __double_consonants: The Danish double consonants.
  11. :type __double_consonants: tuple
  12. :cvar __s_ending: Letters that may directly appear before a word final 's'.
  13. :type __s_ending: unicode
  14. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  15. :type __step1_suffixes: tuple
  16. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  17. :type __step2_suffixes: tuple
  18. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  19. :type __step3_suffixes: tuple
  20. :note: A detailed description of the Danish
  21. stemming algorithm can be found under
  22. http://snowball.tartarus.org/algorithms/danish/stemmer.html
  23. """
  24. # The language's vowels and other important characters are defined.
  25. __vowels = u("aeiouy\xE6\xE5\xF8")
  26. __consonants = "bcdfghjklmnpqrstvwxz"
  27. __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
  28. "kk", "ll", "mm", "nn", "pp", "qq", "rr",
  29. "ss", "tt", "vv", "ww", "xx", "zz")
  30. __s_ending = u("abcdfghjklmnoprtvyz\xE5")
  31. # The different suffixes, divided into the algorithm's steps
  32. # and organized by length, are listed in tuples.
  33. __step1_suffixes = ("erendes", "erende", "hedens", "ethed",
  34. "erede", "heden", "heder", "endes",
  35. "ernes", "erens", "erets", "ered",
  36. "ende", "erne", "eren", "erer", "heds",
  37. "enes", "eres", "eret", "hed", "ene", "ere",
  38. "ens", "ers", "ets", "en", "er", "es", "et",
  39. "e", "s")
  40. __step2_suffixes = ("gd", "dt", "gt", "kt")
  41. __step3_suffixes = ("elig", u("l\xF8st"), "lig", "els", "ig")
  42. def stem(self, word):
  43. """
  44. Stem a Danish word and return the stemmed form.
  45. :param word: The word that is stemmed.
  46. :type word: str or unicode
  47. :return: The stemmed form.
  48. :rtype: unicode
  49. """
  50. # Every word is put into lower case for normalization.
  51. word = word.lower()
  52. # After this, the required regions are generated
  53. # by the respective helper method.
  54. r1 = self._r1_scandinavian(word, self.__vowels)
  55. # Then the actual stemming process starts.
  56. # Every new step is explicitly indicated
  57. # according to the descriptions on the Snowball website.
  58. # STEP 1
  59. for suffix in self.__step1_suffixes:
  60. if r1.endswith(suffix):
  61. if suffix == "s":
  62. if word[-2] in self.__s_ending:
  63. word = word[:-1]
  64. r1 = r1[:-1]
  65. else:
  66. word = word[:-len(suffix)]
  67. r1 = r1[:-len(suffix)]
  68. break
  69. # STEP 2
  70. for suffix in self.__step2_suffixes:
  71. if r1.endswith(suffix):
  72. word = word[:-1]
  73. r1 = r1[:-1]
  74. break
  75. # STEP 3
  76. if r1.endswith("igst"):
  77. word = word[:-2]
  78. r1 = r1[:-2]
  79. for suffix in self.__step3_suffixes:
  80. if r1.endswith(suffix):
  81. if suffix == u("l\xF8st"):
  82. word = word[:-1]
  83. r1 = r1[:-1]
  84. else:
  85. word = word[:-len(suffix)]
  86. r1 = r1[:-len(suffix)]
  87. if r1.endswith(self.__step2_suffixes):
  88. word = word[:-1]
  89. r1 = r1[:-1]
  90. break
  91. # STEP 4: Undouble
  92. for double_cons in self.__double_consonants:
  93. if word.endswith(double_cons) and len(word) > 3:
  94. word = word[:-1]
  95. break
  96. return word