# german.py (5.2 KB)
  1. from .bases import _StandardStemmer
  2. from whoosh.compat import u
  3. class GermanStemmer(_StandardStemmer):
  4. """
  5. The German Snowball stemmer.
  6. :cvar __vowels: The German vowels.
  7. :type __vowels: unicode
  8. :cvar __s_ending: Letters that may directly appear before a word final 's'.
  9. :type __s_ending: unicode
  10. :cvar __st_ending: Letter that may directly appear before a word final 'st'.
  11. :type __st_ending: unicode
  12. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  13. :type __step1_suffixes: tuple
  14. :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
  15. :type __step2_suffixes: tuple
  16. :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
  17. :type __step3_suffixes: tuple
  18. :note: A detailed description of the German
  19. stemming algorithm can be found under
  20. http://snowball.tartarus.org/algorithms/german/stemmer.html
  21. """
  22. __vowels = u("aeiouy\xE4\xF6\xFC")
  23. __s_ending = "bdfghklmnrt"
  24. __st_ending = "bdfghklmnt"
  25. __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s")
  26. __step2_suffixes = ("est", "en", "er", "st")
  27. __step3_suffixes = ("isch", "lich", "heit", "keit",
  28. "end", "ung", "ig", "ik")
  29. def stem(self, word):
  30. """
  31. Stem a German word and return the stemmed form.
  32. :param word: The word that is stemmed.
  33. :type word: str or unicode
  34. :return: The stemmed form.
  35. :rtype: unicode
  36. """
  37. word = word.lower()
  38. word = word.replace(u("\xDF"), "ss")
  39. # Every occurrence of 'u' and 'y'
  40. # between vowels is put into upper case.
  41. for i in range(1, len(word) - 1):
  42. if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
  43. if word[i] == "u":
  44. word = "".join((word[:i], "U", word[i + 1:]))
  45. elif word[i] == "y":
  46. word = "".join((word[:i], "Y", word[i + 1:]))
  47. r1, r2 = self._r1r2_standard(word, self.__vowels)
  48. # R1 is adjusted so that the region before it
  49. # contains at least 3 letters.
  50. for i in range(1, len(word)):
  51. if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
  52. if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0:
  53. r1 = word[3:]
  54. elif len(word[:i + 1]) == 0:
  55. return word
  56. break
  57. # STEP 1
  58. for suffix in self.__step1_suffixes:
  59. if r1.endswith(suffix):
  60. if (suffix in ("en", "es", "e") and
  61. word[-len(suffix) - 4:-len(suffix)] == "niss"):
  62. word = word[:-len(suffix) - 1]
  63. r1 = r1[:-len(suffix) - 1]
  64. r2 = r2[:-len(suffix) - 1]
  65. elif suffix == "s":
  66. if word[-2] in self.__s_ending:
  67. word = word[:-1]
  68. r1 = r1[:-1]
  69. r2 = r2[:-1]
  70. else:
  71. word = word[:-len(suffix)]
  72. r1 = r1[:-len(suffix)]
  73. r2 = r2[:-len(suffix)]
  74. break
  75. # STEP 2
  76. for suffix in self.__step2_suffixes:
  77. if r1.endswith(suffix):
  78. if suffix == "st":
  79. if word[-3] in self.__st_ending and len(word[:-3]) >= 3:
  80. word = word[:-2]
  81. r1 = r1[:-2]
  82. r2 = r2[:-2]
  83. else:
  84. word = word[:-len(suffix)]
  85. r1 = r1[:-len(suffix)]
  86. r2 = r2[:-len(suffix)]
  87. break
  88. # STEP 3: Derivational suffixes
  89. for suffix in self.__step3_suffixes:
  90. if r2.endswith(suffix):
  91. if suffix in ("end", "ung"):
  92. if ("ig" in r2[-len(suffix) - 2:-len(suffix)] and
  93. "e" not in r2[-len(suffix) - 3:-len(suffix) - 2]):
  94. word = word[:-len(suffix) - 2]
  95. else:
  96. word = word[:-len(suffix)]
  97. elif (suffix in ("ig", "ik", "isch") and
  98. "e" not in r2[-len(suffix) - 1:-len(suffix)]):
  99. word = word[:-len(suffix)]
  100. elif suffix in ("lich", "heit"):
  101. if ("er" in r1[-len(suffix) - 2:-len(suffix)] or
  102. "en" in r1[-len(suffix) - 2:-len(suffix)]):
  103. word = word[:-len(suffix) - 2]
  104. else:
  105. word = word[:-len(suffix)]
  106. elif suffix == "keit":
  107. if "lich" in r2[-len(suffix) - 4:-len(suffix)]:
  108. word = word[:-len(suffix) - 4]
  109. elif "ig" in r2[-len(suffix) - 2:-len(suffix)]:
  110. word = word[:-len(suffix) - 2]
  111. else:
  112. word = word[:-len(suffix)]
  113. break
  114. # Umlaut accents are removed and
  115. # 'u' and 'y' are put back into lower case.
  116. word = (word.replace(u("\xE4"), "a").replace(u("\xF6"), "o")
  117. .replace(u("\xFC"), "u").replace("U", "u")
  118. .replace("Y", "y"))
  119. return word