dutch.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. from .bases import _StandardStemmer
  2. from whoosh.compat import u
  3. class DutchStemmer(_StandardStemmer):
  4. """
  5. The Dutch Snowball stemmer.
  6. :cvar __vowels: The Dutch vowels.
  7. :type __vowels: unicode
  8. :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
  9. :type __step1_suffixes: tuple
  10. :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
  11. :type __step3b_suffixes: tuple
  12. :note: A detailed description of the Dutch
  13. stemming algorithm can be found under
  14. http://snowball.tartarus.org/algorithms/dutch/stemmer.html
  15. """
  16. __vowels = u("aeiouy\xE8")
  17. __step1_suffixes = ("heden", "ene", "en", "se", "s")
  18. __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")
  19. def stem(self, word):
  20. """
  21. Stem a Dutch word and return the stemmed form.
  22. :param word: The word that is stemmed.
  23. :type word: str or unicode
  24. :return: The stemmed form.
  25. :rtype: unicode
  26. """
  27. word = word.lower()
  28. step2_success = False
  29. # Vowel accents are removed.
  30. word = (word.replace(u("\xE4"), "a").replace(u("\xE1"), "a")
  31. .replace(u("\xEB"), "e").replace(u("\xE9"), "e")
  32. .replace(u("\xED"), "i").replace(u("\xEF"), "i")
  33. .replace(u("\xF6"), "o").replace(u("\xF3"), "o")
  34. .replace(u("\xFC"), "u").replace(u("\xFA"), "u"))
  35. # An initial 'y', a 'y' after a vowel,
  36. # and an 'i' between self.__vowels is put into upper case.
  37. # As from now these are treated as consonants.
  38. if word.startswith("y"):
  39. word = "".join(("Y", word[1:]))
  40. for i in range(1, len(word)):
  41. if word[i - 1] in self.__vowels and word[i] == "y":
  42. word = "".join((word[:i], "Y", word[i + 1:]))
  43. for i in range(1, len(word) - 1):
  44. if (word[i - 1] in self.__vowels and word[i] == "i" and
  45. word[i + 1] in self.__vowels):
  46. word = "".join((word[:i], "I", word[i + 1:]))
  47. r1, r2 = self._r1r2_standard(word, self.__vowels)
  48. # R1 is adjusted so that the region before it
  49. # contains at least 3 letters.
  50. for i in range(1, len(word)):
  51. if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
  52. if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0:
  53. r1 = word[3:]
  54. elif len(word[:i + 1]) == 0:
  55. return word
  56. break
  57. # STEP 1
  58. for suffix in self.__step1_suffixes:
  59. if r1.endswith(suffix):
  60. if suffix == "heden":
  61. word = "".join((word[:-5], "heid"))
  62. r1 = "".join((r1[:-5], "heid"))
  63. if r2.endswith("heden"):
  64. r2 = "".join((r2[:-5], "heid"))
  65. elif (suffix in ("ene", "en") and
  66. not word.endswith("heden") and
  67. word[-len(suffix) - 1] not in self.__vowels and
  68. word[-len(suffix) - 3:-len(suffix)] != "gem"):
  69. word = word[:-len(suffix)]
  70. r1 = r1[:-len(suffix)]
  71. r2 = r2[:-len(suffix)]
  72. if word.endswith(("kk", "dd", "tt")):
  73. word = word[:-1]
  74. r1 = r1[:-1]
  75. r2 = r2[:-1]
  76. elif (suffix in ("se", "s") and
  77. word[-len(suffix) - 1] not in self.__vowels and
  78. word[-len(suffix) - 1] != "j"):
  79. word = word[:-len(suffix)]
  80. r1 = r1[:-len(suffix)]
  81. r2 = r2[:-len(suffix)]
  82. break
  83. # STEP 2
  84. if r1.endswith("e") and word[-2] not in self.__vowels:
  85. step2_success = True
  86. word = word[:-1]
  87. r1 = r1[:-1]
  88. r2 = r2[:-1]
  89. if word.endswith(("kk", "dd", "tt")):
  90. word = word[:-1]
  91. r1 = r1[:-1]
  92. r2 = r2[:-1]
  93. # STEP 3a
  94. if r2.endswith("heid") and word[-5] != "c":
  95. word = word[:-4]
  96. r1 = r1[:-4]
  97. r2 = r2[:-4]
  98. if (r1.endswith("en") and word[-3] not in self.__vowels and
  99. word[-5:-2] != "gem"):
  100. word = word[:-2]
  101. r1 = r1[:-2]
  102. r2 = r2[:-2]
  103. if word.endswith(("kk", "dd", "tt")):
  104. word = word[:-1]
  105. r1 = r1[:-1]
  106. r2 = r2[:-1]
  107. # STEP 3b: Derivational suffixes
  108. for suffix in self.__step3b_suffixes:
  109. if r2.endswith(suffix):
  110. if suffix in ("end", "ing"):
  111. word = word[:-3]
  112. r2 = r2[:-3]
  113. if r2.endswith("ig") and word[-3] != "e":
  114. word = word[:-2]
  115. else:
  116. if word.endswith(("kk", "dd", "tt")):
  117. word = word[:-1]
  118. elif suffix == "ig" and word[-3] != "e":
  119. word = word[:-2]
  120. elif suffix == "lijk":
  121. word = word[:-4]
  122. r1 = r1[:-4]
  123. if r1.endswith("e") and word[-2] not in self.__vowels:
  124. word = word[:-1]
  125. if word.endswith(("kk", "dd", "tt")):
  126. word = word[:-1]
  127. elif suffix == "baar":
  128. word = word[:-4]
  129. elif suffix == "bar" and step2_success:
  130. word = word[:-3]
  131. break
  132. # STEP 4: Undouble vowel
  133. if len(word) >= 4:
  134. if word[-1] not in self.__vowels and word[-1] != "I":
  135. if word[-3:-1] in ("aa", "ee", "oo", "uu"):
  136. if word[-4] not in self.__vowels:
  137. word = "".join((word[:-3], word[-3], word[-1]))
  138. # All occurrences of 'I' and 'Y' are put back into lower case.
  139. word = word.replace("I", "i").replace("Y", "y")
  140. return word