dayuan
/
manyi


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
							from .bases import _StandardStemmer

from whoosh.compat import u


class DutchStemmer(_StandardStemmer):
    """
    The Dutch Snowball stemmer.

    :cvar __vowels: The Dutch vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm.
    :type __step3b_suffixes: tuple
    :note: A detailed description of the Dutch
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/dutch/stemmer.html
    """

    __vowels = u("aeiouy\xE8")
    __step1_suffixes = ("heden", "ene", "en", "se", "s")
    __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig")

    def stem(self, word):
        """
        Stem a Dutch word and return the stemmed form.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode

        """
        word = word.lower()

        step2_success = False

        # Vowel accents are removed.
        word = (word.replace(u("\xE4"), "a").replace(u("\xE1"), "a")
                    .replace(u("\xEB"), "e").replace(u("\xE9"), "e")
                    .replace(u("\xED"), "i").replace(u("\xEF"), "i")
                    .replace(u("\xF6"), "o").replace(u("\xF3"), "o")
                    .replace(u("\xFC"), "u").replace(u("\xFA"), "u"))

        # An initial 'y', a 'y' after a vowel,
        # and an 'i' between self.__vowels is put into upper case.
        # As from now these are treated as consonants.
        if word.startswith("y"):
            word = "".join(("Y", word[1:]))

        for i in range(1, len(word)):
            if word[i - 1] in self.__vowels and word[i] == "y":
                word = "".join((word[:i], "Y", word[i + 1:]))

        for i in range(1, len(word) - 1):
            if (word[i - 1] in self.__vowels and word[i] == "i" and
               word[i + 1] in self.__vowels):
                word = "".join((word[:i], "I", word[i + 1:]))

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # R1 is adjusted so that the region before it
        # contains at least 3 letters.
        for i in range(1, len(word)):
            if word[i] not in self.__vowels and word[i - 1] in self.__vowels:
                if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0:
                    r1 = word[3:]
                elif len(word[:i + 1]) == 0:
                    return word
                break

        # STEP 1
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "heden":
                    word = "".join((word[:-5], "heid"))
                    r1 = "".join((r1[:-5], "heid"))
                    if r2.endswith("heden"):
                        r2 = "".join((r2[:-5], "heid"))

                elif (suffix in ("ene", "en") and
                      not word.endswith("heden") and
                      word[-len(suffix) - 1] not in self.__vowels and
                      word[-len(suffix) - 3:-len(suffix)] != "gem"):
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                    if word.endswith(("kk", "dd", "tt")):
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]

                elif (suffix in ("se", "s") and
                      word[-len(suffix) - 1] not in self.__vowels and
                      word[-len(suffix) - 1] != "j"):
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                break

        # STEP 2
        if r1.endswith("e") and word[-2] not in self.__vowels:
            step2_success = True
            word = word[:-1]
            r1 = r1[:-1]
            r2 = r2[:-1]

            if word.endswith(("kk", "dd", "tt")):
                word = word[:-1]
                r1 = r1[:-1]
                r2 = r2[:-1]

        # STEP 3a
        if r2.endswith("heid") and word[-5] != "c":
            word = word[:-4]
            r1 = r1[:-4]
            r2 = r2[:-4]

            if (r1.endswith("en") and word[-3] not in self.__vowels and
                word[-5:-2] != "gem"):
                word = word[:-2]
                r1 = r1[:-2]
                r2 = r2[:-2]

                if word.endswith(("kk", "dd", "tt")):
                    word = word[:-1]
                    r1 = r1[:-1]
                    r2 = r2[:-1]

        # STEP 3b: Derivational suffixes
        for suffix in self.__step3b_suffixes:
            if r2.endswith(suffix):
                if suffix in ("end", "ing"):
                    word = word[:-3]
                    r2 = r2[:-3]

                    if r2.endswith("ig") and word[-3] != "e":
                        word = word[:-2]
                    else:
                        if word.endswith(("kk", "dd", "tt")):
                            word = word[:-1]

                elif suffix == "ig" and word[-3] != "e":
                    word = word[:-2]

                elif suffix == "lijk":
                    word = word[:-4]
                    r1 = r1[:-4]

                    if r1.endswith("e") and word[-2] not in self.__vowels:
                        word = word[:-1]
                        if word.endswith(("kk", "dd", "tt")):
                            word = word[:-1]

                elif suffix == "baar":
                    word = word[:-4]

                elif suffix == "bar" and step2_success:
                    word = word[:-3]
                break

        # STEP 4: Undouble vowel
        if len(word) >= 4:
            if word[-1] not in self.__vowels and word[-1] != "I":
                if word[-3:-1] in ("aa", "ee", "oo", "uu"):
                    if word[-4] not in self.__vowels:
                        word = "".join((word[:-3], word[-3], word[-1]))

        # All occurrences of 'I' and 'Y' are put back into lower case.
        word = word.replace("I", "i").replace("Y", "y")

        return word