123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384 |
- from .bases import _ScandinavianStemmer
- from whoosh.compat import u
class NorwegianStemmer(_ScandinavianStemmer):
    """
    Snowball stemmer for the Norwegian language.

    Stemming strips suffixes in three consecutive steps. Every suffix is
    looked for in the R1 region of the word only (as computed by
    ``self._r1_scandinavian``), and within each step the suffix tables are
    scanned in order, so the first table entry that matches wins.

    :cvar __vowels: The letters treated as Norwegian vowels.
    :type __vowels: unicode
    :cvar __s_ending: Letters after which a word-final 's' may be removed.
    :type __s_ending: unicode
    :cvar __step1_suffixes: Suffixes handled in step 1, longest first.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes whose last letter is dropped in step 2.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes removed in step 3, longest first.
    :type __step3_suffixes: tuple
    :note: The algorithm is described in detail at
           http://snowball.tartarus.org/algorithms/norwegian/stemmer.html
    """

    __vowels = u("aeiouy\xE6\xE5\xF8")
    __s_ending = "bcdfghjlmnoprtvyz"
    __step1_suffixes = (
        "hetenes", "hetene", "hetens", "heter", "heten", "endes",
        "ande", "ende", "edes", "enes", "erte", "ede", "ane", "ene",
        "ens", "ers", "ets", "het", "ast", "ert", "en", "ar", "er",
        "as", "es", "et", "a", "e", "s",
    )
    __step2_suffixes = ("dt", "vt")
    __step3_suffixes = (
        "hetslov", "eleg", "elig", "elov", "slov", "leg", "eig", "lig",
        "els", "lov", "ig",
    )

    def stem(self, word):
        """
        Return the stem of a Norwegian word.

        The word is lower-cased first; the three suffix-stripping steps
        are then applied in order.

        :param word: The word to stem.
        :type word: str or unicode
        :return: The stemmed, lower-cased form of the word.
        :rtype: unicode
        """
        word = word.lower()
        r1 = self._r1_scandinavian(word, self.__vowels)

        # STEP 1: act on the first table entry found at the end of R1.
        for suffix in self.__step1_suffixes:
            if not r1.endswith(suffix):
                continue

            cut = len(suffix)
            if suffix == "s":
                # A final 's' is dropped only after a valid s-ending
                # letter, or after a 'k' that does not follow a vowel.
                before = word[-2]
                if (before in self.__s_ending or
                        (before == "k" and word[-3] not in self.__vowels)):
                    word = word[:-1]
                    r1 = r1[:-1]
            elif suffix in ("erte", "ert"):
                # These two are replaced by "er" rather than removed.
                word = word[:-cut] + "er"
                r1 = r1[:-cut] + "er"
            else:
                word = word[:-cut]
                r1 = r1[:-cut]
            # Only the first matching suffix is processed.
            break

        # STEP 2: for "dt"/"vt" only the trailing 't' is removed.
        if r1.endswith(self.__step2_suffixes):
            word = word[:-1]
            r1 = r1[:-1]

        # STEP 3: remove the first matching derivational suffix.
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                return word[:-len(suffix)]

        return word
|