1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- from .bases import _ScandinavianStemmer
- from whoosh.compat import u
- class SwedishStemmer(_ScandinavianStemmer):
- """
- The Swedish Snowball stemmer.
- :cvar __vowels: The Swedish vowels.
- :type __vowels: unicode
- :cvar __s_ending: Letters that may directly appear before a word final 's'.
- :type __s_ending: unicode
- :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
- :type __step1_suffixes: tuple
- :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
- :type __step2_suffixes: tuple
- :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
- :type __step3_suffixes: tuple
- :note: A detailed description of the Swedish
- stemming algorithm can be found under
- http://snowball.tartarus.org/algorithms/swedish/stemmer.html
- """
- __vowels = u("aeiouy\xE4\xE5\xF6")
- __s_ending = "bcdfghjklmnoprtvy"
- __step1_suffixes = ("heterna", "hetens", "heter", "heten",
- "anden", "arnas", "ernas", "ornas", "andes",
- "andet", "arens", "arna", "erna", "orna",
- "ande", "arne", "aste", "aren", "ades",
- "erns", "ade", "are", "ern", "ens", "het",
- "ast", "ad", "en", "ar", "er", "or", "as",
- "es", "at", "a", "e", "s")
- __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt")
- __step3_suffixes = ("fullt", u("l\xF6st"), "els", "lig", "ig")
- def stem(self, word):
- """
- Stem a Swedish word and return the stemmed form.
- :param word: The word that is stemmed.
- :type word: str or unicode
- :return: The stemmed form.
- :rtype: unicode
- """
- word = word.lower()
- r1 = self._r1_scandinavian(word, self.__vowels)
- # STEP 1
- for suffix in self.__step1_suffixes:
- if r1.endswith(suffix):
- if suffix == "s":
- if word[-2] in self.__s_ending:
- word = word[:-1]
- r1 = r1[:-1]
- else:
- word = word[:-len(suffix)]
- r1 = r1[:-len(suffix)]
- break
- # STEP 2
- for suffix in self.__step2_suffixes:
- if r1.endswith(suffix):
- word = word[:-1]
- r1 = r1[:-1]
- break
- # STEP 3
- for suffix in self.__step3_suffixes:
- if r1.endswith(suffix):
- if suffix in ("els", "lig", "ig"):
- word = word[:-len(suffix)]
- elif suffix in ("fullt", u("l\xF6st")):
- word = word[:-1]
- break
- return word
|