123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266 |
- from .bases import _StandardStemmer
- from whoosh.compat import u
class FinnishStemmer(_StandardStemmer):
    """
    The Finnish Snowball stemmer.

    :cvar __vowels: The Finnish vowels.
    :type __vowels: unicode
    :cvar __restricted_vowels: A subset of the Finnish vowels.
    :type __restricted_vowels: unicode
    :cvar __long_vowels: The Finnish vowels in their long forms.
    :type __long_vowels: tuple
    :cvar __consonants: The Finnish consonants.
    :type __consonants: unicode
    :cvar __double_consonants: The Finnish double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the Finnish
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/finnish/stemmer.html
    """

    __vowels = u("aeiouy\xE4\xF6")
    __restricted_vowels = u("aeiou\xE4\xF6")
    __long_vowels = ("aa", "ee", "ii", "oo", "uu", u("\xE4\xE4"),
                     u("\xF6\xF6"))
    __consonants = "bcdfghjklmnpqrstvwxz"
    __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj",
                           "kk", "ll", "mm", "nn", "pp", "qq", "rr",
                           "ss", "tt", "vv", "ww", "xx", "zz")
    # The suffix tuples are scanned in order and the first match wins, so
    # longer suffixes are listed before the shorter ones they contain.
    __step1_suffixes = ('kaan', u('k\xE4\xE4n'), 'sti', 'kin', 'han',
                        u('h\xE4n'), 'ko', u('k\xF6'), 'pa', u('p\xE4'))
    __step2_suffixes = ('nsa', u('ns\xE4'), 'mme', 'nne', 'si', 'ni',
                        'an', u('\xE4n'), 'en')
    __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin',
                        'hon', u('h\xE4n'), u('h\xF6n'), 'den', 'tta',
                        u('tt\xE4'), 'ssa', u('ss\xE4'), 'sta',
                        u('st\xE4'), 'lla', u('ll\xE4'), 'lta',
                        u('lt\xE4'), 'lle', 'ksi', 'ine', 'ta',
                        u('t\xE4'), 'na', u('n\xE4'), 'a', u('\xE4'),
                        'n')
    __step4_suffixes = ('impi', 'impa', u('imp\xE4'), 'immi', 'imma',
                        u('imm\xE4'), 'mpi', 'mpa', u('mp\xE4'), 'mmi',
                        'mma', u('mm\xE4'), 'eja', u('ej\xE4'))

    def stem(self, word):
        """
        Stem a Finnish word and return the stemmed form.

        The word is lower-cased, its R1 and R2 regions are obtained from
        the inherited ``_r1r2_standard`` helper, and six suffix-stripping
        steps are applied in sequence. ``word``, ``r1`` and ``r2`` are
        trimmed together so that the regions keep describing the tail of
        the current word.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        # Step 5 picks its rule depending on whether step 3 removed
        # a case ending.
        step3_success = False

        r1, r2 = self._r1r2_standard(word, self.__vowels)

        # STEP 1: Particles etc.
        for suffix in self.__step1_suffixes:
            if r1.endswith(suffix):
                if suffix == "sti":
                    # "sti" is only removed when it also lies in R2.
                    if suffix in r2:
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                else:
                    # The other particles are only removed when they
                    # follow n, t or a vowel.
                    if word[-len(suffix) - 1] in u("ntaeiouy\xE4\xF6"):
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                # Only the first matching suffix is considered.
                break

        # STEP 2: Possessives
        for suffix in self.__step2_suffixes:
            if r1.endswith(suffix):
                if suffix == "si":
                    # Not removed after "k".
                    if word[-3] != "k":
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                elif suffix == "ni":
                    word = word[:-2]
                    r1 = r1[:-2]
                    r2 = r2[:-2]
                    # A remaining "kse" ending is rewritten to "ksi";
                    # each region is patched only if it holds the
                    # whole "kse".
                    if word.endswith("kse"):
                        word = "".join((word[:-3], "ksi"))

                    if r1.endswith("kse"):
                        r1 = "".join((r1[:-3], "ksi"))

                    if r2.endswith("kse"):
                        r2 = "".join((r2[:-3], "ksi"))

                elif suffix == "an":
                    # Only removed after one of these case endings.
                    if (word[-4:-2] in ("ta", "na") or
                            word[-5:-2] in ("ssa", "sta", "lla", "lta")):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                elif suffix == u("\xE4n"):
                    # Front-vowel counterpart of the "an" rule.
                    if (word[-4:-2] in (u("t\xE4"), u("n\xE4")) or
                            word[-5:-2] in (u("ss\xE4"), u("st\xE4"),
                                            u("ll\xE4"), u("lt\xE4"))):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]

                elif suffix == "en":
                    if word[-5:-2] in ("lle", "ine"):
                        word = word[:-2]
                        r1 = r1[:-2]
                        r2 = r2[:-2]
                else:
                    # nsa, nsä, mme, nne: removed unconditionally
                    # (all of them are three characters long).
                    word = word[:-3]
                    r1 = r1[:-3]
                    r2 = r2[:-3]
                break

        # STEP 3: Cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix in ("han", "hen", "hin", "hon", u("h\xE4n"),
                              u("h\xF6n")):
                    # "hVn" is removed only when preceded by the same
                    # vowel V, i.e. the suffix's middle character.
                    if word[-4] == suffix[1]:
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                        step3_success = True

                elif suffix in ("siin", "den", "tten"):
                    # Requires a preceding restricted vowel followed by
                    # "i"; otherwise the shorter suffixes are tried.
                    if (word[-len(suffix) - 1] == "i" and
                            word[-len(suffix) - 2] in
                            self.__restricted_vowels):
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                        r2 = r2[:-len(suffix)]
                        step3_success = True
                    else:
                        continue

                elif suffix == "seen":
                    # Requires a preceding long vowel; otherwise the
                    # shorter suffixes are tried.
                    if word[-6:-4] in self.__long_vowels:
                        word = word[:-4]
                        r1 = r1[:-4]
                        r2 = r2[:-4]
                        step3_success = True
                    else:
                        continue

                elif suffix in ("a", u("\xE4")):
                    # Removed only after consonant + vowel.
                    if (word[-2] in self.__vowels and
                            word[-3] in self.__consonants):
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]
                        step3_success = True

                elif suffix in ("tta", u("tt\xE4")):
                    # Removed only after "e".
                    if word[-4] == "e":
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                        step3_success = True

                elif suffix == "n":
                    word = word[:-1]
                    r1 = r1[:-1]
                    r2 = r2[:-1]
                    step3_success = True

                    # If a long vowel or "ie" is now final, its last
                    # vowel is dropped as well.
                    if (word[-2:] == "ie" or
                            word[-2:] in self.__long_vowels):
                        word = word[:-1]
                        r1 = r1[:-1]
                        r2 = r2[:-1]
                else:
                    # Remaining case endings: removed unconditionally.
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                    step3_success = True
                break

        # STEP 4: Other endings
        for suffix in self.__step4_suffixes:
            if r2.endswith(suffix):
                if suffix in ("mpi", "mpa", u("mp\xE4"), "mmi", "mma",
                              u("mm\xE4")):
                    # Not removed when preceded by "po".
                    if word[-5:-3] != "po":
                        word = word[:-3]
                        r1 = r1[:-3]
                        r2 = r2[:-3]
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                    r2 = r2[:-len(suffix)]
                break

        # STEP 5: Plurals
        if step3_success and len(r1) >= 1 and r1[-1] in "ij":
            # r2 is not read again on this path, so it is left alone.
            word = word[:-1]
            r1 = r1[:-1]

        elif (not step3_success and len(r1) >= 2 and
              r1[-1] == "t" and r1[-2] in self.__vowels):
            word = word[:-1]
            r1 = r1[:-1]
            r2 = r2[:-1]
            if r2.endswith("imma"):
                word = word[:-4]
                r1 = r1[:-4]
            # The "po" guard must look at the word, not at R2: the
            # "po" may start before R2 does. This matches step 4.
            elif r2.endswith("mma") and word[-5:-3] != "po":
                word = word[:-3]
                r1 = r1[:-3]

        # STEP 6: Tidying up
        # Shorten a final long vowel in R1.
        if r1[-2:] in self.__long_vowels:
            word = word[:-1]
            r1 = r1[:-1]

        # Drop a final a, ä, e or i that follows a consonant in R1.
        if (len(r1) >= 2 and r1[-2] in self.__consonants and
                r1[-1] in u("a\xE4ei")):
            word = word[:-1]
            r1 = r1[:-1]

        # Drop a final j after o or u.
        if r1.endswith(("oj", "uj")):
            word = word[:-1]
            r1 = r1[:-1]

        # Drop a final o after j.
        if r1.endswith("jo"):
            word = word[:-1]
            r1 = r1[:-1]

        # If the word ends with a double consonant
        # followed by zero or more vowels, the last consonant is removed.
        for i in range(1, len(word)):
            if word[-i] in self.__vowels:
                continue
            else:
                if i == 1:
                    # word[-2:0] would be empty, so the final pair
                    # needs an open-ended slice.
                    if word[-i - 1:] in self.__double_consonants:
                        word = word[:-1]
                else:
                    if word[-i - 1:-i + 1] in self.__double_consonants:
                        word = "".join((word[:-i], word[-i + 1:]))
                # Only the last non-vowel character is examined.
                break

        return word
|