123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268 |
- from whoosh.compat import u
class HungarianStemmer(object):
    """
    The Hungarian Snowball stemmer.

    :cvar __vowels: The Hungarian vowels.
    :type __vowels: unicode
    :cvar __digraphs: The Hungarian digraphs.
    :type __digraphs: tuple
    :cvar __double_consonants: The Hungarian double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
    :type __step6_suffixes: tuple
    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
    :type __step7_suffixes: tuple
    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
    :type __step8_suffixes: tuple
    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
    :type __step9_suffixes: tuple
    :note: A detailed description of the Hungarian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
    :note: The letters o and u with double acute are accepted both as
           their Unicode code points (U+0151, U+0171) and as the legacy
           code points 0xF5 / 0xFB, which is where text decoded with the
           wrong single-byte codec puts them.
    """

    # The suffix tuples are ordered longest-first: each step acts on the
    # first suffix that matches, so the order is part of the algorithm.

    # 0xF5/0xFB are kept for backward compatibility; U+0151/U+0171 are the
    # code points found in correctly decoded Hungarian text.
    __vowels = u("aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB\u0151\u0171")
    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
    __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg",
                           "ggy", "jj", "kk", "ll", "lly", "mm",
                           "nn", "nny", "pp", "rr", "ss", "ssz",
                           "tt", "tty", "vv", "zz", "zzs")

    __step1_suffixes = ("al", "el")
    __step2_suffixes = (u('k\xE9ppen'), u('onk\xE9nt'), u('enk\xE9nt'),
                        u('ank\xE9nt'), u('k\xE9pp'), u('k\xE9nt'), 'ban',
                        'ben', 'nak', 'nek', 'val', 'vel', u('t\xF3l'),
                        u('t\xF5l'), u('t\u0151l'), u('r\xF3l'),
                        u('r\xF5l'), u('r\u0151l'), u('b\xF3l'),
                        u('b\xF5l'), u('b\u0151l'), 'hoz', 'hez',
                        u('h\xF6z'),
                        u('n\xE1l'), u('n\xE9l'), u('\xE9rt'), 'kor',
                        'ba', 'be', 'ra', 're', 'ig', 'at', 'et',
                        'ot', u('\xF6t'), 'ul', u('\xFCl'), u('v\xE1'),
                        u('v\xE9'), 'en', 'on', 'an', u('\xF6n'),
                        'n', 't')
    __step3_suffixes = (u("\xE1nk\xE9nt"), u("\xE1n"), u("\xE9n"))
    __step4_suffixes = ('astul', u('est\xFCl'), u('\xE1stul'),
                        u('\xE9st\xFCl'), 'stul', u('st\xFCl'))
    __step5_suffixes = (u("\xE1"), u("\xE9"))
    __step6_suffixes = (u('ok\xE9'), u('\xF6k\xE9'), u('ak\xE9'),
                        u('ek\xE9'), u('\xE1k\xE9'), u('\xE1\xE9i'),
                        u('\xE9k\xE9'), u('\xE9\xE9i'), u('k\xE9'),
                        u('\xE9i'), u('\xE9\xE9'), u('\xE9'))
    __step7_suffixes = (u('\xE1juk'), u('\xE9j\xFCk'), u('\xFCnk'),
                        'unk', 'juk', u('j\xFCk'), u('\xE1nk'),
                        u('\xE9nk'), 'nk', 'uk', u('\xFCk'), 'em',
                        'om', 'am', 'od', 'ed', 'ad', u('\xF6d'),
                        'ja', 'je', u('\xE1m'), u('\xE1d'), u('\xE9m'),
                        u('\xE9d'), 'm', 'd', 'a', 'e', 'o',
                        u('\xE1'), u('\xE9'))
    __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok',
                        'eitek', u('\xE1itok'), u('\xE9itek'), 'jaim',
                        'jeim', 'jaid', 'jeid', 'eink', 'aink',
                        'itek', 'jeik', 'jaik', u('\xE1ink'),
                        u('\xE9ink'), 'aim', 'eim', 'aid', 'eid',
                        'jai', 'jei', 'ink', 'aik', 'eik',
                        u('\xE1im'), u('\xE1id'), u('\xE1ik'), u('\xE9im'),
                        u('\xE9id'), u('\xE9ik'), 'im', 'id', 'ai',
                        'ei', 'ik', u('\xE1i'), u('\xE9i'), 'i')
    __step9_suffixes = (u("\xE1k"), u("\xE9k"), u("\xF6k"), "ok",
                        "ek", "ak", "k")

    def stem(self, word):
        """
        Stem a Hungarian word and return the stemmed form.

        The word is lower-cased, its region R1 is computed once, and the
        nine suffix-removal steps are applied in order. A suffix is only
        removed when it lies entirely inside R1. An empty word is returned
        unchanged.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()
        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)

        # STEP 1: Remove instrumental case
        if r1.endswith(self.__step1_suffixes):
            for double_cons in self.__double_consonants:
                if word[-2 - len(double_cons):-2] == double_cons:
                    # Drop the two-letter suffix and one letter of the
                    # doubled consonant. R1 is re-derived from its fixed
                    # start offset so that it stays a true suffix of the
                    # word even when the doubled consonant straddles the
                    # R1 boundary.
                    r1_start = len(word) - len(r1)
                    word = "".join((word[:-4], word[-3]))
                    r1 = word[r1_start:]
                    break

        # STEP 2: Remove frequent cases
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]

                    # A long final vowel left in R1 is shortened.
                    if r1.endswith(u("\xE1")):
                        word = "".join((word[:-1], "a"))
                        r1 = "".join((r1[:-1], "a"))

                    elif r1.endswith(u("\xE9")):
                        word = "".join((word[:-1], "e"))
                        r1 = "".join((r1[:-1], "e"))
                # Only the first (longest) matching suffix is considered,
                # whether or not it was inside R1.
                break

        # STEP 3: Remove special cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE9n"):
                    word = "".join((word[:-2], "e"))
                    r1 = "".join((r1[:-2], "e"))
                else:
                    word = "".join((word[:-len(suffix)], "a"))
                    r1 = "".join((r1[:-len(suffix)], "a"))
                break

        # STEP 4: Remove other cases
        for suffix in self.__step4_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE1stul"):
                    word = "".join((word[:-5], "a"))
                    r1 = "".join((r1[:-5], "a"))

                elif suffix == u("\xE9st\xFCl"):
                    word = "".join((word[:-5], "e"))
                    r1 = "".join((r1[:-5], "e"))
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                break

        # STEP 5: Remove factive case
        for suffix in self.__step5_suffixes:
            if r1.endswith(suffix):
                for double_cons in self.__double_consonants:
                    if word[-1 - len(double_cons):-1] == double_cons:
                        # Same undoubling as step 1, with a one-letter
                        # suffix; R1 is kept in sync the same way.
                        r1_start = len(word) - len(r1)
                        word = "".join((word[:-3], word[-2]))
                        r1 = word[r1_start:]
                        break

        # STEP 6: Remove owned
        for suffix in self.__step6_suffixes:
            if r1.endswith(suffix):
                if suffix in (u("\xE1k\xE9"), u("\xE1\xE9i")):
                    word = "".join((word[:-3], "a"))
                    r1 = "".join((r1[:-3], "a"))

                elif suffix in (u("\xE9k\xE9"), u("\xE9\xE9i"),
                                u("\xE9\xE9")):
                    word = "".join((word[:-len(suffix)], "e"))
                    r1 = "".join((r1[:-len(suffix)], "e"))
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                break

        # STEP 7: Remove singular owner suffixes
        for suffix in self.__step7_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1nk"), u("\xE1juk"), u("\xE1m"),
                                  u("\xE1d"), u("\xE1")):
                        word = "".join((word[:-len(suffix)], "a"))
                        r1 = "".join((r1[:-len(suffix)], "a"))

                    elif suffix in (u("\xE9nk"), u("\xE9j\xFCk"),
                                    u("\xE9m"), u("\xE9d"), u("\xE9")):
                        word = "".join((word[:-len(suffix)], "e"))
                        r1 = "".join((r1[:-len(suffix)], "e"))
                    else:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                break

        # STEP 8: Remove plural owner suffixes
        for suffix in self.__step8_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1im"), u("\xE1id"), u("\xE1i"),
                                  u("\xE1ink"), u("\xE1itok"), u("\xE1ik")):
                        word = "".join((word[:-len(suffix)], "a"))
                        r1 = "".join((r1[:-len(suffix)], "a"))

                    elif suffix in (u("\xE9im"), u("\xE9id"), u("\xE9i"),
                                    u("\xE9ink"), u("\xE9itek"), u("\xE9ik")):
                        word = "".join((word[:-len(suffix)], "e"))
                        r1 = "".join((r1[:-len(suffix)], "e"))
                    else:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                break

        # STEP 9: Remove plural suffixes
        # R1 is not updated here because no later step reads it.
        for suffix in self.__step9_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == u("\xE1k"):
                        word = "".join((word[:-2], "a"))
                    elif suffix == u("\xE9k"):
                        word = "".join((word[:-2], "e"))
                    else:
                        word = word[:-len(suffix)]
                break

        return word

    def __r1_hungarian(self, word, vowels, digraphs):
        """
        Return the region R1 that is used by the Hungarian stemmer.

        If the word begins with a vowel, R1 is defined as the region
        after the first consonant or digraph (= two letters stand for
        one phoneme) in the word. If the word begins with a consonant,
        it is defined as the region after the first vowel in the word.
        If the word does not contain both a vowel and consonant, R1
        is the null region at the end of the word. The empty word has
        the null region as well.

        :param word: The Hungarian word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The Hungarian vowels that are used to determine
                       the region R1.
        :type vowels: unicode
        :param digraphs: The digraphs that are used to determine the
                         region R1.
        :type digraphs: tuple
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of this
               class. It is not to be invoked directly!
        """
        if not word:
            return ""

        if word[0] in vowels:
            for i in range(1, len(word)):
                if word[i] not in vowels:
                    # A digraph counts only when it starts at the first
                    # consonant. Longer entries are tried first so that
                    # "dzs" is not cut short by "dz".
                    for digraph in sorted(digraphs, key=len, reverse=True):
                        if word.startswith(digraph, i):
                            return word[i + len(digraph):]
                    return word[i + 1:]
            return ""

        for i in range(1, len(word)):
            if word[i] in vowels:
                return word[i + 1:]
        return ""
|