123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257 |
- from .bases import _StandardStemmer
- from whoosh.compat import u
class RomanianStemmer(_StandardStemmer):
    """
    The Romanian Snowball stemmer.

    :cvar __vowels: The Romanian vowels.
    :type __vowels: unicode
    :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm.
    :type __step0_suffixes: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :note: A detailed description of the Romanian
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/romanian/stemmer.html

    Every literal containing a non-ASCII character is passed through ``u()``
    so that suffix lists and the groups they are compared against are the
    same kind of text string (assumes ``whoosh.compat.u`` returns text on
    every supported interpreter -- its definition is not visible here).
    The order of the entries in the ``__stepN_suffixes`` tuples matters:
    each step acts on the first entry the word ends with.
    """

    __vowels = u("aeiou\u0103\xE2\xEE")

    __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor',
                        'atei', u('a\u0163ie'), u('a\u0163ia'), 'aua',
                        'ele', 'iua', 'iei', 'ile', 'ul', 'ea',
                        'ii')

    __step1_suffixes = ('abilitate', 'abilitati', u('abilit\u0103\u0163i'),
                        'ibilitate', u('abilit\u0103i'), 'ivitate',
                        'ivitati', u('ivit\u0103\u0163i'), 'icitate',
                        'icitati', u('icit\u0103\u0163i'), 'icatori',
                        u('ivit\u0103i'), u('icit\u0103i'), 'icator',
                        u('a\u0163iune'), 'atoare', u('\u0103toare'),
                        u('i\u0163iune'), 'itoare', 'iciva', 'icive',
                        'icivi', u('iciv\u0103'), 'icala', 'icale',
                        'icali', u('ical\u0103'), 'ativa', 'ative',
                        'ativi', u('ativ\u0103'), 'atori', u('\u0103tori'),
                        'itiva', 'itive', 'itivi', u('itiv\u0103'),
                        'itori', 'iciv', 'ical', 'ativ', 'ator',
                        u('\u0103tor'), 'itiv', 'itor')

    __step2_suffixes = ('abila', 'abile', 'abili', u('abil\u0103'),
                        'ibila', 'ibile', 'ibili', u('ibil\u0103'),
                        'atori', 'itate', 'itati', u('it\u0103\u0163i'),
                        'abil', 'ibil', 'oasa', u('oas\u0103'), 'oase',
                        'anta', 'ante', 'anti', u('ant\u0103'), 'ator',
                        u('it\u0103i'), 'iune', 'iuni', 'isme', 'ista',
                        'iste', 'isti', u('ist\u0103'), u('i\u015Fti'),
                        'ata', u('at\u0103'), 'ati', 'ate', 'uta',
                        u('ut\u0103'), 'uti', 'ute', 'ita', u('it\u0103'),
                        'iti', 'ite', 'ica', 'ice', 'ici', u('ic\u0103'),
                        'osi', u('o\u015Fi'), 'ant', 'iva', 'ive', 'ivi',
                        u('iv\u0103'), 'ism', 'ist', 'at', 'ut', 'it',
                        'ic', 'os', 'iv')

    __step3_suffixes = (u('seser\u0103\u0163i'), u('aser\u0103\u0163i'),
                        u('iser\u0103\u0163i'), u('\xE2ser\u0103\u0163i'),
                        u('user\u0103\u0163i'), u('seser\u0103m'),
                        u('aser\u0103m'), u('iser\u0103m'),
                        u('\xE2ser\u0103m'), u('user\u0103m'),
                        u('ser\u0103\u0163i'), u('sese\u015Fi'),
                        u('seser\u0103'), u('easc\u0103'),
                        u('ar\u0103\u0163i'), u('ur\u0103\u0163i'),
                        u('ir\u0103\u0163i'), u('\xE2r\u0103\u0163i'),
                        u('ase\u015Fi'), u('aser\u0103'), u('ise\u015Fi'),
                        u('iser\u0103'), u('\xe2se\u015Fi'),
                        u('\xE2ser\u0103'), u('use\u015Fi'),
                        u('user\u0103'), u('ser\u0103m'),
                        'sesem', 'indu', u('\xE2ndu'), u('eaz\u0103'),
                        u('e\u015Fti'), u('e\u015Fte'), u('\u0103\u015Fti'),
                        u('\u0103\u015Fte'), u('ea\u0163i'), u('ia\u0163i'),
                        u('ar\u0103m'), u('ur\u0103m'), u('ir\u0103m'),
                        u('\xE2r\u0103m'), 'asem', 'isem',
                        u('\xE2sem'), 'usem', u('se\u015Fi'), u('ser\u0103'),
                        'sese', 'are', 'ere', 'ire', u('\xE2re'),
                        'ind', u('\xE2nd'), 'eze', 'ezi', 'esc',
                        u('\u0103sc'), 'eam', 'eai', 'eau', 'iam',
                        'iai', 'iau', u('a\u015Fi'), u('ar\u0103'),
                        u('u\u015Fi'), u('ur\u0103'), u('i\u015Fi'),
                        u('ir\u0103'), u('\xE2\u015Fi'), u('\xe2r\u0103'),
                        'ase', 'ise', u('\xE2se'), 'use', u('a\u0163i'),
                        u('e\u0163i'), u('i\u0163i'), u('\xe2\u0163i'), 'sei',
                        'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui',
                        u('\xE2i'), u('\u0103m'), 'em', 'im', u('\xE2m'),
                        'se')

    # Groups of suffixes that share one rewrite rule inside a step. They are
    # built once here (instead of on every call to stem()) and, like the
    # lists above, always go through u() when they contain non-ASCII text.

    # Step 0: suffix is replaced by "e" / by "i" / loses its last letter.
    __step0_to_e = ("ea", "ele", "elor")
    __step0_to_i = ("ii", "iua", "iei", "iile", "iilor", "ilor")
    __step0_drop_last = (u("a\u0163ie"), u("a\u0163ia"))

    # Step 1: suffix is replaced by "abil" / "iv" / "ic" / "at" / "it".
    __step1_to_abil = ("abilitate", "abilitati", u("abilit\u0103i"),
                       u("abilit\u0103\u0163i"))
    __step1_to_iv = ("ivitate", "ivitati", u("ivit\u0103i"),
                     u("ivit\u0103\u0163i"))
    __step1_to_ic = ("icitate", "icitati", u("icit\u0103i"),
                     u("icit\u0103\u0163i"), "icator", "icatori",
                     "iciv", "iciva", "icive", "icivi", u("iciv\u0103"),
                     "ical", "icala", "icale", "icali", u("ical\u0103"))
    __step1_to_at = ("ativ", "ativa", "ative", "ativi", u("ativ\u0103"),
                     u("a\u0163iune"), "atoare", "ator", "atori",
                     u("\u0103toare"), u("\u0103tor"), u("\u0103tori"))
    __step1_to_it = ("itiv", "itiva", "itive", "itivi", u("itiv\u0103"),
                     u("i\u0163iune"), "itoare", "itor", "itori")

    # Step 2: "iune"/"iuni" are only handled after this letter (U+0163);
    # the second group is replaced by "ist".
    __step2_t_cedilla = u("\u0163")
    __step2_to_ist = ("ism", "isme", "ist", "ista", "iste", "isti",
                      u("ist\u0103"), u("i\u015Fti"))

    # Step 3: suffixes removed without looking at the preceding letter, and
    # the letters that block removal of every other step-3 suffix ("u" is
    # intentionally not in this set, unlike __vowels).
    __step3_plain_delete = (u('seser\u0103\u0163i'), u('seser\u0103m'),
                            u('ser\u0103\u0163i'), u('sese\u015Fi'),
                            u('seser\u0103'), u('ser\u0103m'), 'sesem',
                            u('se\u015Fi'), u('ser\u0103'), 'sese',
                            u('a\u0163i'), u('e\u0163i'), u('i\u0163i'),
                            u('\xE2\u0163i'), 'sei', u('\u0103m'),
                            'em', 'im', u('\xE2m'), 'se')
    __step3_blocking_vowels = u("aeio\u0103\xE2\xEE")

    # Step 4: final vowels, tried in this order.
    __step4_suffixes = ("ie", "a", "e", "i", u("\u0103"))

    def stem(self, word):
        """
        Stem a Romanian word and return the stemmed form.

        The word is lower-cased, then run through five suffix-stripping
        steps (0 to 4). Steps 0, 1, 2 and 4 always run; step 3 runs only
        when neither step 1 nor step 2 reported success.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        step1_success = False
        step2_success = False

        # Upper-case every "u"/"i" that stands between two vowels. The
        # upper-case letters are not in __vowels, so the region helpers
        # below do not see them as vowels; they are lower-cased again
        # just before returning.
        for i in range(1, len(word) - 1):
            if (word[i - 1] in self.__vowels and
                    word[i + 1] in self.__vowels):
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1:]))
                elif word[i] == "i":
                    word = "".join((word[:i], "I", word[i + 1:]))

        # Regions of the word computed by the base-class helpers
        # (presumably the R1/R2/RV regions of the Snowball description --
        # the helpers are defined outside this class). r1 is never updated
        # afterwards; r2 and rv are only partially kept in sync below.
        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self._rv_standard(word, self.__vowels)

        # STEP 0: Removal of plurals and other simplifications
        # Only the first suffix the word ends with is considered.
        for suffix in self.__step0_suffixes:
            if word.endswith(suffix):
                if suffix in r1:
                    if suffix in ("ul", "ului"):
                        word = word[:-len(suffix)]
                        if suffix in rv:
                            rv = rv[:-len(suffix)]
                        else:
                            rv = ""
                    elif (suffix == "aua" or suffix == "atei" or
                          (suffix == "ile" and word[-5:-3] != "ab")):
                        # "ile" is left alone when preceded by "ab".
                        word = word[:-2]
                    elif suffix in self.__step0_to_e:
                        word = "".join((word[:-len(suffix)], "e"))
                        if suffix in rv:
                            rv = "".join((rv[:-len(suffix)], "e"))
                        else:
                            rv = ""
                    elif suffix in self.__step0_to_i:
                        word = "".join((word[:-len(suffix)], "i"))
                        if suffix in rv:
                            rv = "".join((rv[:-len(suffix)], "i"))
                        else:
                            rv = ""
                    elif suffix in self.__step0_drop_last:
                        word = word[:-1]
                break

        # STEP 1: Reduction of combining suffixes
        # Repeated until a pass makes no replacement.
        while True:
            replacement_done = False

            for suffix in self.__step1_suffixes:
                if word.endswith(suffix):
                    if suffix in r1:
                        step1_success = True
                        replacement_done = True

                        if suffix in self.__step1_to_abil:
                            word = "".join((word[:-len(suffix)], "abil"))
                        elif suffix == "ibilitate":
                            # Dropping "itate" leaves "ibil".
                            word = word[:-5]
                        elif suffix in self.__step1_to_iv:
                            word = "".join((word[:-len(suffix)], "iv"))
                        elif suffix in self.__step1_to_ic:
                            word = "".join((word[:-len(suffix)], "ic"))
                        elif suffix in self.__step1_to_at:
                            word = "".join((word[:-len(suffix)], "at"))
                            if suffix in r2:
                                r2 = "".join((r2[:-len(suffix)], "at"))
                        elif suffix in self.__step1_to_it:
                            word = "".join((word[:-len(suffix)], "it"))
                            if suffix in r2:
                                r2 = "".join((r2[:-len(suffix)], "it"))
                    else:
                        # NOTE(review): this also clears the flag when an
                        # earlier pass of the while loop did replace a
                        # suffix -- confirm this is intended.
                        step1_success = False
                    break

            if not replacement_done:
                break

        # STEP 2: Removal of standard suffixes
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if suffix in r2:
                    # NOTE(review): the flag is set before the check on
                    # the preceding letter, so "iune"/"iuni" count as a
                    # success (and skip step 3) even when nothing is
                    # removed -- confirm against the algorithm description.
                    step2_success = True

                    if suffix in ("iune", "iuni"):
                        if word[-5] == self.__step2_t_cedilla:
                            word = "".join((word[:-5], "t"))
                    elif suffix in self.__step2_to_ist:
                        word = "".join((word[:-len(suffix)], "ist"))
                    else:
                        word = word[:-len(suffix)]
                break

        # STEP 3: Removal of verb suffixes
        # Unlike the other steps, the search continues past a suffix the
        # word ends with when that suffix does not occur in rv.
        if not step1_success and not step2_success:
            for suffix in self.__step3_suffixes:
                if word.endswith(suffix):
                    if suffix in rv:
                        if suffix in self.__step3_plain_delete:
                            word = word[:-len(suffix)]
                            rv = rv[:-len(suffix)]
                        else:
                            # Removed only if something precedes the
                            # suffix in rv and that letter is not one of
                            # the blocking vowels.
                            if (not rv.startswith(suffix) and
                                    rv[rv.index(suffix) - 1] not in
                                    self.__step3_blocking_vowels):
                                word = word[:-len(suffix)]
                        break

        # STEP 4: Removal of final vowel
        for suffix in self.__step4_suffixes:
            if word.endswith(suffix):
                if suffix in rv:
                    word = word[:-len(suffix)]
                break

        # Undo the temporary upper-casing done before the region split.
        word = word.replace("I", "i").replace("U", "u")

        return word
|