123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348 |
- from .bases import _StandardStemmer
- from whoosh.compat import u
class FrenchStemmer(_StandardStemmer):
    """
    The French Snowball stemmer.

    :cvar __vowels: The French vowels.
    :type __vowels: unicode
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm.
    :type __step2a_suffixes: tuple
    :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm.
    :type __step2b_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :note: A detailed description of the French
           stemming algorithm can be found under
           http://snowball.tartarus.org/algorithms/french/stemmer.html
    """

    __vowels = u("aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9")

    # NOTE: every suffix tuple is scanned front to back and the scan stops at
    # the first suffix the word ends with, so longer suffixes are listed
    # before the shorter suffixes they contain.
    __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice',
                        'ateurs', 'ations', 'logies', 'usions',
                        'utions', 'ements', 'amment', 'emment',
                        'ances', 'iqUes', 'ismes', 'ables', 'istes',
                        'ateur', 'ation', 'logie', 'usion', 'ution',
                        'ences', 'ement', 'euses', 'ments', 'ance',
                        'iqUe', 'isme', 'able', 'iste', 'ence',
                        u('it\xE9s'), 'ives', 'eaux', 'euse', 'ment',
                        'eux', u('it\xE9'), 'ive', 'ifs', 'aux', 'if')
    __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante',
                         'issants', 'issions', 'irions', 'issais',
                         'issait', 'issant', 'issent', 'issiez', 'issons',
                         'irais', 'irait', 'irent', 'iriez', 'irons',
                         'iront', 'isses', 'issez', u('\xEEmes'),
                         u('\xEEtes'), 'irai', 'iras', 'irez', 'isse',
                         'ies', 'ira', u('\xEEt'), 'ie', 'ir', 'is',
                         'it', 'i')
    __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent',
                         'assiez', u('\xE8rent'), 'erais', 'erait',
                         'eriez', 'erons', 'eront', 'aIent', 'antes',
                         'asses', 'ions', 'erai', 'eras', 'erez',
                         u('\xE2mes'), u('\xE2tes'), 'ante', 'ants',
                         'asse', u('\xE9es'), 'era', 'iez', 'ais',
                         'ait', 'ant', u('\xE9e'), u('\xE9s'), 'er',
                         'ez', u('\xE2t'), 'ai', 'as', u('\xE9'), 'a')
    __step4_suffixes = (u('i\xE8re'), u('I\xE8re'), 'ion', 'ier', 'Ier',
                        'e', u('\xEB'))

    def stem(self, word):
        """
        Stem a French word and return the stemmed form.

        The word is lower-cased, some occurrences of 'u', 'i' and 'y' are
        temporarily upper-cased so that they are not treated as vowels, the
        regions R1, R2 and RV are computed once, and then the suffix-removal
        steps 1 to 6 are applied in order. The temporary upper-case markers
        are lowered again before the result is returned.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        step1_success = False
        rv_ending_found = False
        step2a_success = False
        step2b_success = False

        # Every occurrence of 'u' after 'q' is put into upper case.
        word = word.replace("qu", "qU")

        # Every occurrence of 'u' and 'i'
        # between vowels is put into upper case.
        # Every occurrence of 'y' preceded or
        # followed by a vowel is also put into upper case.
        # Only interior positions are visited, so the first and the last
        # character of the word are never changed by this loop.
        for i in range(1, len(word) - 1):
            if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels:
                if word[i] == "u":
                    word = "".join((word[:i], "U", word[i + 1:]))

                elif word[i] == "i":
                    word = "".join((word[:i], "I", word[i + 1:]))

            if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels:
                if word[i] == "y":
                    word = "".join((word[:i], "Y", word[i + 1:]))

        # r1 and r2 are computed once and never updated afterwards; rv is
        # only updated in the branches below that explicitly reassign it.
        # Region membership is tested with ``in`` (substring containment).
        # NOTE(review): _r1r2_standard is not defined in this class --
        # presumably inherited from _StandardStemmer.
        r1, r2 = self._r1r2_standard(word, self.__vowels)
        rv = self.__rv_french(word, self.__vowels)

        # STEP 1: Standard suffix removal
        for suffix in self.__step1_suffixes:
            if word.endswith(suffix):
                if suffix == "eaux":
                    # 'eaux' -> 'eau': only the trailing 'x' is dropped.
                    word = word[:-1]
                    step1_success = True

                elif suffix in ("euse", "euses"):
                    if suffix in r2:
                        word = word[:-len(suffix)]
                        step1_success = True

                    elif suffix in r1:
                        word = "".join((word[:-len(suffix)], "eux"))
                        step1_success = True

                elif suffix in ("ement", "ements") and suffix in rv:
                    word = word[:-len(suffix)]
                    step1_success = True

                    # Follow-up on what now precedes the removed suffix.
                    if word[-2:] == "iv" and "iv" in r2:
                        word = word[:-2]

                        if word[-2:] == "at" and "at" in r2:
                            word = word[:-2]

                    elif word[-3:] == "eus":
                        if "eus" in r2:
                            word = word[:-3]
                        elif "eus" in r1:
                            # 'eus' -> 'eux'
                            word = "".join((word[:-1], "x"))

                    elif word[-3:] in ("abl", "iqU"):
                        if "abl" in r2 or "iqU" in r2:
                            word = word[:-3]

                    elif word[-3:] in (u("i\xE8r"), u("I\xE8r")):
                        if u("i\xE8r") in rv or u("I\xE8r") in rv:
                            word = "".join((word[:-3], "i"))

                elif suffix == "amment" and suffix in rv:
                    # 'amment' -> 'ant'; rv is kept in step with the word.
                    word = "".join((word[:-6], "ant"))
                    rv = "".join((rv[:-6], "ant"))
                    rv_ending_found = True

                elif suffix == "emment" and suffix in rv:
                    # 'emment' -> 'ent'; rv is NOT updated in this branch.
                    word = "".join((word[:-6], "ent"))
                    rv_ending_found = True

                elif (suffix in ("ment", "ments") and suffix in rv and
                      not rv.startswith(suffix) and
                      rv[rv.rindex(suffix) - 1] in self.__vowels):
                    # Removed only when a vowel inside rv precedes it.
                    word = word[:-len(suffix)]
                    rv = rv[:-len(suffix)]
                    rv_ending_found = True

                elif suffix == "aux" and suffix in r1:
                    # 'aux' -> 'al'
                    word = "".join((word[:-2], "l"))
                    step1_success = True

                elif (suffix in ("issement", "issements") and suffix in r1
                      and word[-len(suffix) - 1] not in self.__vowels):
                    word = word[:-len(suffix)]
                    step1_success = True

                elif suffix in ("ance", "iqUe", "isme", "able", "iste",
                                "eux", "ances", "iqUes", "ismes",
                                "ables", "istes") and suffix in r2:
                    word = word[:-len(suffix)]
                    step1_success = True

                elif suffix in ("atrice", "ateur", "ation", "atrices",
                                "ateurs", "ations") and suffix in r2:
                    word = word[:-len(suffix)]
                    step1_success = True

                    if word[-2:] == "ic":
                        if "ic" in r2:
                            word = word[:-2]
                        else:
                            word = "".join((word[:-2], "iqU"))

                elif suffix in ("logie", "logies") and suffix in r2:
                    word = "".join((word[:-len(suffix)], "log"))
                    step1_success = True

                elif (suffix in ("usion", "ution", "usions", "utions") and
                      suffix in r2):
                    word = "".join((word[:-len(suffix)], "u"))
                    step1_success = True

                elif suffix in ("ence", "ences") and suffix in r2:
                    word = "".join((word[:-len(suffix)], "ent"))
                    step1_success = True

                elif suffix in (u("it\xE9"), u("it\xE9s")) and suffix in r2:
                    word = word[:-len(suffix)]
                    step1_success = True

                    if word[-4:] == "abil":
                        if "abil" in r2:
                            word = word[:-4]
                        else:
                            # 'abil' -> 'abl'
                            word = "".join((word[:-2], "l"))

                    elif word[-2:] == "ic":
                        if "ic" in r2:
                            word = word[:-2]
                        else:
                            word = "".join((word[:-2], "iqU"))

                    elif word[-2:] == "iv":
                        if "iv" in r2:
                            word = word[:-2]

                elif (suffix in ("if", "ive", "ifs", "ives") and
                      suffix in r2):
                    word = word[:-len(suffix)]
                    step1_success = True

                    if word[-2:] == "at" and "at" in r2:
                        word = word[:-2]

                        if word[-2:] == "ic":
                            if "ic" in r2:
                                word = word[:-2]
                            else:
                                word = "".join((word[:-2], "iqU"))
                # Only the first (longest) matching suffix is considered,
                # whether or not it was actually removed.
                break

        # STEP 2a: Verb suffixes beginning 'i'
        # Runs when step 1 removed nothing, or when step 1 handled one of
        # 'amment', 'emment', 'ment', 'ments'.
        if not step1_success or rv_ending_found:
            for suffix in self.__step2a_suffixes:
                if word.endswith(suffix):
                    if (suffix in rv and len(rv) > len(suffix) and
                            rv[rv.rindex(suffix) - 1] not in self.__vowels):
                        word = word[:-len(suffix)]
                        step2a_success = True
                    break

            # STEP 2b: Other verb suffixes
            # Only attempted when step 2a was run but removed nothing.
            if not step2a_success:
                for suffix in self.__step2b_suffixes:
                    if rv.endswith(suffix):
                        if suffix == "ions" and "ions" in r2:
                            word = word[:-4]
                            step2b_success = True

                        elif suffix in ('eraIent', 'erions', u('\xE8rent'),
                                        'erais', 'erait', 'eriez',
                                        'erons', 'eront', 'erai', 'eras',
                                        'erez', u('\xE9es'), 'era', 'iez',
                                        u('\xE9e'), u('\xE9s'), 'er', 'ez',
                                        u('\xE9')):
                            word = word[:-len(suffix)]
                            step2b_success = True

                        elif suffix in ('assions', 'assent', 'assiez',
                                        'aIent', 'antes', 'asses',
                                        u('\xE2mes'), u('\xE2tes'), 'ante',
                                        'ants', 'asse', 'ais', 'ait',
                                        'ant', u('\xE2t'), 'ai', 'as',
                                        'a'):
                            word = word[:-len(suffix)]
                            rv = rv[:-len(suffix)]
                            step2b_success = True
                            # A preceding 'e' inside rv is removed as well.
                            if rv.endswith("e"):
                                word = word[:-1]
                        break

        # STEP 3
        # Slices (word[-1:]) are used instead of word[-1] so that an empty
        # word cannot raise IndexError.
        if step1_success or step2a_success or step2b_success:
            if word[-1:] == "Y":
                word = "".join((word[:-1], "i"))
            elif word[-1:] == u("\xE7"):
                word = "".join((word[:-1], "c"))

        # STEP 4: Residual suffixes
        else:
            if (len(word) >= 2 and word[-1] == "s" and
                    word[-2] not in u("aiou\xE8s")):
                word = word[:-1]

            for suffix in self.__step4_suffixes:
                if word.endswith(suffix):
                    if suffix in rv:
                        if (suffix == "ion" and suffix in r2 and
                                rv[-4] in "st"):
                            word = word[:-3]

                        elif suffix in ("ier", u("i\xE8re"), "Ier",
                                        u("I\xE8re")):
                            word = "".join((word[:-len(suffix)], "i"))

                        elif suffix == "e":
                            word = word[:-1]

                        elif suffix == u("\xEB") and word[-3:-1] == "gu":
                            word = word[:-1]
                        break

        # STEP 5: Undouble
        if word.endswith(("enn", "onn", "ett", "ell", "eill")):
            word = word[:-1]

        # STEP 6: Un-accent
        # Scan backwards for the last vowel (the first character of the word
        # is never examined). If that vowel is an accented 'e' and is not
        # the final character, replace it with a plain 'e'.
        for i in range(1, len(word)):
            if word[-i] in self.__vowels:
                if i != 1 and word[-i] in (u("\xE9"), u("\xE8")):
                    word = "".join((word[:-i], "e", word[-i + 1:]))
                break

        # Undo the temporary upper-case markers introduced at the start.
        word = (word.replace("I", "i")
                    .replace("U", "u")
                    .replace("Y", "y"))

        return word

    def __rv_french(self, word, vowels):
        """
        Return the region RV that is used by the French stemmer.

        If the word begins with two vowels, RV is the region after
        the third letter. Otherwise, it is the region after the first
        vowel not at the beginning of the word, or the end of the word
        if these positions cannot be found. (Exceptionally, u'par',
        u'col' or u'tap' at the beginning of a word is also taken to
        define RV as the region to their right.)

        :param word: The French word whose region RV is determined.
        :type word: str or unicode
        :param vowels: The French vowels that are used to determine
                       the region RV.
        :type vowels: unicode
        :return: the region RV for the respective French word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of
               the subclass FrenchStemmer. It is not to be invoked directly!
        """
        # Words shorter than two characters have an empty RV.
        rv = ""

        if len(word) >= 2:
            if (word.startswith(("par", "col", "tap")) or
                    (word[0] in vowels and word[1] in vowels)):
                rv = word[3:]
            else:
                # Start at index 1: a vowel at the very beginning of the
                # word does not count.
                for i in range(1, len(word)):
                    if word[i] in vowels:
                        rv = word[i + 1:]
                        break

        return rv
|