123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- # coding=utf-8
- # Copyright 2012 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- # Exceptions
- class NoStemmer(Exception):
- pass
- class NoStopWords(Exception):
- pass
- # Data and functions for language names
- languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt",
- "ro", "ru", "es", "sv", "tr")
- aliases = {
- # By ISO 639-1 three letter codes
- "ara": "ar",
- "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr",
- "deu": "de", "hun": "hu", "ita": "it", "nor": "no", "por": "pt",
- "ron": "ro", "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr",
- # By name in English
- "arabic": "ar",
- "danish": "da",
- "dutch": "nl",
- "english": "en",
- "finnish": "fi",
- "french": "fr",
- "german": "de",
- "hungarian": "hu",
- "italian": "it",
- "norwegian": "no",
- "portuguese": "pt",
- "romanian": "ro",
- "russian": "ru",
- "spanish": "es",
- "swedish": "sv",
- "turkish": "tr",
- # By name in own language
- "العربية": "ar",
- "dansk": "da",
- "nederlands": "nl",
- "suomi": "fi",
- "français": "fr",
- "deutsch": "de",
- "magyar": "hu",
- "italiano": "it",
- "norsk": "no",
- "português": "pt",
- "русский язык": "ru",
- "español": "es",
- "svenska": "sv",
- "türkçe": "tr",
- }
- def two_letter_code(name):
- if name in languages:
- return name
- if name in aliases:
- return aliases[name]
- return None
- # Getter functions
- def has_stemmer(lang):
- try:
- return bool(stemmer_for_language(lang))
- except NoStemmer:
- return False
- def has_stopwords(lang):
- try:
- return bool(stopwords_for_language(lang))
- except NoStopWords:
- return False
- def stemmer_for_language(lang):
- if lang == "en_porter":
- # Original porter stemming algorithm is several times faster than the
- # more correct porter2 algorithm in snowball package
- from .porter import stem as porter_stem
- return porter_stem
- tlc = two_letter_code(lang)
- if tlc == "ar":
- from .isri import ISRIStemmer
- return ISRIStemmer().stem
- from .snowball import classes as snowball_classes
- if tlc in snowball_classes:
- return snowball_classes[tlc]().stem
- raise NoStemmer("No stemmer available for %r" % lang)
- def stopwords_for_language(lang):
- from .stopwords import stoplists
- tlc = two_letter_code(lang)
- if tlc in stoplists:
- return stoplists[tlc]
- raise NoStopWords("No stop-word list available for %r" % lang)
|