# coding=utf-8
# Copyright 2012 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
# Exceptions
  29. class NoStemmer(Exception):
  30. pass
  31. class NoStopWords(Exception):
  32. pass
# Data and functions for language names
  34. languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt",
  35. "ro", "ru", "es", "sv", "tr")
  36. aliases = {
  37. # By ISO 639-1 three letter codes
  38. "ara": "ar",
  39. "dan": "da", "nld": "nl", "eng": "en", "fin": "fi", "fra": "fr",
  40. "deu": "de", "hun": "hu", "ita": "it", "nor": "no", "por": "pt",
  41. "ron": "ro", "rus": "ru", "spa": "es", "swe": "sv", "tur": "tr",
  42. # By name in English
  43. "arabic": "ar",
  44. "danish": "da",
  45. "dutch": "nl",
  46. "english": "en",
  47. "finnish": "fi",
  48. "french": "fr",
  49. "german": "de",
  50. "hungarian": "hu",
  51. "italian": "it",
  52. "norwegian": "no",
  53. "portuguese": "pt",
  54. "romanian": "ro",
  55. "russian": "ru",
  56. "spanish": "es",
  57. "swedish": "sv",
  58. "turkish": "tr",
  59. # By name in own language
  60. "العربية": "ar",
  61. "dansk": "da",
  62. "nederlands": "nl",
  63. "suomi": "fi",
  64. "français": "fr",
  65. "deutsch": "de",
  66. "magyar": "hu",
  67. "italiano": "it",
  68. "norsk": "no",
  69. "português": "pt",
  70. "русский язык": "ru",
  71. "español": "es",
  72. "svenska": "sv",
  73. "türkçe": "tr",
  74. }
  75. def two_letter_code(name):
  76. if name in languages:
  77. return name
  78. if name in aliases:
  79. return aliases[name]
  80. return None
# Getter functions
  82. def has_stemmer(lang):
  83. try:
  84. return bool(stemmer_for_language(lang))
  85. except NoStemmer:
  86. return False
  87. def has_stopwords(lang):
  88. try:
  89. return bool(stopwords_for_language(lang))
  90. except NoStopWords:
  91. return False
  92. def stemmer_for_language(lang):
  93. if lang == "en_porter":
  94. # Original porter stemming algorithm is several times faster than the
  95. # more correct porter2 algorithm in snowball package
  96. from .porter import stem as porter_stem
  97. return porter_stem
  98. tlc = two_letter_code(lang)
  99. if tlc == "ar":
  100. from .isri import ISRIStemmer
  101. return ISRIStemmer().stem
  102. from .snowball import classes as snowball_classes
  103. if tlc in snowball_classes:
  104. return snowball_classes[tlc]().stem
  105. raise NoStemmer("No stemmer available for %r" % lang)
  106. def stopwords_for_language(lang):
  107. from .stopwords import stoplists
  108. tlc = two_letter_code(lang)
  109. if tlc in stoplists:
  110. return stoplists[tlc]
  111. raise NoStopWords("No stop-word list available for %r" % lang)