# bases.py
# Base classes
  2. class _ScandinavianStemmer(object):
  3. """
  4. This subclass encapsulates a method for defining the string region R1.
  5. It is used by the Danish, Norwegian, and Swedish stemmer.
  6. """
  7. def _r1_scandinavian(self, word, vowels):
  8. """
  9. Return the region R1 that is used by the Scandinavian stemmers.
  10. R1 is the region after the first non-vowel following a vowel,
  11. or is the null region at the end of the word if there is no
  12. such non-vowel. But then R1 is adjusted so that the region
  13. before it contains at least three letters.
  14. :param word: The word whose region R1 is determined.
  15. :type word: str or unicode
  16. :param vowels: The vowels of the respective language that are
  17. used to determine the region R1.
  18. :type vowels: unicode
  19. :return: the region R1 for the respective word.
  20. :rtype: unicode
  21. :note: This helper method is invoked by the respective stem method of
  22. the subclasses DanishStemmer, NorwegianStemmer, and
  23. SwedishStemmer. It is not to be invoked directly!
  24. """
  25. r1 = ""
  26. for i in range(1, len(word)):
  27. if word[i] not in vowels and word[i - 1] in vowels:
  28. if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0:
  29. r1 = word[3:]
  30. elif len(word[:i + 1]) >= 3:
  31. r1 = word[i + 1:]
  32. else:
  33. return word
  34. break
  35. return r1
  36. class _StandardStemmer(object):
  37. """
  38. This subclass encapsulates two methods for defining the standard versions
  39. of the string regions R1, R2, and RV.
  40. """
  41. def _r1r2_standard(self, word, vowels):
  42. """
  43. Return the standard interpretations of the string regions R1 and R2.
  44. R1 is the region after the first non-vowel following a vowel,
  45. or is the null region at the end of the word if there is no
  46. such non-vowel.
  47. R2 is the region after the first non-vowel following a vowel
  48. in R1, or is the null region at the end of the word if there
  49. is no such non-vowel.
  50. :param word: The word whose regions R1 and R2 are determined.
  51. :type word: str or unicode
  52. :param vowels: The vowels of the respective language that are
  53. used to determine the regions R1 and R2.
  54. :type vowels: unicode
  55. :return: (r1,r2), the regions R1 and R2 for the respective word.
  56. :rtype: tuple
  57. :note: This helper method is invoked by the respective stem method of
  58. the subclasses DutchStemmer, FinnishStemmer,
  59. FrenchStemmer, GermanStemmer, ItalianStemmer,
  60. PortugueseStemmer, RomanianStemmer, and SpanishStemmer.
  61. It is not to be invoked directly!
  62. :note: A detailed description of how to define R1 and R2
  63. can be found at http://snowball.tartarus.org/texts/r1r2.html
  64. """
  65. r1 = ""
  66. r2 = ""
  67. for i in range(1, len(word)):
  68. if word[i] not in vowels and word[i - 1] in vowels:
  69. r1 = word[i + 1:]
  70. break
  71. for i in range(1, len(r1)):
  72. if r1[i] not in vowels and r1[i - 1] in vowels:
  73. r2 = r1[i + 1:]
  74. break
  75. return (r1, r2)
  76. def _rv_standard(self, word, vowels):
  77. """
  78. Return the standard interpretation of the string region RV.
  79. If the second letter is a consonant, RV is the region after the
  80. next following vowel. If the first two letters are vowels, RV is
  81. the region after the next following consonant. Otherwise, RV is
  82. the region after the third letter.
  83. :param word: The word whose region RV is determined.
  84. :type word: str or unicode
  85. :param vowels: The vowels of the respective language that are
  86. used to determine the region RV.
  87. :type vowels: unicode
  88. :return: the region RV for the respective word.
  89. :rtype: unicode
  90. :note: This helper method is invoked by the respective stem method of
  91. the subclasses ItalianStemmer, PortugueseStemmer,
  92. RomanianStemmer, and SpanishStemmer. It is not to be
  93. invoked directly!
  94. """
  95. rv = ""
  96. if len(word) >= 2:
  97. if word[1] not in vowels:
  98. for i in range(2, len(word)):
  99. if word[i] in vowels:
  100. rv = word[i + 1:]
  101. break
  102. elif word[:2] in vowels:
  103. for i in range(2, len(word)):
  104. if word[i] not in vowels:
  105. rv = word[i + 1:]
  106. break
  107. else:
  108. rv = word[3:]
  109. return rv