porter.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. """
  2. Reimplementation of the
  3. `Porter stemming algorithm <http://tartarus.org/~martin/PorterStemmer/>`_
  4. in Python.
  5. In my quick tests, this implementation about 3.5 times faster than the
  6. seriously weird Python linked from the official page.
  7. """
  8. import re
  9. # Suffix replacement lists
  10. _step2list = {
  11. "ational": "ate",
  12. "tional": "tion",
  13. "enci": "ence",
  14. "anci": "ance",
  15. "izer": "ize",
  16. "bli": "ble",
  17. "alli": "al",
  18. "entli": "ent",
  19. "eli": "e",
  20. "ousli": "ous",
  21. "ization": "ize",
  22. "ation": "ate",
  23. "ator": "ate",
  24. "alism": "al",
  25. "iveness": "ive",
  26. "fulness": "ful",
  27. "ousness": "ous",
  28. "aliti": "al",
  29. "iviti": "ive",
  30. "biliti": "ble",
  31. "logi": "log",
  32. }
  33. _step3list = {
  34. "icate": "ic",
  35. "ative": "",
  36. "alize": "al",
  37. "iciti": "ic",
  38. "ical": "ic",
  39. "ful": "",
  40. "ness": "",
  41. }
  42. _cons = "[^aeiou]"
  43. _vowel = "[aeiouy]"
  44. _cons_seq = "[^aeiouy]+"
  45. _vowel_seq = "[aeiou]+"
  46. # m > 0
  47. _mgr0 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq)
  48. # m == 0
  49. _meq1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + "(" + _vowel_seq + ")?$")
  50. # m > 1
  51. _mgr1 = re.compile("^(" + _cons_seq + ")?" + _vowel_seq + _cons_seq + _vowel_seq + _cons_seq)
  52. # vowel in stem
  53. _s_v = re.compile("^(" + _cons_seq + ")?" + _vowel)
  54. # ???
  55. _c_v = re.compile("^" + _cons_seq + _vowel + "[^aeiouwxy]$")
  56. # Patterns used in the rules
  57. _ed_ing = re.compile("^(.*)(ed|ing)$")
  58. _at_bl_iz = re.compile("(at|bl|iz)$")
  59. _step1b = re.compile("([^aeiouylsz])\\1$")
  60. _step2 = re.compile("^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$")
  61. _step3 = re.compile("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$")
  62. _step4_1 = re.compile("^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$")
  63. _step4_2 = re.compile("^(.+?)(s|t)(ion)$")
  64. _step5 = re.compile("^(.+?)e$")
  65. # Stemming function
  66. def stem(w):
  67. """Uses the Porter stemming algorithm to remove suffixes from English
  68. words.
  69. >>> stem("fundamentally")
  70. "fundament"
  71. """
  72. if len(w) < 3:
  73. return w
  74. first_is_y = w[0] == "y"
  75. if first_is_y:
  76. w = "Y" + w[1:]
  77. # Step 1a
  78. if w.endswith("s"):
  79. if w.endswith("sses"):
  80. w = w[:-2]
  81. elif w.endswith("ies"):
  82. w = w[:-2]
  83. elif w[-2] != "s":
  84. w = w[:-1]
  85. # Step 1b
  86. if w.endswith("eed"):
  87. s = w[:-3]
  88. if _mgr0.match(s):
  89. w = w[:-1]
  90. else:
  91. m = _ed_ing.match(w)
  92. if m:
  93. stem = m.group(1)
  94. if _s_v.match(stem):
  95. w = stem
  96. if _at_bl_iz.match(w):
  97. w += "e"
  98. elif _step1b.match(w):
  99. w = w[:-1]
  100. elif _c_v.match(w):
  101. w += "e"
  102. # Step 1c
  103. if w.endswith("y"):
  104. stem = w[:-1]
  105. if _s_v.match(stem):
  106. w = stem + "i"
  107. # Step 2
  108. m = _step2.match(w)
  109. if m:
  110. stem = m.group(1)
  111. suffix = m.group(2)
  112. if _mgr0.match(stem):
  113. w = stem + _step2list[suffix]
  114. # Step 3
  115. m = _step3.match(w)
  116. if m:
  117. stem = m.group(1)
  118. suffix = m.group(2)
  119. if _mgr0.match(stem):
  120. w = stem + _step3list[suffix]
  121. # Step 4
  122. m = _step4_1.match(w)
  123. if m:
  124. stem = m.group(1)
  125. if _mgr1.match(stem):
  126. w = stem
  127. else:
  128. m = _step4_2.match(w)
  129. if m:
  130. stem = m.group(1) + m.group(2)
  131. if _mgr1.match(stem):
  132. w = stem
  133. # Step 5
  134. m = _step5.match(w)
  135. if m:
  136. stem = m.group(1)
  137. if _mgr1.match(stem) or (_meq1.match(stem) and not _c_v.match(stem)):
  138. w = stem
  139. if w.endswith("ll") and _mgr1.match(w):
  140. w = w[:-1]
  141. if first_is_y:
  142. w = "y" + w[1:]
  143. return w