# lovins.py (12 KB)
"""This module implements the Lovins stemming algorithm. Use the ``stem()``
function::

    stemmed_word = stem(word)
"""
from collections import defaultdict
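# Conditions
#
# Each condition takes the candidate stem (the word with an ending cut off)
# and returns True when that ending may be removed.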
  7. def A(base):
  8. # A No restrictions on stem
  9. return True
  10. def B(base):
  11. # B Minimum stem length = 3
  12. return len(base) > 2
  13. def C(base):
  14. # C Minimum stem length = 4
  15. return len(base) > 3
  16. def D(base):
  17. # D Minimum stem length = 5
  18. return len(base) > 4
  19. def E(base):
  20. # E Do not remove ending after e
  21. return base[-1] != "e"
  22. def F(base):
  23. # F Minimum stem length = 3 and do not remove ending after e
  24. return len(base) > 2 and base[-1] != "e"
  25. def G(base):
  26. # G Minimum stem length = 3 and remove ending only after f
  27. return len(base) > 2 and base[-1] == "f"
  28. def H(base):
  29. # H Remove ending only after t or ll
  30. c1, c2 = base[-2:]
  31. return c2 == "t" or (c2 == "l" and c1 == "l")
  32. def I(base):
  33. # I Do not remove ending after o or e
  34. c = base[-1]
  35. return c != "o" and c != "e"
  36. def J(base):
  37. # J Do not remove ending after a or e
  38. c = base[-1]
  39. return c != "a" and c != "e"
  40. def K(base):
  41. # K Minimum stem length = 3 and remove ending only after l, i or u*e
  42. c = base[-1]
  43. cc = base[-3]
  44. return len(base) > 2 and (c == "l" or c == "i" or (c == "e" and cc == "u"))
  45. def L(base):
  46. # L Do not remove ending after u, x or s, unless s follows o
  47. c1, c2 = base[-2:]
  48. return c2 != "u" and c2 != "x" and (c2 != "s" or c1 == "o")
  49. def M(base):
  50. # M Do not remove ending after a, c, e or m
  51. c = base[-1]
  52. return c != "a" and c != "c" and c != "e" and c != "m"
  53. def N(base):
  54. # N Minimum stem length = 4 after s**, elsewhere = 3
  55. return len(base) > 3 or (len(base) == 3 and base[-1] != "s")
  56. def O(base):
  57. # O Remove ending only after l or i
  58. c = base[-1]
  59. return c == "l" or c == "i"
  60. def P(base):
  61. # P Do not remove ending after c
  62. return base[-1] != "c"
  63. def Q(base):
  64. # Q Minimum stem length = 3 and do not remove ending after l or n
  65. c = base[-1]
  66. return len(base) > 2 and (c != "l" and c != "n")
  67. def R(base):
  68. # R Remove ending only after n or r
  69. c = base[-1]
  70. return c == "n" or c == "r"
  71. def S(base):
  72. # S Remove ending only after dr or t, unless t follows t
  73. l2 = base[-2]
  74. return l2 == "rd" or (base[-1] == "t" and l2 != "tt")
  75. def T(base):
  76. # T Remove ending only after s or t, unless t follows o
  77. c1, c2 = base[-2:]
  78. return c2 == "s" or (c2 == "t" and c1 != "o")
  79. def U(base):
  80. # U Remove ending only after l, m, n or r
  81. c = base[-1]
  82. return c == "l" or c == "m" or c == "n" or c == "r"
  83. def V(base):
  84. # V Remove ending only after c
  85. return base[-1] == "c"
  86. def W(base):
  87. # W Do not remove ending after s or u
  88. c = base[-1]
  89. return c != "s" and c != "u"
  90. def X(base):
  91. # X Remove ending only after l, i or u*e
  92. c = base[-1]
  93. cc = base[-3]
  94. return c == "l" or c == "i" or (c == "e" and cc == "u")
  95. def Y(base):
  96. # Y Remove ending only after in
  97. return base[-2:] == "in"
  98. def Z(base):
  99. # Z Do not remove ending after f
  100. return base[-1] != "f"
  101. def a(base):
  102. # a Remove ending only after d, f, ph, th, l, er, or, es or t
  103. c = base[-1]
  104. l2 = base[-2:]
  105. return (c == "d" or c == "f" or l2 == "ph" or l2 == "th" or c == "l"
  106. or l2 == "er" or l2 == "or" or l2 == "es" or c == "t")
  107. def b(base):
  108. # b Minimum stem length = 3 and do not remove ending after met or ryst
  109. return len(base) > 2 and not (base.endswith("met")
  110. or base.endswith("ryst"))
  111. def c(base):
  112. # c Remove ending only after l
  113. return base[-1] == "l"
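# Endings
#
# m[n] maps every removable ending of length n to the condition that the
# remaining stem must satisfy before the ending is removed.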
  115. m = [None] * 12
  116. m[11] = dict((
  117. ("alistically", B),
  118. ("arizability", A),
  119. ("izationally", B)))
  120. m[10] = dict((
  121. ("antialness", A),
  122. ("arisations", A),
  123. ("arizations", A),
  124. ("entialness", A)))
  125. m[9] = dict((
  126. ("allically", C),
  127. ("antaneous", A),
  128. ("antiality", A),
  129. ("arisation", A),
  130. ("arization", A),
  131. ("ationally", B),
  132. ("ativeness", A),
  133. ("eableness", E),
  134. ("entations", A),
  135. ("entiality", A),
  136. ("entialize", A),
  137. ("entiation", A),
  138. ("ionalness", A),
  139. ("istically", A),
  140. ("itousness", A),
  141. ("izability", A),
  142. ("izational", A)))
  143. m[8] = dict((
  144. ("ableness", A),
  145. ("arizable", A),
  146. ("entation", A),
  147. ("entially", A),
  148. ("eousness", A),
  149. ("ibleness", A),
  150. ("icalness", A),
  151. ("ionalism", A),
  152. ("ionality", A),
  153. ("ionalize", A),
  154. ("iousness", A),
  155. ("izations", A),
  156. ("lessness", A)))
  157. m[7] = dict((
  158. ("ability", A),
  159. ("aically", A),
  160. ("alistic", B),
  161. ("alities", A),
  162. ("ariness", E),
  163. ("aristic", A),
  164. ("arizing", A),
  165. ("ateness", A),
  166. ("atingly", A),
  167. ("ational", B),
  168. ("atively", A),
  169. ("ativism", A),
  170. ("elihood", E),
  171. ("encible", A),
  172. ("entally", A),
  173. ("entials", A),
  174. ("entiate", A),
  175. ("entness", A),
  176. ("fulness", A),
  177. ("ibility", A),
  178. ("icalism", A),
  179. ("icalist", A),
  180. ("icality", A),
  181. ("icalize", A),
  182. ("ication", G),
  183. ("icianry", A),
  184. ("ination", A),
  185. ("ingness", A),
  186. ("ionally", A),
  187. ("isation", A),
  188. ("ishness", A),
  189. ("istical", A),
  190. ("iteness", A),
  191. ("iveness", A),
  192. ("ivistic", A),
  193. ("ivities", A),
  194. ("ization", F),
  195. ("izement", A),
  196. ("oidally", A),
  197. ("ousness", A)))
  198. m[6] = dict((
  199. ("aceous", A),
  200. ("acious", B),
  201. ("action", G),
  202. ("alness", A),
  203. ("ancial", A),
  204. ("ancies", A),
  205. ("ancing", B),
  206. ("ariser", A),
  207. ("arized", A),
  208. ("arizer", A),
  209. ("atable", A),
  210. ("ations", B),
  211. ("atives", A),
  212. ("eature", Z),
  213. ("efully", A),
  214. ("encies", A),
  215. ("encing", A),
  216. ("ential", A),
  217. ("enting", C),
  218. ("entist", A),
  219. ("eously", A),
  220. ("ialist", A),
  221. ("iality", A),
  222. ("ialize", A),
  223. ("ically", A),
  224. ("icance", A),
  225. ("icians", A),
  226. ("icists", A),
  227. ("ifully", A),
  228. ("ionals", A),
  229. ("ionate", D),
  230. ("ioning", A),
  231. ("ionist", A),
  232. ("iously", A),
  233. ("istics", A),
  234. ("izable", E),
  235. ("lessly", A),
  236. ("nesses", A),
  237. ("oidism", A)))
  238. m[5] = dict((
  239. ("acies", A),
  240. ("acity", A),
  241. ("aging", B),
  242. ("aical", A),
  243. ("alist", A),
  244. ("alism", B),
  245. ("ality", A),
  246. ("alize", A),
  247. ("allic", b),
  248. ("anced", B),
  249. ("ances", B),
  250. ("antic", C),
  251. ("arial", A),
  252. ("aries", A),
  253. ("arily", A),
  254. ("arity", B),
  255. ("arize", A),
  256. ("aroid", A),
  257. ("ately", A),
  258. ("ating", I),
  259. ("ation", B),
  260. ("ative", A),
  261. ("ators", A),
  262. ("atory", A),
  263. ("ature", E),
  264. ("early", Y),
  265. ("ehood", A),
  266. ("eless", A),
  267. ("elily", A),
  268. ("ement", A),
  269. ("enced", A),
  270. ("ences", A),
  271. ("eness", E),
  272. ("ening", E),
  273. ("ental", A),
  274. ("ented", C),
  275. ("ently", A),
  276. ("fully", A),
  277. ("ially", A),
  278. ("icant", A),
  279. ("ician", A),
  280. ("icide", A),
  281. ("icism", A),
  282. ("icist", A),
  283. ("icity", A),
  284. ("idine", I),
  285. ("iedly", A),
  286. ("ihood", A),
  287. ("inate", A),
  288. ("iness", A),
  289. ("ingly", B),
  290. ("inism", J),
  291. ("inity", c),
  292. ("ional", A),
  293. ("ioned", A),
  294. ("ished", A),
  295. ("istic", A),
  296. ("ities", A),
  297. ("itous", A),
  298. ("ively", A),
  299. ("ivity", A),
  300. ("izers", F),
  301. ("izing", F),
  302. ("oidal", A),
  303. ("oides", A),
  304. ("otide", A),
  305. ("ously", A)))
  306. m[4] = dict((
  307. ("able", A),
  308. ("ably", A),
  309. ("ages", B),
  310. ("ally", B),
  311. ("ance", B),
  312. ("ancy", B),
  313. ("ants", B),
  314. ("aric", A),
  315. ("arly", K),
  316. ("ated", I),
  317. ("ates", A),
  318. ("atic", B),
  319. ("ator", A),
  320. ("ealy", Y),
  321. ("edly", E),
  322. ("eful", A),
  323. ("eity", A),
  324. ("ence", A),
  325. ("ency", A),
  326. ("ened", E),
  327. ("enly", E),
  328. ("eous", A),
  329. ("hood", A),
  330. ("ials", A),
  331. ("ians", A),
  332. ("ible", A),
  333. ("ibly", A),
  334. ("ical", A),
  335. ("ides", L),
  336. ("iers", A),
  337. ("iful", A),
  338. ("ines", M),
  339. ("ings", N),
  340. ("ions", B),
  341. ("ious", A),
  342. ("isms", B),
  343. ("ists", A),
  344. ("itic", H),
  345. ("ized", F),
  346. ("izer", F),
  347. ("less", A),
  348. ("lily", A),
  349. ("ness", A),
  350. ("ogen", A),
  351. ("ward", A),
  352. ("wise", A),
  353. ("ying", B),
  354. ("yish", A)))
  355. m[3] = dict((
  356. ("acy", A),
  357. ("age", B),
  358. ("aic", A),
  359. ("als", b),
  360. ("ant", B),
  361. ("ars", O),
  362. ("ary", F),
  363. ("ata", A),
  364. ("ate", A),
  365. ("eal", Y),
  366. ("ear", Y),
  367. ("ely", E),
  368. ("ene", E),
  369. ("ent", C),
  370. ("ery", E),
  371. ("ese", A),
  372. ("ful", A),
  373. ("ial", A),
  374. ("ian", A),
  375. ("ics", A),
  376. ("ide", L),
  377. ("ied", A),
  378. ("ier", A),
  379. ("ies", P),
  380. ("ily", A),
  381. ("ine", M),
  382. ("ing", N),
  383. ("ion", Q),
  384. ("ish", C),
  385. ("ism", B),
  386. ("ist", A),
  387. ("ite", a),
  388. ("ity", A),
  389. ("ium", A),
  390. ("ive", A),
  391. ("ize", F),
  392. ("oid", A),
  393. ("one", R),
  394. ("ous", A)))
  395. m[2] = dict((
  396. ("ae", A),
  397. ("al", b),
  398. ("ar", X),
  399. ("as", B),
  400. ("ed", E),
  401. ("en", F),
  402. ("es", E),
  403. ("ia", A),
  404. ("ic", A),
  405. ("is", A),
  406. ("ly", B),
  407. ("on", S),
  408. ("or", T),
  409. ("um", U),
  410. ("us", V),
  411. ("yl", R),
  412. ("s'", A),
  413. ("'s", A)))
  414. m[1] = dict((
  415. ("a", A),
  416. ("e", A),
  417. ("i", A),
  418. ("o", A),
  419. ("s", W),
  420. ("y", B)))
  421. def remove_ending(word):
  422. length = len(word)
  423. el = 11
  424. while el > 0:
  425. if length - el > 1:
  426. ending = word[length - el:]
  427. cond = m[el].get(ending)
  428. if cond:
  429. base = word[:length - el]
  430. if cond(base):
  431. return base
  432. el -= 1
  433. return word
  434. _endings = (("iev", "ief"),
  435. ("uct", "uc"),
  436. ("iev", "ief"),
  437. ("uct", "uc"),
  438. ("umpt", "um"),
  439. ("rpt", "rb"),
  440. ("urs", "ur"),
  441. ("istr", "ister"),
  442. ("metr", "meter"),
  443. ("olv", "olut"),
  444. ("ul", "l", "aoi"),
  445. ("bex", "bic"),
  446. ("dex", "dic"),
  447. ("pex", "pic"),
  448. ("tex", "tic"),
  449. ("ax", "ac"),
  450. ("ex", "ec"),
  451. ("ix", "ic"),
  452. ("lux", "luc"),
  453. ("uad", "uas"),
  454. ("vad", "vas"),
  455. ("cid", "cis"),
  456. ("lid", "lis"),
  457. ("erid", "eris"),
  458. ("pand", "pans"),
  459. ("end", "ens", "s"),
  460. ("ond", "ons"),
  461. ("lud", "lus"),
  462. ("rud", "rus"),
  463. ("her", "hes", "pt"),
  464. ("mit", "mis"),
  465. ("ent", "ens", "m"),
  466. ("ert", "ers"),
  467. ("et", "es", "n"),
  468. ("yt", "ys"),
  469. ("yz", "ys"))
  470. # Hash the ending rules by the last letter of the target ending
  471. _endingrules = defaultdict(list)
  472. for rule in _endings:
  473. _endingrules[rule[0][-1]].append(rule)
  474. _doubles = frozenset(("dd", "gg", "ll", "mm", "nn", "pp", "rr", "ss", "tt"))
  475. def fix_ending(word):
  476. if word[-2:] in _doubles:
  477. word = word[:-1]
  478. for endingrule in _endingrules[word[-1]]:
  479. target, newend = endingrule[:2]
  480. if word.endswith(target):
  481. if len(endingrule) > 2:
  482. exceptafter = endingrule[2]
  483. c = word[0 - (len(target) + 1)]
  484. if c in exceptafter:
  485. return word
  486. return word[:0 - len(target)] + newend
  487. return word
  488. def stem(word):
  489. """Returns the stemmed version of the argument string.
  490. """
  491. return fix_ending(remove_ending(word))