__init__.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
  1. from __future__ import absolute_import, unicode_literals
  2. import re
  3. import os
  4. import sys
  5. import pickle
  6. from .._compat import *
  7. MIN_FLOAT = -3.14e100
  8. PROB_START_P = "prob_start.p"
  9. PROB_TRANS_P = "prob_trans.p"
  10. PROB_EMIT_P = "prob_emit.p"
  11. PrevStatus = {
  12. 'B': 'ES',
  13. 'M': 'MB',
  14. 'S': 'SE',
  15. 'E': 'BM'
  16. }
  17. Force_Split_Words = set([])
  18. def load_model():
  19. start_p = pickle.load(get_module_res("finalseg", PROB_START_P))
  20. trans_p = pickle.load(get_module_res("finalseg", PROB_TRANS_P))
  21. emit_p = pickle.load(get_module_res("finalseg", PROB_EMIT_P))
  22. return start_p, trans_p, emit_p
  23. if sys.platform.startswith("java"):
  24. start_P, trans_P, emit_P = load_model()
  25. else:
  26. from .prob_start import P as start_P
  27. from .prob_trans import P as trans_P
  28. from .prob_emit import P as emit_P
  29. def viterbi(obs, states, start_p, trans_p, emit_p):
  30. V = [{}] # tabular
  31. path = {}
  32. for y in states: # init
  33. V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
  34. path[y] = [y]
  35. for t in xrange(1, len(obs)):
  36. V.append({})
  37. newpath = {}
  38. for y in states:
  39. em_p = emit_p[y].get(obs[t], MIN_FLOAT)
  40. (prob, state) = max(
  41. [(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
  42. V[t][y] = prob
  43. newpath[y] = path[state] + [y]
  44. path = newpath
  45. (prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')
  46. return (prob, path[state])
  47. def __cut(sentence):
  48. global emit_P
  49. prob, pos_list = viterbi(sentence, 'BMES', start_P, trans_P, emit_P)
  50. begin, nexti = 0, 0
  51. # print pos_list, sentence
  52. for i, char in enumerate(sentence):
  53. pos = pos_list[i]
  54. if pos == 'B':
  55. begin = i
  56. elif pos == 'E':
  57. yield sentence[begin:i + 1]
  58. nexti = i + 1
  59. elif pos == 'S':
  60. yield char
  61. nexti = i + 1
  62. if nexti < len(sentence):
  63. yield sentence[nexti:]
  64. re_han = re.compile("([\u4E00-\u9FD5]+)")
  65. re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")
  66. def add_force_split(word):
  67. global Force_Split_Words
  68. Force_Split_Words.add(word)
  69. def cut(sentence):
  70. sentence = strdecode(sentence)
  71. blocks = re_han.split(sentence)
  72. for blk in blocks:
  73. if re_han.match(blk):
  74. for word in __cut(blk):
  75. if word not in Force_Split_Words:
  76. yield word
  77. else:
  78. for c in word:
  79. yield c
  80. else:
  81. tmp = re_skip.split(blk)
  82. for x in tmp:
  83. if x:
  84. yield x