__init__.py

from __future__ import absolute_import, unicode_literals

__version__ = '0.42.1'
__license__ = 'MIT'

import marshal
import re
import tempfile
import threading
import time
from hashlib import md5
from math import log

from . import finalseg
from ._compat import *
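
# NOTE: the star import above is what makes os, sys and logging (plus the
# Py2/Py3 helpers used below, such as xrange, iteritems, strdecode,
# resolve_filename, string_types, text_type, get_module_res and
# check_paddle_install) available in this module without explicit imports.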

if os.name == 'nt':
    from shutil import move as _replace_file
else:
    _replace_file = os.rename

_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))

DEFAULT_DICT = None
DEFAULT_DICT_NAME = "dict.txt"

log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

DICT_WRITING = {}

pool = None

re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

re_eng = re.compile('[a-zA-Z0-9]', re.U)

# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
# re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
# Adding "-" symbol in re_han_default
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)

re_skip_default = re.compile("(\r\n|\s)", re.U)


def setLogLevel(log_level):
    default_logger.setLevel(log_level)


class Tokenizer(object):

    def __init__(self, dictionary=DEFAULT_DICT):
        self.lock = threading.RLock()
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        self.FREQ = {}
        self.total = 0
        self.user_word_tag_tab = {}
        self.initialized = False
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        return '<Tokenizer dictionary=%r>' % self.dictionary
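
    # gen_pfdict builds the prefix dictionary from a "word freq [tag]" file:
    # each word keeps its real frequency, and every proper prefix of a word is
    # inserted with frequency 0 so that get_DAG can stop scanning as soon as a
    # fragment is not a known prefix.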
    @staticmethod
    def gen_pfdict(f):
        lfreq = {}
        ltotal = 0
        f_name = resolve_filename(f)
        for lineno, line in enumerate(f, 1):
            try:
                line = line.strip().decode('utf-8')
                word, freq = line.split(' ')[:2]
                freq = int(freq)
                lfreq[word] = freq
                ltotal += freq
                for ch in xrange(len(word)):
                    wfrag = word[:ch + 1]
                    if wfrag not in lfreq:
                        lfreq[wfrag] = 0
            except ValueError:
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        f.close()
        return lfreq, ltotal
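
    # initialize loads the prefix dictionary, preferring a marshal-serialized
    # cache (jieba.cache for the default dict, an md5-keyed file for a custom
    # dict) in tmp_dir or tempfile.gettempdir(); the cache is rebuilt when it
    # is missing or older than the dictionary file, under a per-dictionary
    # write lock.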
    def initialize(self, dictionary=None):
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    load_from_cache_fail = True

            if load_from_cache_fail:
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")

                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")

    def check_initialized(self):
        if not self.initialized:
            self.initialize()
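
    # calc runs the dynamic-programming step over the DAG from right to left:
    # route[idx] = (best log-probability of sentence[idx:], end index of the
    # word chosen at idx), using log frequencies normalized by log(total).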
    def calc(self, sentence, DAG, route):
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total)
        for idx in xrange(N - 1, -1, -1):
            route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
                              logtotal + route[x + 1][0], x) for x in DAG[idx])
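
    # get_DAG returns {k: [i, ...]} where each i marks that sentence[k:i + 1]
    # is a dictionary word; positions with no match fall back to the single
    # character at k.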
    def get_DAG(self, sentence):
        self.check_initialized()
        DAG = {}
        N = len(sentence)
        for k in xrange(N):
            tmplist = []
            i = k
            frag = sentence[k]
            while i < N and frag in self.FREQ:
                if self.FREQ[frag]:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i + 1]
            if not tmplist:
                tmplist.append(k)
            DAG[k] = tmplist
        return DAG
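
    # Full mode: emit every word the DAG admits; consecutive ASCII
    # letters/digits are buffered in eng_buf and yielded as a single token.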
    def __cut_all(self, sentence):
        dag = self.get_DAG(sentence)
        old_j = -1
        eng_scan = 0
        eng_buf = u''
        for k, L in iteritems(dag):
            if eng_scan == 1 and not re_eng.match(sentence[k]):
                eng_scan = 0
                yield eng_buf
            if len(L) == 1 and k > old_j:
                word = sentence[k:L[0] + 1]
                if re_eng.match(word):
                    if eng_scan == 0:
                        eng_scan = 1
                        eng_buf = word
                    else:
                        eng_buf += word
                if eng_scan == 0:
                    yield word
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j
        if eng_scan == 1:
            yield eng_buf
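
    # Accurate mode without HMM: follow the best route computed by calc and
    # merge runs of single ASCII characters into one token.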
    def __cut_DAG_NO_HMM(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
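
    # Accurate mode with HMM: single characters along the best route are
    # buffered; if the buffer is not itself a dictionary word it is handed to
    # finalseg.cut (the HMM recognizer) to discover unseen words.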
    def __cut_DAG(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        buf = ''
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if y - x == 1:
                buf += l_word
            else:
                if buf:
                    if len(buf) == 1:
                        yield buf
                        buf = ''
                    else:
                        if not self.FREQ.get(buf):
                            recognized = finalseg.cut(buf)
                            for t in recognized:
                                yield t
                        else:
                            for elem in buf:
                                yield elem
                        buf = ''
                yield l_word
            x = y

        if buf:
            if len(buf) == 1:
                yield buf
            elif not self.FREQ.get(buf):
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem

    def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False):
        """
        The main function that segments an entire sentence that contains
        Chinese characters into separate words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
            - use_paddle: Whether to use paddle mode (only effective when
              paddlepaddle is installed).
        """
        is_paddle_installed = check_paddle_install['is_paddle_installed']
        sentence = strdecode(sentence)
        if use_paddle and is_paddle_installed:
            # if sentence is null, it will raise core exception in paddle.
            if sentence is None or len(sentence) == 0:
                return
            import jieba.lac_small.predict as predict
            results = predict.get_sent(sentence)
            for sent in results:
                if sent is None:
                    continue
                yield sent
            return
        re_han = re_han_default
        re_skip = re_skip_default
        if cut_all:
            cut_block = self.__cut_all
        elif HMM:
            cut_block = self.__cut_DAG
        else:
            cut_block = self.__cut_DAG_NO_HMM
        blocks = re_han.split(sentence)
        for blk in blocks:
            if not blk:
                continue
            if re_han.match(blk):
                for word in cut_block(blk):
                    yield word
            else:
                tmp = re_skip.split(blk)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x
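
    # Search-engine mode: on top of the accurate-mode result, also emit the
    # 2-gram and 3-gram substrings of longer words when they are themselves
    # dictionary words.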
    def cut_for_search(self, sentence, HMM=True):
        """
        Finer segmentation for search engines.
        """
        words = self.cut(sentence, HMM=HMM)
        for w in words:
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if self.FREQ.get(gram2):
                        yield gram2
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if self.FREQ.get(gram3):
                        yield gram3
            yield w

    def lcut(self, *args, **kwargs):
        return list(self.cut(*args, **kwargs))

    def lcut_for_search(self, *args, **kwargs):
        return list(self.cut_for_search(*args, **kwargs))

    _lcut = lcut
    _lcut_for_search = lcut_for_search

    def _lcut_no_hmm(self, sentence):
        return self.lcut(sentence, False, False)

    def _lcut_all(self, sentence):
        return self.lcut(sentence, True)

    def _lcut_for_search_no_hmm(self, sentence):
        return self.lcut_for_search(sentence, False)

    def get_dict_file(self):
        if self.dictionary == DEFAULT_DICT:
            return get_module_res(DEFAULT_DICT_NAME)
        else:
            return open(self.dictionary, 'rb')

    def load_userdict(self, f):
        '''
        Load a personalized dict to improve the detection rate.

        Parameter:
            - f : A plain text file that contains words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        Word type may be ignored
        '''
        self.check_initialized()
        if isinstance(f, string_types):
            f_name = f
            f = open(f, 'rb')
        else:
            f_name = resolve_filename(f)
        for lineno, ln in enumerate(f, 1):
            line = ln.strip()
            if not isinstance(line, text_type):
                try:
                    line = line.decode('utf-8').lstrip('\ufeff')
                except UnicodeDecodeError:
                    raise ValueError('dictionary file %s must be utf-8' % f_name)
            if not line:
                continue
            # match won't be None because there's at least one character
            word, freq, tag = re_userdict.match(line).groups()
            if freq is not None:
                freq = freq.strip()
            if tag is not None:
                tag = tag.strip()
            self.add_word(word, freq, tag)

    def add_word(self, word, freq=None, tag=None):
        """
        Add a word to the dictionary.

        freq and tag can be omitted; freq defaults to a calculated value
        that ensures the word can be cut out.
        """
        self.check_initialized()
        word = strdecode(word)
        freq = int(freq) if freq is not None else self.suggest_freq(word, False)
        self.FREQ[word] = freq
        self.total += freq
        if tag:
            self.user_word_tag_tab[word] = tag
        for ch in xrange(len(word)):
            wfrag = word[:ch + 1]
            if wfrag not in self.FREQ:
                self.FREQ[wfrag] = 0
        if freq == 0:
            finalseg.add_force_split(word)

    def del_word(self, word):
        """
        Convenient function for deleting a word.
        """
        self.add_word(word, 0)
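
    # suggest_freq works off the unigram model: to join a word it suggests
    # total * prod(P(seg)) + 1 (at least the word's current frequency); to
    # split it suggests at most total * prod(P(seg)) and at most the word's
    # current frequency.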
    def suggest_freq(self, segment, tune=False):
        """
        Suggest word frequency to force the characters in a word to be
        joined or split.

        Parameter:
            - segment : The segments that the word is expected to be cut into.
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Note that HMM may affect the final result. If the result doesn't change,
        set HMM=False.
        """
        self.check_initialized()
        ftotal = float(self.total)
        freq = 1
        if isinstance(segment, string_types):
            word = segment
            for seg in self.cut(word, HMM=False):
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
        else:
            segment = tuple(map(strdecode, segment))
            word = ''.join(segment)
            for seg in segment:
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = min(int(freq * self.total), self.FREQ.get(word, 0))
        if tune:
            self.add_word(word, freq)
        return freq

    def tokenize(self, unicode_sentence, mode="default", HMM=True):
        """
        Tokenize a sentence and yield tuples of (word, start, end).

        Parameter:
            - unicode_sentence: the str(unicode) to be segmented.
            - mode: "default" or "search", "search" is for finer segmentation.
            - HMM: whether to use the Hidden Markov Model.
        """
        if not isinstance(unicode_sentence, text_type):
            raise ValueError("jieba: the input parameter should be unicode.")
        start = 0
        if mode == 'default':
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                yield (w, start, start + width)
                start += width
        else:
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                if len(w) > 2:
                    for i in xrange(len(w) - 1):
                        gram2 = w[i:i + 2]
                        if self.FREQ.get(gram2):
                            yield (gram2, start + i, start + i + 2)
                if len(w) > 3:
                    for i in xrange(len(w) - 2):
                        gram3 = w[i:i + 3]
                        if self.FREQ.get(gram3):
                            yield (gram3, start + i, start + i + 3)
                yield (w, start, start + width)
                start += width

    def set_dictionary(self, dictionary_path):
        with self.lock:
            abs_path = _get_abs_path(dictionary_path)
            if not os.path.isfile(abs_path):
                raise Exception("jieba: file does not exist: " + abs_path)
            self.dictionary = abs_path
            self.initialized = False


# default Tokenizer instance
dt = Tokenizer()

# global functions
get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
add_word = dt.add_word
calc = dt.calc
cut = dt.cut
lcut = dt.lcut
cut_for_search = dt.cut_for_search
lcut_for_search = dt.lcut_for_search
del_word = dt.del_word
get_DAG = dt.get_DAG
get_dict_file = dt.get_dict_file
initialize = dt.initialize
load_userdict = dt.load_userdict
set_dictionary = dt.set_dictionary
suggest_freq = dt.suggest_freq
tokenize = dt.tokenize
user_word_tag_tab = dt.user_word_tag_tab
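
# Illustrative usage of the module-level API above (the prefix dictionary is
# loaded lazily on the first call); the sample sentence is only an example:
#   import jieba
#   jieba.lcut("我来到北京清华大学")                 # accurate mode -> list of words
#   jieba.lcut("我来到北京清华大学", cut_all=True)   # full mode
#   list(jieba.tokenize(u"我来到北京清华大学"))      # (word, start, end) tuples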

def _lcut_all(s):
    return dt._lcut_all(s)


def _lcut(s):
    return dt._lcut(s)


def _lcut_no_hmm(s):
    return dt._lcut_no_hmm(s)


def _lcut_for_search(s):
    return dt._lcut_for_search(s)


def _lcut_for_search_no_hmm(s):
    return dt._lcut_for_search_no_hmm(s)
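
# Parallel helpers: split the input into lines and map the single-process
# list-returning wrappers above across the worker pool created by
# enable_parallel().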
def _pcut(sentence, cut_all=False, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if cut_all:
        result = pool.map(_lcut_all, parts)
    elif HMM:
        result = pool.map(_lcut, parts)
    else:
        result = pool.map(_lcut_no_hmm, parts)
    for r in result:
        for w in r:
            yield w


def _pcut_for_search(sentence, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if HMM:
        result = pool.map(_lcut_for_search, parts)
    else:
        result = pool.map(_lcut_for_search_no_hmm, parts)
    for r in result:
        for w in r:
            yield w


def enable_parallel(processnum=None):
    """
    Change the module's `cut` and `cut_for_search` functions to the
    parallel version.

    Note that this only works with the default Tokenizer `dt`; custom
    Tokenizer instances are not supported.
    """
    global pool, dt, cut, cut_for_search
    from multiprocessing import cpu_count
    if os.name == 'nt':
        raise NotImplementedError(
            "jieba: parallel mode only supports posix system")
    else:
        from multiprocessing import Pool
    dt.check_initialized()
    if processnum is None:
        processnum = cpu_count()
    pool = Pool(processnum)
    cut = _pcut
    cut_for_search = _pcut_for_search


def disable_parallel():
    global pool, dt, cut, cut_for_search
    if pool:
        pool.close()
        pool = None
    cut = dt.cut
    cut_for_search = dt.cut_for_search