from __future__ import absolute_import, unicode_literals

__version__ = '0.42.1'
__license__ = 'MIT'

import marshal
import re
import tempfile
import threading
import time
from hashlib import md5
from math import log

from . import finalseg
from ._compat import *

if os.name == 'nt':
    from shutil import move as _replace_file
else:
    _replace_file = os.rename

_get_abs_path = lambda path: os.path.normpath(os.path.join(os.getcwd(), path))

DEFAULT_DICT = None
DEFAULT_DICT_NAME = "dict.txt"

log_console = logging.StreamHandler(sys.stderr)
default_logger = logging.getLogger(__name__)
default_logger.setLevel(logging.DEBUG)
default_logger.addHandler(log_console)

DICT_WRITING = {}

pool = None

re_userdict = re.compile('^(.+?)( [0-9]+)?( [a-z]+)?$', re.U)

re_eng = re.compile('[a-zA-Z0-9]', re.U)

# \u4E00-\u9FD5a-zA-Z0-9+#&\._ : All non-space characters. Will be handled with re_han
# \r\n|\s : whitespace characters. Will not be handled.
# re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%]+)", re.U)
# Adding "-" symbol in re_han_default
re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)

re_skip_default = re.compile("(\r\n|\s)", re.U)
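
# Illustration: re_han_default keeps runs of Han characters, ASCII letters/digits
# and a few symbols (+#&._%-) together as one block; everything else is left as a
# separator block for re_skip_default. Roughly:
#
#     re_han_default.split('我爱Python, 你呢?')
#     # -> ['', '我爱Python', ', ', '你呢', '?']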


def setLogLevel(log_level):
    default_logger.setLevel(log_level)


class Tokenizer(object):

    def __init__(self, dictionary=DEFAULT_DICT):
        self.lock = threading.RLock()
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        self.FREQ = {}
        self.total = 0
        self.user_word_tag_tab = {}
        self.initialized = False
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        return '<Tokenizer dictionary=%r>' % self.dictionary

    @staticmethod
    def gen_pfdict(f):
        lfreq = {}
        ltotal = 0
        f_name = resolve_filename(f)
        for lineno, line in enumerate(f, 1):
            try:
                line = line.strip().decode('utf-8')
                word, freq = line.split(' ')[:2]
                freq = int(freq)
                lfreq[word] = freq
                ltotal += freq
                for ch in xrange(len(word)):
                    wfrag = word[:ch + 1]
                    if wfrag not in lfreq:
                        lfreq[wfrag] = 0
            except ValueError:
                raise ValueError(
                    'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        f.close()
        return lfreq, ltotal
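
    # Illustration of the prefix dict built above: a (hypothetical) dictionary
    # line "北京大学 2053 nt" contributes FREQ['北京大学'] = 2053 and also makes
    # every prefix exist as a key, with value 0 if it is not a word itself, so
    # FREQ['北'], FREQ['北京'] and FREQ['北京大'] are created (or kept). This lets
    # get_DAG() stop scanning as soon as a fragment is not even a prefix.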

    def initialize(self, dictionary=None):
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            if os.path.isfile(cache_file) and (abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    load_from_cache_fail = True

            if load_from_cache_fail:
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")

                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")

    def check_initialized(self):
        if not self.initialized:
            self.initialize()

    def calc(self, sentence, DAG, route):
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total)
        for idx in xrange(N - 1, -1, -1):
            route[idx] = max((log(self.FREQ.get(sentence[idx:x + 1]) or 1) -
                              logtotal + route[x + 1][0], x) for x in DAG[idx])
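
    # calc() is a dynamic program over the DAG, computed right to left:
    # route[idx] = (best_log_prob, x), where x is the end index of the first
    # word of the best segmentation of sentence[idx:], and best_log_prob is
    # log(freq(word) / total) + route[x + 1][0]. Fragments with no recorded
    # frequency fall back to 1 via `or 1`, so log() never sees zero.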

    def get_DAG(self, sentence):
        self.check_initialized()
        DAG = {}
        N = len(sentence)
        for k in xrange(N):
            tmplist = []
            i = k
            frag = sentence[k]
            while i < N and frag in self.FREQ:
                if self.FREQ[frag]:
                    tmplist.append(i)
                i += 1
                frag = sentence[k:i + 1]
            if not tmplist:
                tmplist.append(k)
            DAG[k] = tmplist
        return DAG
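
    # For illustration (assuming 北, 北京, 北京大学, 京, 大, 大学 and 学 are all
    # dictionary entries with non-zero frequency), get_DAG('北京大学') would be
    # roughly {0: [0, 1, 3], 1: [1], 2: [2, 3], 3: [3]}: each key k maps to the
    # end indices of every dictionary word starting at position k.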

    def __cut_all(self, sentence):
        dag = self.get_DAG(sentence)
        old_j = -1
        eng_scan = 0
        eng_buf = u''
        for k, L in iteritems(dag):
            if eng_scan == 1 and not re_eng.match(sentence[k]):
                eng_scan = 0
                yield eng_buf
            if len(L) == 1 and k > old_j:
                word = sentence[k:L[0] + 1]
                if re_eng.match(word):
                    if eng_scan == 0:
                        eng_scan = 1
                        eng_buf = word
                    else:
                        eng_buf += word
                if eng_scan == 0:
                    yield word
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j
        if eng_scan == 1:
            yield eng_buf

    def __cut_DAG_NO_HMM(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''

    def __cut_DAG(self, sentence):
        DAG = self.get_DAG(sentence)
        route = {}
        self.calc(sentence, DAG, route)
        x = 0
        buf = ''
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if y - x == 1:
                buf += l_word
            else:
                if buf:
                    if len(buf) == 1:
                        yield buf
                        buf = ''
                    else:
                        if not self.FREQ.get(buf):
                            recognized = finalseg.cut(buf)
                            for t in recognized:
                                yield t
                        else:
                            for elem in buf:
                                yield elem
                        buf = ''
                yield l_word
            x = y

        if buf:
            if len(buf) == 1:
                yield buf
            elif not self.FREQ.get(buf):
                recognized = finalseg.cut(buf)
                for t in recognized:
                    yield t
            else:
                for elem in buf:
                    yield elem
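
    # In accurate mode with HMM=True, runs of single characters that the
    # dictionary cannot explain are buffered and handed to finalseg.cut(),
    # a character-level HMM (Viterbi) that guesses new words. For example,
    # in "他来到了网易杭研大厦" the token 杭研 is typically not a dictionary
    # entry but can still come out as one word this way (actual output
    # depends on the dictionary in use).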

    def cut(self, sentence, cut_all=False, HMM=True, use_paddle=False):
        """
        The main function that segments an entire sentence containing
        Chinese characters into separate words.

        Parameter:
            - sentence: The str(unicode) to be segmented.
            - cut_all: Model type. True for full pattern, False for accurate pattern.
            - HMM: Whether to use the Hidden Markov Model.
            - use_paddle: Whether to use the paddle mode of segmentation
              (only takes effect when paddlepaddle is installed).
        """
        is_paddle_installed = check_paddle_install['is_paddle_installed']
        sentence = strdecode(sentence)
        if use_paddle and is_paddle_installed:
            # an empty sentence would raise a core exception inside paddle
            if sentence is None or len(sentence) == 0:
                return
            import jieba.lac_small.predict as predict
            results = predict.get_sent(sentence)
            for sent in results:
                if sent is None:
                    continue
                yield sent
            return
        re_han = re_han_default
        re_skip = re_skip_default
        if cut_all:
            cut_block = self.__cut_all
        elif HMM:
            cut_block = self.__cut_DAG
        else:
            cut_block = self.__cut_DAG_NO_HMM
        blocks = re_han.split(sentence)
        for blk in blocks:
            if not blk:
                continue
            if re_han.match(blk):
                for word in cut_block(blk):
                    yield word
            else:
                tmp = re_skip.split(blk)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x

    def cut_for_search(self, sentence, HMM=True):
        """
        Finer segmentation for search engines.
        """
        words = self.cut(sentence, HMM=HMM)
        for w in words:
            if len(w) > 2:
                for i in xrange(len(w) - 1):
                    gram2 = w[i:i + 2]
                    if self.FREQ.get(gram2):
                        yield gram2
            if len(w) > 3:
                for i in xrange(len(w) - 2):
                    gram3 = w[i:i + 3]
                    if self.FREQ.get(gram3):
                        yield gram3
            yield w
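
    # Search mode first segments normally, then additionally yields any 2-gram
    # and 3-gram substrings of long words that are themselves dictionary words,
    # before the full word. A rough sketch:
    #
    #     ", ".join(jieba.cut_for_search("小明硕士毕业于中国科学院计算所"))
    #     # yields pieces such as 中国, 科学, 学院, 科学院, 中国科学院, ...
    #     # (exact output depends on the dictionary)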

    def lcut(self, *args, **kwargs):
        return list(self.cut(*args, **kwargs))

    def lcut_for_search(self, *args, **kwargs):
        return list(self.cut_for_search(*args, **kwargs))

    _lcut = lcut
    _lcut_for_search = lcut_for_search

    def _lcut_no_hmm(self, sentence):
        return self.lcut(sentence, False, False)

    def _lcut_all(self, sentence):
        return self.lcut(sentence, True)

    def _lcut_for_search_no_hmm(self, sentence):
        return self.lcut_for_search(sentence, False)

    def get_dict_file(self):
        if self.dictionary == DEFAULT_DICT:
            return get_module_res(DEFAULT_DICT_NAME)
        else:
            return open(self.dictionary, 'rb')

    def load_userdict(self, f):
        '''
        Load a personalized dictionary to improve detection of user-specific words.

        Parameter:
            - f : A plain text file containing words and their occurrences.
                  Can be a file-like object, or the path of the dictionary file,
                  whose encoding must be utf-8.

        Structure of dict file:
        word1 freq1 word_type1
        word2 freq2 word_type2
        ...
        Word type may be omitted
        '''
        self.check_initialized()
        if isinstance(f, string_types):
            f_name = f
            f = open(f, 'rb')
        else:
            f_name = resolve_filename(f)
        for lineno, ln in enumerate(f, 1):
            line = ln.strip()
            if not isinstance(line, text_type):
                try:
                    line = line.decode('utf-8').lstrip('\ufeff')
                except UnicodeDecodeError:
                    raise ValueError('dictionary file %s must be utf-8' % f_name)
            if not line:
                continue
            # match won't be None because there's at least one character
            word, freq, tag = re_userdict.match(line).groups()
            if freq is not None:
                freq = freq.strip()
            if tag is not None:
                tag = tag.strip()
            self.add_word(word, freq, tag)
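
    # A minimal usage sketch (file name and contents are hypothetical): a
    # userdict.txt containing lines such as
    #
    #     云计算 5
    #     李小福 2 nr
    #     创新办 3 i
    #
    # can be loaded with jieba.load_userdict('userdict.txt'); the frequency
    # and part-of-speech tag are both optional.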

    def add_word(self, word, freq=None, tag=None):
        """
        Add a word to the dictionary.

        freq and tag can be omitted; freq defaults to a calculated value
        that ensures the word can be cut out.
        """
        self.check_initialized()
        word = strdecode(word)
        freq = int(freq) if freq is not None else self.suggest_freq(word, False)
        self.FREQ[word] = freq
        self.total += freq
        if tag:
            self.user_word_tag_tab[word] = tag
        for ch in xrange(len(word)):
            wfrag = word[:ch + 1]
            if wfrag not in self.FREQ:
                self.FREQ[wfrag] = 0
        if freq == 0:
            finalseg.add_force_split(word)
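
    # Usage sketch (example words are illustrations only):
    #
    #     jieba.add_word('石墨烯')                      # frequency is suggested
    #     jieba.add_word('凱特琳', freq=42, tag='nz')   # explicit freq and tag
    #     jieba.del_word('自定义词')                    # remove / force-split a word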

    def del_word(self, word):
        """
        Convenience function for deleting a word.
        """
        self.add_word(word, 0)

    def suggest_freq(self, segment, tune=False):
        """
        Suggest word frequency to force the characters in a word to be
        joined or split.

        Parameter:
            - segment : The segments that the word is expected to be cut into.
                        If the word should be treated as a whole, use a str.
            - tune : If True, tune the word frequency.

        Note that HMM may affect the final result. If the result doesn't change,
        set HMM=False.
        """
        self.check_initialized()
        ftotal = float(self.total)
        freq = 1
        if isinstance(segment, string_types):
            word = segment
            for seg in self.cut(word, HMM=False):
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = max(int(freq * self.total) + 1, self.FREQ.get(word, 1))
        else:
            segment = tuple(map(strdecode, segment))
            word = ''.join(segment)
            for seg in segment:
                freq *= self.FREQ.get(seg, 1) / ftotal
            freq = min(int(freq * self.total), self.FREQ.get(word, 0))
        if tune:
            self.add_word(word, freq)
        return freq
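
    # Usage sketch (sentences and words are illustrative):
    #
    #     # force "中将" to be split into 中 / 将 in "如果放到post中将出错":
    #     jieba.suggest_freq(('中', '将'), True)
    #     # force "台中" to be kept as one word:
    #     jieba.suggest_freq('台中', True)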

    def tokenize(self, unicode_sentence, mode="default", HMM=True):
        """
        Tokenize a sentence and yield tuples of (word, start, end).

        Parameter:
            - unicode_sentence: the str(unicode) to be segmented.
            - mode: "default" or "search". "search" is for finer segmentation.
            - HMM: whether to use the Hidden Markov Model.
        """
        if not isinstance(unicode_sentence, text_type):
            raise ValueError("jieba: the input parameter should be unicode.")
        start = 0
        if mode == 'default':
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                yield (w, start, start + width)
                start += width
        else:
            for w in self.cut(unicode_sentence, HMM=HMM):
                width = len(w)
                if len(w) > 2:
                    for i in xrange(len(w) - 1):
                        gram2 = w[i:i + 2]
                        if self.FREQ.get(gram2):
                            yield (gram2, start + i, start + i + 2)
                if len(w) > 3:
                    for i in xrange(len(w) - 2):
                        gram3 = w[i:i + 3]
                        if self.FREQ.get(gram3):
                            yield (gram3, start + i, start + i + 3)
                yield (w, start, start + width)
                start += width
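
    # Usage sketch (illustrative sentence; exact spans depend on the dictionary):
    #
    #     for tk in jieba.tokenize('永和服装饰品有限公司'):
    #         print("word %s\t start: %d\t end: %d" % (tk[0], tk[1], tk[2]))
    #     # e.g. word 永和  start: 0  end: 2
    #     #      word 服装  start: 2  end: 4
    #     #      ...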

    def set_dictionary(self, dictionary_path):
        with self.lock:
            abs_path = _get_abs_path(dictionary_path)
            if not os.path.isfile(abs_path):
                raise Exception("jieba: file does not exist: " + abs_path)
            self.dictionary = abs_path
            self.initialized = False
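
# The main dictionary can be swapped at runtime; the path below is only an
# example (e.g. the bigger dictionary distributed separately by the project):
#
#     jieba.set_dictionary('data/dict.txt.big')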

# default Tokenizer instance
dt = Tokenizer()

# global functions
get_FREQ = lambda k, d=None: dt.FREQ.get(k, d)
add_word = dt.add_word
calc = dt.calc
cut = dt.cut
lcut = dt.lcut
cut_for_search = dt.cut_for_search
lcut_for_search = dt.lcut_for_search
del_word = dt.del_word
get_DAG = dt.get_DAG
get_dict_file = dt.get_dict_file
initialize = dt.initialize
load_userdict = dt.load_userdict
set_dictionary = dt.set_dictionary
suggest_freq = dt.suggest_freq
tokenize = dt.tokenize
user_word_tag_tab = dt.user_word_tag_tab


def _lcut_all(s):
    return dt._lcut_all(s)


def _lcut(s):
    return dt._lcut(s)


def _lcut_no_hmm(s):
    return dt._lcut_no_hmm(s)


def _lcut_for_search(s):
    return dt._lcut_for_search(s)


def _lcut_for_search_no_hmm(s):
    return dt._lcut_for_search_no_hmm(s)


def _pcut(sentence, cut_all=False, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if cut_all:
        result = pool.map(_lcut_all, parts)
    elif HMM:
        result = pool.map(_lcut, parts)
    else:
        result = pool.map(_lcut_no_hmm, parts)
    for r in result:
        for w in r:
            yield w


def _pcut_for_search(sentence, HMM=True):
    parts = strdecode(sentence).splitlines(True)
    if HMM:
        result = pool.map(_lcut_for_search, parts)
    else:
        result = pool.map(_lcut_for_search_no_hmm, parts)
    for r in result:
        for w in r:
            yield w


def enable_parallel(processnum=None):
    """
    Change the module's `cut` and `cut_for_search` functions to the
    parallel version.

    Note that this only works with the default Tokenizer `dt`; custom
    Tokenizer instances are not supported.
    """
    global pool, dt, cut, cut_for_search
    from multiprocessing import cpu_count
    if os.name == 'nt':
        raise NotImplementedError(
            "jieba: parallel mode only supports posix system")
    else:
        from multiprocessing import Pool
    dt.check_initialized()
    if processnum is None:
        processnum = cpu_count()
    pool = Pool(processnum)
    cut = _pcut
    cut_for_search = _pcut_for_search
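
# Usage sketch: enable multiprocessing on POSIX systems, optionally choosing
# the number of worker processes, and switch back when done.
#
#     jieba.enable_parallel(4)   # or jieba.enable_parallel() for cpu_count()
#     jieba.disable_parallel()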


def disable_parallel():
    global pool, dt, cut, cut_for_search
    if pool:
        pool.close()
        pool = None
    cut = dt.cut
    cut_for_search = dt.cut_for_search