123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import sys
from operator import itemgetter
from collections import defaultdict
import jieba.posseg
from .tfidf import KeywordExtractor
from .._compat import *
- class UndirectWeightedGraph:
- d = 0.85
- def __init__(self):
- self.graph = defaultdict(list)
- def addEdge(self, start, end, weight):
- # use a tuple (start, end, weight) instead of a Edge object
- self.graph[start].append((start, end, weight))
- self.graph[end].append((end, start, weight))
- def rank(self):
- ws = defaultdict(float)
- outSum = defaultdict(float)
- wsdef = 1.0 / (len(self.graph) or 1.0)
- for n, out in self.graph.items():
- ws[n] = wsdef
- outSum[n] = sum((e[2] for e in out), 0.0)
- # this line for build stable iteration
- sorted_keys = sorted(self.graph.keys())
- for x in xrange(10): # 10 iters
- for n in sorted_keys:
- s = 0
- for e in self.graph[n]:
- s += e[2] / outSum[e[1]] * ws[e[1]]
- ws[n] = (1 - self.d) + self.d * s
- (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
- for w in itervalues(ws):
- if w < min_rank:
- min_rank = w
- if w > max_rank:
- max_rank = w
- for n, w in ws.items():
- # to unify the weights, don't *100.
- ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
- return ws
class TextRank(KeywordExtractor):
    """Keyword extractor implementing the TextRank algorithm on top of
    jieba's POS-tagging tokenizer."""

    def __init__(self):
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
        self.span = 5

    def pairfilter(self, wp):
        # Accept a (word, flag) pair only when its POS tag is allowed,
        # the stripped word is at least two characters long, and the
        # lower-cased word is not a stop word.
        if wp.flag not in self.pos_filt:
            return False
        if len(wp.word.strip()) < 2:
            return False
        return wp.word.lower() not in self.stop_words

    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        graph = UndirectWeightedGraph()
        cooccur = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        total = len(words)
        # Count co-occurrences of filtered pairs inside a sliding window
        # of `self.span` tokens.
        for i, wp in enumerate(words):
            if not self.pairfilter(wp):
                continue
            for j in xrange(i + 1, min(i + self.span, total)):
                other = words[j]
                if not self.pairfilter(other):
                    continue
                if allowPOS and withFlag:
                    key = (wp, other)
                else:
                    key = (wp.word, other.word)
                cooccur[key] += 1
        for (left, right), weight in cooccur.items():
            graph.addEdge(left, right, weight)
        nodes_rank = graph.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
        return tags[:topK] if topK else tags

    extract_tags = textrank
|