tfidf.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. # encoding=utf-8
  2. from __future__ import absolute_import
  3. import os
  4. import jieba
  5. import jieba.posseg
  6. from operator import itemgetter
  7. _get_module_path = lambda path: os.path.normpath(os.path.join(os.getcwd(),
  8. os.path.dirname(__file__), path))
  9. _get_abs_path = jieba._get_abs_path
  10. DEFAULT_IDF = _get_module_path("idf.txt")
  11. class KeywordExtractor(object):
  12. STOP_WORDS = set((
  13. "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are",
  14. "by", "be", "as", "on", "with", "can", "if", "from", "which", "you", "it",
  15. "this", "then", "at", "have", "all", "not", "one", "has", "or", "that"
  16. ))
  17. def set_stop_words(self, stop_words_path):
  18. abs_path = _get_abs_path(stop_words_path)
  19. if not os.path.isfile(abs_path):
  20. raise Exception("jieba: file does not exist: " + abs_path)
  21. content = open(abs_path, 'rb').read().decode('utf-8')
  22. for line in content.splitlines():
  23. self.stop_words.add(line)
  24. def extract_tags(self, *args, **kwargs):
  25. raise NotImplementedError
  26. class IDFLoader(object):
  27. def __init__(self, idf_path=None):
  28. self.path = ""
  29. self.idf_freq = {}
  30. self.median_idf = 0.0
  31. if idf_path:
  32. self.set_new_path(idf_path)
  33. def set_new_path(self, new_idf_path):
  34. if self.path != new_idf_path:
  35. self.path = new_idf_path
  36. content = open(new_idf_path, 'rb').read().decode('utf-8')
  37. self.idf_freq = {}
  38. for line in content.splitlines():
  39. word, freq = line.strip().split(' ')
  40. self.idf_freq[word] = float(freq)
  41. self.median_idf = sorted(
  42. self.idf_freq.values())[len(self.idf_freq) // 2]
  43. def get_idf(self):
  44. return self.idf_freq, self.median_idf
  45. class TFIDF(KeywordExtractor):
  46. def __init__(self, idf_path=None):
  47. self.tokenizer = jieba.dt
  48. self.postokenizer = jieba.posseg.dt
  49. self.stop_words = self.STOP_WORDS.copy()
  50. self.idf_loader = IDFLoader(idf_path or DEFAULT_IDF)
  51. self.idf_freq, self.median_idf = self.idf_loader.get_idf()
  52. def set_idf_path(self, idf_path):
  53. new_abs_path = _get_abs_path(idf_path)
  54. if not os.path.isfile(new_abs_path):
  55. raise Exception("jieba: file does not exist: " + new_abs_path)
  56. self.idf_loader.set_new_path(new_abs_path)
  57. self.idf_freq, self.median_idf = self.idf_loader.get_idf()
  58. def extract_tags(self, sentence, topK=20, withWeight=False, allowPOS=(), withFlag=False):
  59. """
  60. Extract keywords from sentence using TF-IDF algorithm.
  61. Parameter:
  62. - topK: return how many top keywords. `None` for all possible words.
  63. - withWeight: if True, return a list of (word, weight);
  64. if False, return a list of words.
  65. - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v','nr'].
  66. if the POS of w is not in this list,it will be filtered.
  67. - withFlag: only work with allowPOS is not empty.
  68. if True, return a list of pair(word, weight) like posseg.cut
  69. if False, return a list of words
  70. """
  71. if allowPOS:
  72. allowPOS = frozenset(allowPOS)
  73. words = self.postokenizer.cut(sentence)
  74. else:
  75. words = self.tokenizer.cut(sentence)
  76. freq = {}
  77. for w in words:
  78. if allowPOS:
  79. if w.flag not in allowPOS:
  80. continue
  81. elif not withFlag:
  82. w = w.word
  83. wc = w.word if allowPOS and withFlag else w
  84. if len(wc.strip()) < 2 or wc.lower() in self.stop_words:
  85. continue
  86. freq[w] = freq.get(w, 0.0) + 1.0
  87. total = sum(freq.values())
  88. for k in freq:
  89. kw = k.word if allowPOS and withFlag else k
  90. freq[k] *= self.idf_freq.get(kw, self.median_idf) / total
  91. if withWeight:
  92. tags = sorted(freq.items(), key=itemgetter(1), reverse=True)
  93. else:
  94. tags = sorted(freq, key=freq.__getitem__, reverse=True)
  95. if topK:
  96. return tags[:topK]
  97. else:
  98. return tags