# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
  27. """Classes and functions for classifying and extracting information from
  28. documents.
  29. """
  30. from __future__ import division
  31. import random
  32. from collections import defaultdict
  33. from math import log
  34. from whoosh.compat import xrange, iteritems
# Expansion models
#
# These models score candidate expansion terms by comparing a term's weight in
# the top N result documents to its weight in the collection as a whole.

class ExpansionModel(object):
    def __init__(self, doc_count, field_length):
        self.N = doc_count
        self.collection_total = field_length
        if self.N:
            self.mean_length = self.collection_total / self.N
        else:
            self.mean_length = 0

    def normalizer(self, maxweight, top_total):
        raise NotImplementedError

    def score(self, weight_in_top, weight_in_collection, top_total):
        raise NotImplementedError


class Bo1Model(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        f = maxweight / self.N
        return (maxweight * log((1.0 + f) / f) + log(1.0 + f)) / log(2.0)

    def score(self, weight_in_top, weight_in_collection, top_total):
        f = weight_in_collection / self.N
        return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)


class Bo2Model(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        f = maxweight * self.N / self.collection_total
        return maxweight * log((1.0 + f) / f, 2) + log(1.0 + f, 2)

    def score(self, weight_in_top, weight_in_collection, top_total):
        f = weight_in_top * top_total / self.collection_total
        return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)


class KLModel(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        return (maxweight * log(self.collection_total / top_total) / log(2.0)
                * top_total)

    def score(self, weight_in_top, weight_in_collection, top_total):
        wit_over_tt = weight_in_top / top_total
        wic_over_ct = weight_in_collection / self.collection_total

        if wit_over_tt < wic_over_ct:
            return 0
        else:
            return wit_over_tt * log(wit_over_tt
                                     / (weight_in_top / self.collection_total),
                                     2)
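# Example (illustrative sketch, not part of the original module): a model can
# be scored directly, although in practice the Expander class below constructs
# and drives it. The numbers here are made up purely to show the call
# signatures.
#
#   model = Bo1Model(doc_count=1000, field_length=50000)
#   model.score(weight_in_top=5.0, weight_in_collection=60.0, top_total=250.0)
#   model.normalizer(maxweight=12.0, top_total=250.0)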
class Expander(object):
    """Uses an ExpansionModel to expand the set of query terms based on the
    top N result documents.
    """

    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """
        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            :class:`Bo1Model` by default.
        """

        self.ixreader = ixreader
        self.fieldname = fieldname
        doccount = self.ixreader.doc_count_all()
        fieldlen = self.ixreader.field_length(fieldname)

        if type(model) is type:
            model = model(doccount, fieldlen)
        self.model = model

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0

    def add(self, vector):
        """Adds forward-index information about one of the "top N" documents.

        :param vector: A series of (text, weight) tuples, such as is
            returned by Reader.vector_as("weight", docnum, fieldname).
        """

        total_weight = 0
        topN_weight = self.topN_weight

        for word, weight in vector:
            total_weight += weight
            topN_weight[word] += weight

        self.top_total += total_weight

    def add_document(self, docnum):
        ixreader = self.ixreader
        if self.ixreader.has_vector(docnum, self.fieldname):
            self.add(ixreader.vector_as("weight", docnum, self.fieldname))
        elif self.ixreader.schema[self.fieldname].stored:
            self.add_text(ixreader.stored_fields(docnum).get(self.fieldname))
        else:
            raise Exception("Field %r in document %s is not vectored or stored"
                            % (self.fieldname, docnum))

    def add_text(self, string):
        # Unfortunately since field.index() yields bytes texts, and we want
        # unicode, we end up encoding and decoding unnecessarily.
        #
        # TODO: Find a way around this

        field = self.ixreader.schema[self.fieldname]
        from_bytes = field.from_bytes
        self.add((from_bytes(text), weight) for text, _, weight, _
                 in field.index(string))

    def expanded_terms(self, number, normalize=True):
        """Returns the N most important terms in the vectors added so far.

        :param number: The number of terms to return.
        :param normalize: Whether to normalize the weights.
        :returns: A list of ("term", weight) tuples.
        """

        model = self.model
        fieldname = self.fieldname
        ixreader = self.ixreader
        field = ixreader.schema[fieldname]
        tlist = []
        maxweight = 0

        # If no terms have been added, return an empty list
        if not self.topN_weight:
            return []

        for word, weight in iteritems(self.topN_weight):
            btext = field.to_bytes(word)
            if (fieldname, btext) in ixreader:
                cf = ixreader.frequency(fieldname, btext)
                score = model.score(weight, cf, self.top_total)
                if score > maxweight:
                    maxweight = score
                tlist.append((score, word))

        if normalize:
            norm = model.normalizer(maxweight, self.top_total)
        else:
            norm = maxweight
        tlist = [(weight / norm, t) for weight, t in tlist]

        tlist.sort(key=lambda x: (0 - x[0], x[1]))
        return [(t, weight) for weight, t in tlist[:number]]
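# Example (illustrative sketch, not part of the original module): expanding a
# query from the top documents of a search. It assumes an open
# whoosh.reading.IndexReader `reader` whose "content" field is vectored (or at
# least stored), and a list `docnums` of the top result document numbers.
#
#   expander = Expander(reader, "content")
#   for docnum in docnums:
#       expander.add_document(docnum)
#   for term, weight in expander.expanded_terms(5):
#       print(term, weight)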
# Similarity functions

def shingles(input, size=2):
    d = defaultdict(int)
    for shingle in (input[i:i + size]
                    for i in xrange(len(input) - (size - 1))):
        d[shingle] += 1
    return iteritems(d)
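# Example (illustrative, not part of the original module): shingles() breaks a
# string into overlapping character n-grams and counts them, which gives
# simhash() below a set of weighted features to fingerprint.
#
#   dict(shingles("hello"))           # {'he': 1, 'el': 1, 'll': 1, 'lo': 1}
#   dict(shingles("banana", size=3))  # {'ban': 1, 'ana': 2, 'nan': 1}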
def simhash(features, hashbits=32):
    if hashbits == 32:
        hashfn = hash
    else:
        hashfn = lambda s: _hash(s, hashbits)

    vs = [0] * hashbits
    for feature, weight in features:
        h = hashfn(feature)
        for i in xrange(hashbits):
            if h & (1 << i):
                vs[i] += weight
            else:
                vs[i] -= weight

    out = 0
    for i, v in enumerate(vs):
        if v > 0:
            out |= 1 << i
    return out


def _hash(s, hashbits):
    # A variable-length version of Python's builtin hash
    if s == "":
        return 0
    else:
        x = ord(s[0]) << 7
        m = 1000003
        mask = 2 ** hashbits - 1
        for c in s:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(s)
        if x == -1:
            x = -2
        return x


def hamming_distance(first_hash, other_hash, hashbits=32):
    x = (first_hash ^ other_hash) & ((1 << hashbits) - 1)
    tot = 0
    while x:
        tot += 1
        x &= x - 1
    return tot
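# Example (illustrative sketch, not part of the original module): detecting
# near-duplicate strings by shingling each one, fingerprinting the shingle
# counts with simhash(), and comparing fingerprints with hamming_distance().
# Similar inputs tend to produce fingerprints that differ in only a few bits.
#
#   a = simhash(shingles("the quick brown fox jumps over the lazy dog"))
#   b = simhash(shingles("the quick brown fox jumps over the lazy cat"))
#   c = simhash(shingles("an entirely different sentence"))
#   hamming_distance(a, b)  # usually small (near-duplicates)
#   hamming_distance(a, c)  # usually larger (unrelated)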
# Clustering

def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None):
    """
    One-dimensional K-means clustering function.

    :param data: list of data points.
    :param k: number of clusters.
    :param t: tolerance; stop if changes between iterations are smaller than
        this value.
    :param distfun: a distance function.
    :param centers: a list of centroids to start with.
    :param maxiter: maximum number of iterations to run.
    """

    # Adapted from a C version by Roger Zhang, <rogerz@cs.dal.ca>
    # http://cs.smu.ca/~r_zhang/code/kmeans.c

    DOUBLE_MAX = 1.797693e308
    n = len(data)

    error = DOUBLE_MAX  # sum of squared euclidean distance

    counts = [0] * k  # size of each cluster
    labels = [0] * n  # output cluster label for each data point

    # c1 is an array of len k of the temp centroids
    c1 = [0] * k

    # choose k initial centroids
    if centers:
        c = centers
    else:
        c = random.sample(data, k)

    niter = 0
    # main loop
    while True:
        # save error from last step
        old_error = error
        error = 0

        # clear old counts and temp centroids
        for i in xrange(k):
            counts[i] = 0
            c1[i] = 0

        for h in xrange(n):
            # identify the closest cluster
            min_distance = DOUBLE_MAX
            for i in xrange(k):
                distance = (data[h] - c[i]) ** 2
                if distance < min_distance:
                    labels[h] = i
                    min_distance = distance

            # update size and temp centroid of the destination cluster
            c1[labels[h]] += data[h]
            counts[labels[h]] += 1
            # update standard error
            error += min_distance

        for i in xrange(k):  # update all centroids
            c[i] = c1[i] / counts[i] if counts[i] else c1[i]

        niter += 1
        if (abs(error - old_error) < t) or (niter > maxiter):
            break

    return labels, c
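# Example (illustrative sketch, not part of the original module): clustering
# one-dimensional values into two groups. The data is made up; `labels` gives
# the cluster index assigned to each point and `centroids` the final centers.
# Because the initial centroids are sampled randomly when `centers` is not
# given, the numbering of the clusters can vary between runs.
#
#   data = [1.0, 1.2, 0.9, 8.0, 8.3, 7.9]
#   labels, centroids = kmeans(data, 2)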
# Sliding window clusters

def two_pass_variance(data):
    n = 0
    sum1 = 0
    sum2 = 0

    for x in data:
        n += 1
        sum1 = sum1 + x

    mean = sum1 / n

    for x in data:
        sum2 += (x - mean) * (x - mean)

    variance = sum2 / (n - 1)
    return variance


def weighted_incremental_variance(data_weight_pairs):
    mean = 0
    S = 0
    sumweight = 0
    for x, weight in data_weight_pairs:
        temp = weight + sumweight
        Q = x - mean
        R = Q * weight / temp
        S += sumweight * Q * R
        mean += R
        sumweight = temp
    Variance = S / (sumweight - 1)  # if sample is the population, omit -1
    return Variance
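# Example (illustrative, not part of the original module): both helpers
# compute a sample variance, the first from plain values and the second from
# (value, weight) pairs. With unit weights the two agree.
#
#   values = [2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]
#   two_pass_variance(values)                              # 32 / 7
#   weighted_incremental_variance((x, 1) for x in values)  # 32 / 7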
def swin(data, size):
    clusters = []
    for i, left in enumerate(data):
        j = i
        right = data[j]
        while j < len(data) - 1 and right - left < size:
            j += 1
            right = data[j]
        v = 99999
        if j - i > 1:
            v = two_pass_variance(data[i:j + 1])

        clusters.append((left, right, j - i, v))
    clusters.sort(key=lambda x: (0 - x[2], x[3]))
    return clusters
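# Example (illustrative sketch, not part of the original module): sliding a
# window of width 5 over sorted values. Each result tuple is
# (left, right, count, variance), sorted so the largest and then tightest
# windows come first.
#
#   points = sorted([1, 2, 3, 11, 12, 13, 30])
#   swin(points, 5)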