# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
- """Classes and functions for classifying and extracting information from
- documents.
- """
- from __future__ import division
- import random
- from collections import defaultdict
- from math import log
- from whoosh.compat import xrange, iteritems


# Expansion models
#
# Each model scores how strongly a term from the top documents suggests
# itself as a query-expansion term, based on its weight in the top results
# and in the collection as a whole.

class ExpansionModel(object):
    def __init__(self, doc_count, field_length):
        self.N = doc_count
        self.collection_total = field_length
        if self.N:
            self.mean_length = self.collection_total / self.N
        else:
            self.mean_length = 0

    def normalizer(self, maxweight, top_total):
        raise NotImplementedError

    def score(self, weight_in_top, weight_in_collection, top_total):
        raise NotImplementedError


class Bo1Model(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        f = maxweight / self.N
        return (maxweight * log((1.0 + f) / f) + log(1.0 + f)) / log(2.0)

    def score(self, weight_in_top, weight_in_collection, top_total):
        f = weight_in_collection / self.N
        return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)


class Bo2Model(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        f = maxweight * self.N / self.collection_total
        return maxweight * log((1.0 + f) / f, 2) + log(1.0 + f, 2)

    def score(self, weight_in_top, weight_in_collection, top_total):
        f = weight_in_top * top_total / self.collection_total
        return weight_in_top * log((1.0 + f) / f, 2) + log(1.0 + f, 2)


class KLModel(ExpansionModel):
    def normalizer(self, maxweight, top_total):
        return (maxweight * log(self.collection_total / top_total) / log(2.0)
                * top_total)

    def score(self, weight_in_top, weight_in_collection, top_total):
        wit_over_tt = weight_in_top / top_total
        wic_over_ct = weight_in_collection / self.collection_total

        if wit_over_tt < wic_over_ct:
            return 0
        else:
            return wit_over_tt * log(wit_over_tt
                                     / (weight_in_top / self.collection_total),
                                     2)


class Expander(object):
    """Uses an ExpansionModel to expand the set of query terms based on the
    top N result documents.
    """

    def __init__(self, ixreader, fieldname, model=Bo1Model):
        """
        :param ixreader: A :class:`whoosh.reading.IndexReader` object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            :class:`Bo1Model` by default.
        """

        self.ixreader = ixreader
        self.fieldname = fieldname
        doccount = self.ixreader.doc_count_all()
        fieldlen = self.ixreader.field_length(fieldname)

        if type(model) is type:
            model = model(doccount, fieldlen)
        self.model = model

        # Maps words to their weight in the top N documents.
        self.topN_weight = defaultdict(float)

        # Total weight of all terms in the top N documents.
        self.top_total = 0

    def add(self, vector):
        """Adds forward-index information about one of the "top N" documents.

        :param vector: A series of (text, weight) tuples, such as is
            returned by Reader.vector_as("weight", docnum, fieldname).
        """

        total_weight = 0
        topN_weight = self.topN_weight

        for word, weight in vector:
            total_weight += weight
            topN_weight[word] += weight

        self.top_total += total_weight

    def add_document(self, docnum):
        # Add a document by number, preferring its term vector and falling
        # back to re-analyzing the stored field value.
        ixreader = self.ixreader
        if self.ixreader.has_vector(docnum, self.fieldname):
            self.add(ixreader.vector_as("weight", docnum, self.fieldname))
        elif self.ixreader.schema[self.fieldname].stored:
            self.add_text(ixreader.stored_fields(docnum).get(self.fieldname))
        else:
            raise Exception("Field %r in document %s is not vectored or stored"
                            % (self.fieldname, docnum))

    def add_text(self, string):
        # Analyze the given text with the field's analyzer and add the
        # resulting (term, weight) pairs.
        #
        # Unfortunately, since field.index() yields bytes texts and we want
        # unicode, we end up encoding and decoding unnecessarily.
        #
        # TODO: Find a way around this

        field = self.ixreader.schema[self.fieldname]
        from_bytes = field.from_bytes
        self.add((from_bytes(text), weight) for text, _, weight, _
                 in field.index(string))

    def expanded_terms(self, number, normalize=True):
        """Returns the N most important terms in the vectors added so far.

        :param number: The number of terms to return.
        :param normalize: Whether to normalize the weights.
        :returns: A list of ("term", weight) tuples.
        """

        model = self.model
        fieldname = self.fieldname
        ixreader = self.ixreader
        field = ixreader.schema[fieldname]
        tlist = []
        maxweight = 0

        # If no terms have been added, return an empty list
        if not self.topN_weight:
            return []

        for word, weight in iteritems(self.topN_weight):
            btext = field.to_bytes(word)
            if (fieldname, btext) in ixreader:
                cf = ixreader.frequency(fieldname, btext)
                score = model.score(weight, cf, self.top_total)
                if score > maxweight:
                    maxweight = score
                tlist.append((score, word))

        if normalize:
            norm = model.normalizer(maxweight, self.top_total)
        else:
            norm = maxweight
        tlist = [(weight / norm, t) for weight, t in tlist]
        tlist.sort(key=lambda x: (0 - x[0], x[1]))

        return [(t, weight) for weight, t in tlist[:number]]
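

# A minimal usage sketch (illustrative, not part of the library API): expand
# a query with terms from the top few hits. Assumes an open Searcher
# `searcher`, a query `myquery`, and a "content" field that is vectored or
# stored; these names are assumptions for the example.
#
#   expander = Expander(searcher.reader(), "content")
#   for hit in searcher.search(myquery, limit=10):
#       expander.add_document(hit.docnum)
#   print(expander.expanded_terms(5))  # e.g. [("term", weight), ...]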


# Similarity functions

def shingles(input, size=2):
    # Count the overlapping character n-grams ("shingles") of the given size
    # and return an iterator of (shingle, count) pairs.
    d = defaultdict(int)
    for shingle in (input[i:i + size]
                    for i in xrange(len(input) - (size - 1))):
        d[shingle] += 1
    return iteritems(d)


def simhash(features, hashbits=32):
    # Compute a hashbits-wide SimHash fingerprint from an iterable of
    # (feature, weight) pairs.
    if hashbits == 32:
        hashfn = hash
    else:
        hashfn = lambda s: _hash(s, hashbits)

    vs = [0] * hashbits
    for feature, weight in features:
        h = hashfn(feature)
        for i in xrange(hashbits):
            if h & (1 << i):
                vs[i] += weight
            else:
                vs[i] -= weight

    out = 0
    for i, v in enumerate(vs):
        if v > 0:
            out |= 1 << i
    return out


def _hash(s, hashbits):
    # A variable-length version of Python's builtin hash
    if s == "":
        return 0
    else:
        x = ord(s[0]) << 7
        m = 1000003
        mask = 2 ** hashbits - 1
        for c in s:
            x = ((x * m) ^ ord(c)) & mask
        x ^= len(s)
        if x == -1:
            x = -2
        return x


def hamming_distance(first_hash, other_hash, hashbits=32):
    # Count the bits that differ between the two hashes by repeatedly
    # clearing the lowest set bit of the XOR.
    x = (first_hash ^ other_hash) & ((1 << hashbits) - 1)
    tot = 0
    while x:
        tot += 1
        x &= x - 1
    return tot
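

# Illustrative sketch of near-duplicate detection with the helpers above
# (assumed usage, not a fixed API): shingle two strings, fingerprint the
# shingle counts with simhash, and compare fingerprints by Hamming distance.
# A small distance suggests similar text; the threshold here is arbitrary.
#
#   h1 = simhash(shingles("the quick brown fox"))
#   h2 = simhash(shingles("the quick brown fax"))
#   if hamming_distance(h1, h2) <= 6:
#       print("probably near-duplicates")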


# Clustering

def kmeans(data, k, t=0.0001, distfun=None, maxiter=50, centers=None):
    """
    One-dimensional K-means clustering function.

    :param data: list of data points.
    :param k: number of clusters.
    :param t: tolerance; stop if changes between iterations are smaller than
        this value.
    :param distfun: a distance function (currently unused; the squared
        difference between points is always used).
    :param centers: a list of centroids to start with.
    :param maxiter: maximum number of iterations to run.
    """

    # Adapted from a C version by Roger Zhang, <rogerz@cs.dal.ca>
    # http://cs.smu.ca/~r_zhang/code/kmeans.c

    DOUBLE_MAX = 1.797693e308
    n = len(data)

    error = DOUBLE_MAX  # sum of squared euclidean distance

    counts = [0] * k  # size of each cluster
    labels = [0] * n  # output cluster label for each data point

    # c1 is an array of len k of the temp centroids
    c1 = [0] * k

    # choose k initial centroids
    if centers:
        c = centers
    else:
        c = random.sample(data, k)

    niter = 0
    # main loop
    while True:
        # save error from last step
        old_error = error
        error = 0

        # clear old counts and temp centroids
        for i in xrange(k):
            counts[i] = 0
            c1[i] = 0

        for h in xrange(n):
            # identify the closest cluster
            min_distance = DOUBLE_MAX
            for i in xrange(k):
                distance = (data[h] - c[i]) ** 2
                if distance < min_distance:
                    labels[h] = i
                    min_distance = distance

            # update size and temp centroid of the destination cluster
            c1[labels[h]] += data[h]
            counts[labels[h]] += 1
            # update standard error
            error += min_distance

        for i in xrange(k):  # update all centroids
            c[i] = c1[i] / counts[i] if counts[i] else c1[i]

        niter += 1
        if (abs(error - old_error) < t) or (niter > maxiter):
            break

    return labels, c
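

# Illustrative sketch (assumed inputs): cluster one-dimensional points into
# two groups. `labels[i]` gives the cluster index assigned to `data[i]` and
# `centers` holds the final centroids.
#
#   data = [1.0, 1.2, 0.9, 10.0, 10.5, 9.8]
#   labels, centers = kmeans(data, 2)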


# Sliding window clusters

def two_pass_variance(data):
    # Classic two-pass sample variance (n - 1 denominator).
    n = 0
    sum1 = 0
    sum2 = 0

    for x in data:
        n += 1
        sum1 = sum1 + x

    mean = sum1 / n

    for x in data:
        sum2 += (x - mean) * (x - mean)

    variance = sum2 / (n - 1)
    return variance


def weighted_incremental_variance(data_weight_pairs):
    # Single-pass weighted sample variance over (value, weight) pairs.
    mean = 0
    S = 0
    sumweight = 0
    for x, weight in data_weight_pairs:
        temp = weight + sumweight
        Q = x - mean
        R = Q * weight / temp
        S += sumweight * Q * R
        mean += R
        sumweight = temp
    Variance = S / (sumweight - 1)  # if sample is the population, omit -1
    return Variance
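

# Sanity-check sketch: with all weights equal to 1, the single-pass weighted
# algorithm matches the two-pass sample variance (both divide by n - 1).
#
#   two_pass_variance([1, 2, 3, 4])                                  # 5/3
#   weighted_incremental_variance([(1, 1), (2, 1), (3, 1), (4, 1)])  # 5/3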


def swin(data, size):
    # Sliding-window clustering over (sorted) one-dimensional data: for each
    # starting point, extend a window rightward while its span stays under
    # `size`, recording (left_value, right_value, length, variance) for the
    # window. Results are sorted with the largest, then lowest-variance,
    # windows first.
    clusters = []
    for i, left in enumerate(data):
        j = i
        right = data[j]
        while j < len(data) - 1 and right - left < size:
            j += 1
            right = data[j]
        v = 99999
        if j - i > 1:
            v = two_pass_variance(data[i:j + 1])
        clusters.append((left, right, j - i, v))
    clusters.sort(key=lambda x: (0 - x[2], x[3]))
    return clusters
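

# Illustrative sketch (assumed input): find dense windows of width 5 in a
# sorted list of positions. Each returned tuple is
# (left, right, length, variance), with the largest, tightest windows first.
#
#   swin([1, 2, 3, 11, 12, 13, 30], 5)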