  1. # Copyright 2009 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. """
  28. The classes in this module encode and decode posting information for a field.
  29. The field format essentially determines what information is stored about each
occurrence of a term.
  31. """
  32. from collections import defaultdict
  33. from whoosh.analysis import unstopped, entoken
  34. from whoosh.compat import iteritems, dumps, loads, b
  35. from whoosh.system import emptybytes
  36. from whoosh.system import _INT_SIZE, _FLOAT_SIZE
  37. from whoosh.system import pack_uint, unpack_uint, pack_float, unpack_float
  38. # Format base class
  39. class Format(object):
  40. """Abstract base class representing a storage format for a field or vector.
  41. Format objects are responsible for writing and reading the low-level
  42. representation of a field. It controls what kind/level of information to
  43. store about the indexed fields.
  44. """
  45. posting_size = -1
  46. textual = True
  47. __inittypes__ = dict(field_boost=float)
  48. def __init__(self, field_boost=1.0, **options):
  49. """
  50. :param field_boost: A constant boost factor to scale to the score
  51. of all queries matching terms in this field.
  52. """
  53. self.field_boost = field_boost
  54. self.options = options
  55. def __eq__(self, other):
  56. return (other
  57. and self.__class__ is other.__class__
  58. and self.__dict__ == other.__dict__)
  59. def __repr__(self):
  60. return "%s(boost=%s)" % (self.__class__.__name__, self.field_boost)
  61. def fixed_value_size(self):
  62. if self.posting_size < 0:
  63. return None
  64. return self.posting_size
  65. def word_values(self, value, analyzer, **kwargs):
  66. """Takes the text value to be indexed and yields a series of
  67. ("tokentext", frequency, weight, valuestring) tuples, where frequency
  68. is the number of times "tokentext" appeared in the value, weight is the
  69. weight (a float usually equal to frequency in the absence of per-term
  70. boosts) and valuestring is encoded field-specific posting value for the
  71. token. For example, in a Frequency format, the value string would be
  72. the same as frequency; in a Positions format, the value string would
  73. encode a list of token positions at which "tokentext" occured.
  74. :param value: The unicode text to index.
  75. :param analyzer: The analyzer to use to process the text.
  76. """
  77. raise NotImplementedError
  78. def supports(self, name):
  79. """Returns True if this format supports interpreting its posting
  80. value as 'name' (e.g. "frequency" or "positions").
  81. """
  82. return hasattr(self, "decode_" + name)
  83. def decoder(self, name):
  84. """Returns the bound method for interpreting value as 'name',
  85. where 'name' is for example "frequency" or "positions". This
  86. object must have a corresponding Format.decode_<name>() method.
  87. """
  88. return getattr(self, "decode_" + name)
  89. def decode_as(self, astype, valuestring):
  90. """Interprets the encoded value string as 'astype', where 'astype' is
  91. for example "frequency" or "positions". This object must have a
  92. corresponding decode_<astype>() method.
  93. """
  94. return self.decoder(astype)(valuestring)
  95. # Concrete field classes
  96. # TODO: as a legacy thing most of these formats store the frequency but not the
  97. # weight in the value string, so if you use field or term boosts
  98. # postreader.value_as("weight") will not match postreader.weight()
  99. def tokens(value, analyzer, kwargs):
  100. if isinstance(value, (tuple, list)):
  101. gen = entoken(value, **kwargs)
  102. else:
  103. gen = analyzer(value, **kwargs)
  104. return unstopped(gen)
  105. class Existence(Format):
  106. """Only indexes whether a given term occurred in a given document; it does
  107. not store frequencies or positions. This is useful for fields that should
  108. be searchable but not scorable, such as file path.
  109. Supports: frequency, weight (always reports frequency = 1).
  110. """
  111. posting_size = 0
  112. __inittypes__ = dict(field_boost=float)
  113. def __init__(self, field_boost=1.0, **options):
  114. self.field_boost = field_boost
  115. self.options = options
  116. def word_values(self, value, analyzer, **kwargs):
  117. fb = self.field_boost
  118. wordset = set(t.text for t in tokens(value, analyzer, kwargs))
  119. return ((w, 1, fb, emptybytes) for w in wordset)
  120. def encode(self, value):
  121. return emptybytes
  122. def decode_frequency(self, valuestring):
  123. return 1
  124. def decode_weight(self, valuestring):
  125. return self.field_boost
  126. def combine(self, vs):
  127. return emptybytes
  128. class Frequency(Format):
  129. """Stores frequency information for each posting.
  130. Supports: frequency, weight.
  131. """
  132. posting_size = _INT_SIZE
  133. __inittypes__ = dict(field_boost=float, boost_as_freq=bool)
  134. def __init__(self, field_boost=1.0, boost_as_freq=False,
  135. **options):
  136. """
  137. :param field_boost: A constant boost factor to scale to the score of
  138. all queries matching terms in this field.
  139. """
  140. assert isinstance(field_boost, float)
  141. self.field_boost = field_boost
  142. self.options = options
  143. def word_values(self, value, analyzer, **kwargs):
  144. fb = self.field_boost
  145. length = 0
  146. freqs = defaultdict(int)
  147. weights = defaultdict(float)
  148. kwargs["boosts"] = True
  149. for t in tokens(value, analyzer, kwargs):
  150. length += 1
  151. freqs[t.text] += 1
  152. weights[t.text] += t.boost
  153. wvs = ((w, freq, weights[w] * fb, pack_uint(freq)) for w, freq
  154. in iteritems(freqs))
  155. return wvs
  156. def decode_frequency(self, valuestring):
  157. return unpack_uint(valuestring)[0]
  158. def decode_weight(self, valuestring):
  159. freq = unpack_uint(valuestring)[0]
  160. return freq * self.field_boost
  161. def combine(self, vs):
  162. return pack_uint(sum(self.decode_value(v) for v in vs))
  163. class Positions(Format):
  164. """Stores position information in each posting, to allow phrase searching
  165. and "near" queries.
  166. Supports: frequency, weight, positions, position_boosts (always reports
  167. position boost = 1.0).
  168. """
  169. def word_values(self, value, analyzer, **kwargs):
  170. fb = self.field_boost
  171. poses = defaultdict(list)
  172. weights = defaultdict(float)
  173. kwargs["positions"] = True
  174. kwargs["boosts"] = True
  175. for t in tokens(value, analyzer, kwargs):
  176. poses[t.text].append(t.pos)
  177. weights[t.text] += t.boost
  178. for w, poslist in iteritems(poses):
  179. value = self.encode(poslist)
  180. yield (w, len(poslist), weights[w] * fb, value)
  181. def encode(self, poslist):
  182. deltas = []
  183. base = 0
  184. for pos in poslist:
  185. deltas.append(pos - base)
  186. base = pos
  187. return pack_uint(len(deltas)) + dumps(deltas, 2)
  188. def decode_positions(self, valuestring):
  189. if not valuestring.endswith(b(".")):
  190. valuestring += b(".")
  191. codes = loads(valuestring[_INT_SIZE:])
  192. position = 0
  193. positions = []
  194. for code in codes:
  195. position += code
  196. positions.append(position)
  197. return positions
  198. def decode_frequency(self, valuestring):
  199. return unpack_uint(valuestring[:_INT_SIZE])[0]
  200. def decode_weight(self, valuestring):
  201. return self.decode_frequency(valuestring) * self.field_boost
  202. def decode_position_boosts(self, valuestring):
  203. return [(pos, 1) for pos in self.decode_positions(valuestring)]
  204. def combine(self, vs):
  205. s = set()
  206. for v in vs:
  207. s.update(self.decode_positions(v))
  208. return self.encode(sorted(s))
  209. class Characters(Positions):
  210. """Stores token position and character start and end information for each
  211. posting.
  212. Supports: frequency, weight, positions, position_boosts (always reports
  213. position boost = 1.0), characters.
  214. """
  215. def word_values(self, value, analyzer, **kwargs):
  216. fb = self.field_boost
  217. seen = defaultdict(list)
  218. weights = defaultdict(float)
  219. kwargs["positions"] = True
  220. kwargs["chars"] = True
  221. kwargs["boosts"] = True
  222. for t in tokens(value, analyzer, kwargs):
  223. seen[t.text].append((t.pos, t.startchar, t.endchar))
  224. weights[t.text] += t.boost
  225. for w, poslist in iteritems(seen):
  226. value = self.encode(poslist)
  227. yield (w, len(poslist), weights[w] * fb, value)
  228. def encode(self, poslist):
  229. deltas = []
  230. posbase = 0
  231. charbase = 0
  232. for pos, startchar, endchar in poslist:
  233. deltas.append((pos - posbase, startchar - charbase,
  234. endchar - startchar))
  235. posbase = pos
  236. charbase = endchar
  237. return pack_uint(len(deltas)) + dumps(deltas, 2)
  238. def decode_characters(self, valuestring):
  239. if not valuestring.endswith(b(".")):
  240. valuestring += b(".")
  241. codes = loads(valuestring[_INT_SIZE:])
  242. position = 0
  243. endchar = 0
  244. posns_chars = []
  245. for code in codes:
  246. position = code[0] + position
  247. startchar = code[1] + endchar
  248. endchar = code[2] + startchar
  249. posns_chars.append((position, startchar, endchar))
  250. return posns_chars
  251. def decode_positions(self, valuestring):
  252. if not valuestring.endswith(b(".")):
  253. valuestring += b(".")
  254. codes = loads(valuestring[_INT_SIZE:])
  255. position = 0
  256. posns = []
  257. for code in codes:
  258. position = code[0] + position
  259. posns.append(position)
  260. return posns
  261. def combine(self, vs):
  262. s = {}
  263. for v in vs:
  264. for pos, sc, ec in self.decode_characters(v):
  265. if pos in s:
  266. old_sc, old_ec = pos[s]
  267. s[pos] = (min(sc, old_sc), max(ec, old_ec))
  268. else:
  269. s[pos] = (sc, ec)
  270. poses = [(pos, s[pos][0], s[pos][1]) for pos in sorted(s.keys())]
  271. return self.encode(poses)
  272. class PositionBoosts(Positions):
  273. """A format that stores positions and per-position boost information
  274. in each posting.
  275. Supports: frequency, weight, positions, position_boosts.
  276. """
  277. def word_values(self, value, analyzer, **kwargs):
  278. fb = self.field_boost
  279. seen = defaultdict(list)
  280. kwargs["positions"] = True
  281. kwargs["boosts"] = True
  282. for t in tokens(value, analyzer, kwargs):
  283. pos = t.pos
  284. boost = t.boost
  285. seen[t.text].append((pos, boost))
  286. for w, poses in iteritems(seen):
  287. value = self.encode(poses)
  288. yield (w, len(poses), sum(p[1] for p in poses) * fb, value)
  289. def encode(self, poses):
  290. codes = []
  291. base = 0
  292. summedboost = 0
  293. for pos, boost in poses:
  294. summedboost += boost
  295. codes.append((pos - base, boost))
  296. base = pos
  297. return (pack_uint(len(poses)) + pack_float(summedboost)
  298. + dumps(codes, 2))
  299. def decode_position_boosts(self, valuestring):
  300. if not valuestring.endswith(b(".")):
  301. valuestring += b(".")
  302. codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
  303. position = 0
  304. posns_boosts = []
  305. for code in codes:
  306. position = code[0] + position
  307. posns_boosts.append((position, code[1]))
  308. return posns_boosts
  309. def decode_positions(self, valuestring):
  310. if not valuestring.endswith(b(".")):
  311. valuestring += b(".")
  312. codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
  313. position = 0
  314. posns = []
  315. for code in codes:
  316. position = code[0] + position
  317. posns.append(position)
  318. return posns
  319. def decode_weight(self, v):
  320. summedboost = unpack_float(v[_INT_SIZE:_INT_SIZE + _FLOAT_SIZE])[0]
  321. return summedboost * self.field_boost
  322. def combine(self, vs):
  323. s = defaultdict(float)
  324. for v in vs:
  325. for pos, boost in self.decode_position_boosts(v):
  326. s[pos] += boost
  327. return self.encode(sorted(s.items()))
  328. class CharacterBoosts(Characters):
  329. """A format that stores positions, character start and end, and
  330. per-position boost information in each posting.
  331. Supports: frequency, weight, positions, position_boosts, characters,
  332. character_boosts.
  333. """
  334. def word_values(self, value, analyzer, **kwargs):
  335. seen = defaultdict(list)
  336. kwargs["positions"] = True
  337. kwargs["chars"] = True
  338. kwargs["boosts"] = True
  339. for t in tokens(value, analyzer, kwargs):
  340. seen[t.text].append((t.pos, t.startchar, t.endchar, t.boost))
  341. for w, poses in iteritems(seen):
  342. value, summedboost = self.encode(poses)
  343. yield (w, len(poses), summedboost, value)
  344. def encode(self, poses):
  345. fb = self.field_boost
  346. # posns_chars_boosts = [(pos, startchar, endchar, boost), ...]
  347. codes = []
  348. posbase = 0
  349. charbase = 0
  350. summedboost = 0
  351. for pos, startchar, endchar, boost in poses:
  352. codes.append((pos - posbase, startchar - charbase,
  353. endchar - startchar, boost))
  354. posbase = pos
  355. charbase = endchar
  356. summedboost += boost
  357. return ((pack_uint(len(poses)) + pack_float(summedboost * fb)
  358. + dumps(codes, 2)), summedboost)
  359. def decode_character_boosts(self, valuestring):
  360. if not valuestring.endswith(b(".")):
  361. valuestring += b(".")
  362. codes = loads(valuestring[_INT_SIZE + _FLOAT_SIZE:])
  363. position = 0
  364. endchar = 0
  365. posn_char_boosts = []
  366. for code in codes:
  367. position = position + code[0]
  368. startchar = endchar + code[1]
  369. endchar = startchar + code[2]
  370. posn_char_boosts.append((position, startchar, endchar, code[3]))
  371. return posn_char_boosts
  372. def decode_positions(self, valuestring):
  373. return [item[0] for item in self.decode_character_boosts(valuestring)]
  374. def decode_characters(self, valuestring):
  375. return [(pos, startchar, endchar) for pos, startchar, endchar, _
  376. in self.decode_character_boosts(valuestring)]
  377. def decode_position_boosts(self, valuestring):
  378. return [(pos, boost) for pos, _, _, boost
  379. in self.decode_character_boosts(valuestring)]
  380. def combine(self, vs):
  381. s = {}
  382. for v in vs:
  383. for pos, sc, ec, boost in self.decode_character_boosts(v):
  384. if pos in s:
  385. old_sc, old_ec, old_boost = pos[s]
  386. s[pos] = (min(sc, old_sc), max(ec, old_ec),
  387. old_boost + boost)
  388. else:
  389. s[pos] = (sc, ec, boost)
  390. poses = [(pos, sc, ec, boost) for pos, (sc, ec, boost)
  391. in sorted(s.items())]
  392. return self.encode(poses)[0] # encode() returns value, summedboost