# Copyright 2008 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.
  27. """The highlight module contains classes and functions for displaying short
  28. excerpts from hit documents in the search results you present to the user, with
  29. query terms highlighted.
  30. The highlighting system has four main elements.
  31. * **Fragmenters** chop up the original text into __fragments__, based on the
  32. locations of matched terms in the text.
  33. * **Scorers** assign a score to each fragment, allowing the system to rank the
  34. best fragments by whatever criterion.
  35. * **Order functions** control in what order the top-scoring fragments are
  36. presented to the user. For example, you can show the fragments in the order
  37. they appear in the document (FIRST) or show higher-scoring fragments first
  38. (SCORE)
  39. * **Formatters** turn the fragment objects into human-readable output, such as
  40. an HTML string.
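
These classes are normally plugged into a search ``Results`` object rather than
called by hand. A minimal sketch (the ``mysearcher`` and ``myquery`` names and
the "content" field are assumptions for illustration, not part of this
module)::

    results = mysearcher.search(myquery)
    results.fragmenter = highlight.ContextFragmenter(maxchars=200, surround=30)
    results.formatter = highlight.UppercaseFormatter()
    for hit in results:
        print(hit.highlights("content"))
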
See :doc:`/highlight` for more information.
"""

from __future__ import division
from collections import deque
from heapq import nlargest
from itertools import groupby

from whoosh.compat import htmlescape
from whoosh.analysis import Token


# The default value for the maximum chars to examine when fragmenting
DEFAULT_CHARLIMIT = 2 ** 15


# Fragment object

def mkfrag(text, tokens, startchar=None, endchar=None,
           charsbefore=0, charsafter=0):
    """Returns a :class:`Fragment` object based on the :class:`analysis.Token`
    objects in ``tokens``.
    """

    if startchar is None:
        startchar = tokens[0].startchar if tokens else 0
    if endchar is None:
        endchar = tokens[-1].endchar if tokens else len(text)

    startchar = max(0, startchar - charsbefore)
    endchar = min(len(text), endchar + charsafter)
    return Fragment(text, tokens, startchar, endchar)


class Fragment(object):
    """Represents a fragment (extract) from a hit document. This object is
    mainly used to keep track of the start and end points of the fragment and
    the "matched" character ranges inside; it does not contain the text of the
    fragment or do much else.

    The useful attributes are:

    ``Fragment.text``
        The entire original text from which this fragment is taken.

    ``Fragment.matches``
        An ordered list of objects representing the matched terms in the
        fragment. These objects have ``startchar`` and ``endchar`` attributes.

    ``Fragment.startchar``
        The index of the first character in the fragment.

    ``Fragment.endchar``
        The index of the last character in the fragment.

    ``Fragment.matched_terms``
        A ``set`` of the ``text`` of the matched terms in the fragment (if
        available).
    """

    def __init__(self, text, matches, startchar=0, endchar=-1):
        """
        :param text: the source text of the fragment.
        :param matches: a list of objects which have ``startchar`` and
            ``endchar`` attributes, and optionally a ``text`` attribute.
        :param startchar: the index into ``text`` at which the fragment starts.
            The default is 0.
        :param endchar: the index into ``text`` at which the fragment ends.
            The default is -1, which is interpreted as the length of ``text``.
        """

        self.text = text
        self.matches = matches

        if endchar == -1:
            endchar = len(text)
        self.startchar = startchar
        self.endchar = endchar

        self.matched_terms = set()
        for t in matches:
            if hasattr(t, "text"):
                self.matched_terms.add(t.text)

    def __repr__(self):
        return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
                                        len(self.matches))

    def __len__(self):
        return self.endchar - self.startchar

    def overlaps(self, fragment):
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return (sc < fsc < ec) or (sc < fec < ec)

    def overlapped_length(self, fragment):
        sc = self.startchar
        ec = self.endchar
        fsc = fragment.startchar
        fec = fragment.endchar
        return max(ec, fec) - min(sc, fsc)

    def __lt__(self, other):
        return id(self) < id(other)


# Tokenizing

def set_matched_filter(tokens, termset):
    for t in tokens:
        t.matched = t.text in termset
        yield t


# Fragmenters

class Fragmenter(object):
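    """Base class for fragmenters, which decide how to divide the highlighted
    text into :class:`Fragment` objects.

    A minimal custom fragmenter only needs to yield :class:`Fragment` objects
    from ``fragment_tokens()``. The subclass below is an illustrative sketch
    only (it is not part of this module); it emits one single-token fragment
    per matched term::

        class EveryMatchFragmenter(Fragmenter):
            def fragment_tokens(self, text, all_tokens):
                for t in all_tokens:
                    if t.matched:
                        yield mkfrag(text, [t.copy()])
    """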

    def must_retokenize(self):
        """Returns True if this fragmenter requires retokenized text.

        If this method returns True, the fragmenter's ``fragment_tokens``
        method will be called with an iterator of ALL tokens from the text,
        with the tokens for matched terms having the ``matched`` attribute set
        to True.

        If this method returns False, the fragmenter's ``fragment_matches``
        method will be called with a LIST of matching tokens.
        """

        return True

    def fragment_tokens(self, text, all_tokens):
        """Yields :class:`Fragment` objects based on the tokenized text.

        :param text: the string being highlighted.
        :param all_tokens: an iterator of :class:`analysis.Token`
            objects from the string.
        """

        raise NotImplementedError

    def fragment_matches(self, text, matched_tokens):
        """Yields :class:`Fragment` objects based on the text and the matched
        terms.

        :param text: the string being highlighted.
        :param matched_tokens: a list of :class:`analysis.Token` objects
            representing the term matches in the string.
        """

        raise NotImplementedError


class WholeFragmenter(Fragmenter):
    """Doesn't fragment the token stream. This object just returns the entire
    stream as one "fragment". This is useful if you want to highlight the
    entire text.

    Note that even if you use the `WholeFragmenter`, the highlight code will
    return no fragment if no terms matched in the given field. To return the
    whole fragment even in that case, call `highlights()` with `minscore=0`::

        # Query where no terms match in the "text" field
        q = query.Term("tag", "new")

        r = mysearcher.search(q)
        r.fragmenter = highlight.WholeFragmenter()
        r.formatter = highlight.UppercaseFormatter()
        # Since no terms in the "text" field matched, we get no fragments back
        assert r[0].highlights("text") == ""

        # If we lower the minimum score to 0, we get a fragment even though it
        # has no matching terms
        assert r[0].highlights("text", minscore=0) == "This is the text field."
    """

    def __init__(self, charlimit=DEFAULT_CHARLIMIT):
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        charlimit = self.charlimit
        matches = []
        for t in tokens:
            if charlimit and t.endchar > charlimit:
                break

            if t.matched:
                matches.append(t.copy())

        return [Fragment(text, matches)]


# Backwards compatibility
NullFragmeter = WholeFragmenter


class SentenceFragmenter(Fragmenter):
    """Breaks the text up on sentence end punctuation characters
    (".", "!", or "?"). This object works by looking in the original text for a
    sentence end as the next character after each token's 'endchar'.

    When highlighting with this fragmenter, you should use an analyzer that
    does NOT remove stop words, for example::

        sa = StandardAnalyzer(stoplist=None)
    """

    def __init__(self, maxchars=200, sentencechars=".!?",
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param sentencechars: The characters treated as sentence-end
            punctuation.
        :param charlimit: The maximum number of characters of text to examine.
        """

        self.maxchars = maxchars
        self.sentencechars = frozenset(sentencechars)
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        sentencechars = self.sentencechars
        charlimit = self.charlimit

        textlen = len(text)
        # startchar of first token in the current sentence
        first = None
        # Buffer for matched tokens in the current sentence
        tks = []
        endchar = None
        # Number of chars in the current sentence
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            if charlimit and endchar > charlimit:
                break

            if first is None:
                # Remember the startchar of the first token in a sentence
                first = startchar
                currentlen = 0

            tlength = endchar - startchar
            currentlen += tlength

            if t.matched:
                tks.append(t.copy())

            # If the character after the current token is end-of-sentence
            # punctuation, finish the sentence and reset
            if endchar < textlen and text[endchar] in sentencechars:
                # Don't break for two periods in a row (e.g. ignore "...")
                if endchar + 1 < textlen and text[endchar + 1] in sentencechars:
                    continue

                # If the sentence had matches and it's not too long, yield it
                # as a fragment
                if tks and currentlen <= maxchars:
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)

                # Reset the counts
                tks = []
                first = None
                currentlen = 0

        # If we get to the end of the text and there's still a sentence
        # in the buffer, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)


class ContextFragmenter(Fragmenter):
    """Looks for matched terms and aggregates them with their surrounding
    context.
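
    This is the default fragmenter used by :class:`Highlighter`. A short usage
    sketch (the ``results`` object and the parameter values here are only for
    illustration)::

        results.fragmenter = highlight.ContextFragmenter(maxchars=300,
                                                          surround=50)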
  248. """

    def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        :param charlimit: The maximum number of characters of text to examine.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.charlimit = charlimit

    def fragment_tokens(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        charlimit = self.charlimit

        # startchar of the first token in the fragment
        first = None
        # Stack of startchars
        firsts = deque()
        # Each time we see a matched token, we reset the countdown to finishing
        # the fragment. This also indicates whether we're currently inside a
        # fragment (< 0 not in fragment, >= 0 in fragment)
        countdown = -1
        # Tokens in current fragment
        tks = []
        endchar = None
        # Number of chars in the current fragment
        currentlen = 0

        for t in tokens:
            startchar = t.startchar
            endchar = t.endchar
            tlength = endchar - startchar
            if charlimit and endchar > charlimit:
                break

            if countdown < 0 and not t.matched:
                # We're not in a fragment currently, so just maintain the
                # "charsbefore" buffer
                firsts.append(startchar)
                while firsts and endchar - firsts[0] > surround:
                    firsts.popleft()
            elif currentlen + tlength > maxchars:
                # We're in a fragment, but adding this token would put us past
                # the maximum size. Zero the countdown so the code below will
                # cause the fragment to be emitted
                countdown = 0
            elif t.matched:
                # Start/restart the countdown
                countdown = surround
                # Remember the first char of this fragment
                if first is None:
                    if firsts:
                        first = firsts[0]
                    else:
                        first = startchar
                        # Add on unused front context
                        countdown += surround
                tks.append(t.copy())

            # If we're in a fragment...
            if countdown >= 0:
                # Update the counts
                currentlen += tlength
                countdown -= tlength

                # If the countdown is expired
                if countdown <= 0:
                    # Finish the fragment
                    yield mkfrag(text, tks, startchar=first, endchar=endchar)
                    # Reset the counts
                    tks = []
                    firsts = deque()
                    first = None
                    currentlen = 0

        # If there's a fragment left over at the end, yield it
        if tks:
            yield mkfrag(text, tks, startchar=first, endchar=endchar)


class PinpointFragmenter(Fragmenter):
    """This is a NON-RETOKENIZING fragmenter. It builds fragments from the
    positions of the matched terms.
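
    When the searched field stores character information and the matched terms
    were recorded for the search, :class:`Highlighter` can hand this fragmenter
    the match positions directly instead of re-analyzing the stored text (see
    ``Highlighter.can_load_chars``). A usage sketch (the ``results`` object is
    assumed)::

        results.fragmenter = highlight.PinpointFragmenter(autotrim=True)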
  325. """

    def __init__(self, maxchars=200, surround=20, autotrim=False,
                 charlimit=DEFAULT_CHARLIMIT):
        """
        :param maxchars: The maximum number of characters allowed in a
            fragment.
        :param surround: The number of extra characters of context to add both
            before the first matched term and after the last matched term.
        :param autotrim: automatically trims text before the first space and
            after the last space in the fragments, to try to avoid truncated
            words at the start and end. For short fragments or fragments with
            long runs between spaces this may give strange results.
        :param charlimit: The maximum number of characters of text to examine.
        """

        self.maxchars = maxchars
        self.surround = surround
        self.autotrim = autotrim
        self.charlimit = charlimit

    def must_retokenize(self):
        return False

    def fragment_tokens(self, text, tokens):
        matched = [t for t in tokens if t.matched]
        return self.fragment_matches(text, matched)

    @staticmethod
    def _autotrim(fragment):
        text = fragment.text
        startchar = fragment.startchar
        endchar = fragment.endchar

        firstspace = text.find(" ", startchar, endchar)
        if firstspace > 0:
            startchar = firstspace + 1
        lastspace = text.rfind(" ", startchar, endchar)
        if lastspace > 0:
            endchar = lastspace

        if fragment.matches:
            startchar = min(startchar, fragment.matches[0].startchar)
            endchar = max(endchar, fragment.matches[-1].endchar)
        fragment.startchar = startchar
        fragment.endchar = endchar

    def fragment_matches(self, text, tokens):
        maxchars = self.maxchars
        surround = self.surround
        autotrim = self.autotrim
        charlimit = self.charlimit

        j = -1

        for i, t in enumerate(tokens):
            if j >= i:
                continue
            j = i
            left = t.startchar
            right = t.endchar
            if charlimit and right > charlimit:
                break

            currentlen = right - left
            while j < len(tokens) - 1 and currentlen < maxchars:
                next = tokens[j + 1]
                ec = next.endchar
                if ec - right <= surround and ec - left <= maxchars:
                    j += 1
                    right = ec
                    currentlen += (ec - next.startchar)
                else:
                    break

            left = max(0, left - surround)
            right = min(len(text), right + surround)

            fragment = Fragment(text, tokens[i:j + 1], left, right)
            if autotrim:
                self._autotrim(fragment)
            yield fragment


# Fragment scorers

class FragmentScorer(object):
    pass


class BasicFragmentScorer(FragmentScorer):
    def __call__(self, f):
        # Add up the boosts for the matched terms in this passage
        score = sum(t.boost for t in f.matches)

        # Favor diversity: multiply score by the number of separate
        # terms matched
        score *= (len(f.matched_terms) * 100) or 1

        return score


# Fragment sorters

def SCORE(fragment):
    "Sorts higher scored passages first."
    return 1


def FIRST(fragment):
    "Sorts passages from earlier in the document first."
    return fragment.startchar


def LONGER(fragment):
    "Sorts longer passages first."
    return 0 - len(fragment)


def SHORTER(fragment):
    "Sorts shorter passages first."
    return len(fragment)
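

# Any of the functions above (or any callable taking a fragment and returning
# a sort key) can be passed as the ``order`` argument to top_fragments(),
# highlight(), or Highlighter below.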


# Formatters

def get_text(original, token, replace):
    """Convenience function for getting the text to use for a match when
    formatting.

    If ``replace`` is False, returns the part of ``original`` between
    ``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns
    ``token.text``.
    """

    if replace:
        return token.text
    else:
        return original[token.startchar:token.endchar]


class Formatter(object):
    """Base class for formatters.

    For highlighters that return strings, it is usually only necessary to
    override :meth:`Formatter.format_token`.

    Use the :func:`get_text` function as a convenience to get the token text::

        class MyFormatter(Formatter):
            def format_token(self, text, token, replace=False):
                ttext = get_text(text, token, replace)
                return "[%s]" % ttext
    """

    between = "..."

    def _text(self, text):
        return text

    def format_token(self, text, token, replace=False):
        """Returns a formatted version of the given "token" object, which
        should have at least ``startchar`` and ``endchar`` attributes, and
        a ``text`` attribute if ``replace`` is True.

        :param text: the original fragment text being highlighted.
        :param token: an object having ``startchar`` and ``endchar`` attributes
            and optionally a ``text`` attribute (if ``replace`` is True).
        :param replace: if True, the original text between the token's
            ``startchar`` and ``endchar`` indices will be replaced with the
            value of the token's ``text`` attribute.
        """

        raise NotImplementedError

    def format_fragment(self, fragment, replace=False):
        """Returns a formatted version of the given text, using the "token"
        objects in the given :class:`Fragment`.

        :param fragment: a :class:`Fragment` object representing a list of
            matches in the text.
        :param replace: if True, the original text corresponding to each
            match will be replaced with the value of the token object's
            ``text`` attribute.
        """

        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar is None:
                continue
            if t.startchar < index:
                continue
            if t.startchar > index:
                output.append(self._text(text[index:t.startchar]))
            output.append(self.format_token(text, t, replace))
            index = t.endchar

        output.append(self._text(text[index:fragment.endchar]))

        out_string = "".join(output)
        return out_string

    def format(self, fragments, replace=False):
        """Returns a formatted version of the given text, using a list of
        :class:`Fragment` objects.
        """

        formatted = [self.format_fragment(f, replace=replace)
                     for f in fragments]
        return self.between.join(formatted)

    def __call__(self, text, fragments):
        # For backwards compatibility
        return self.format(fragments)


class NullFormatter(Formatter):
    """Formatter that does not modify the string.
    """

    def format_token(self, text, token, replace=False):
        return get_text(text, token, replace)


class UppercaseFormatter(Formatter):
    """Returns a string in which the matched terms are in UPPERCASE.
    """

    def __init__(self, between="..."):
        """
        :param between: the text to add between fragments.
        """

        self.between = between

    def format_token(self, text, token, replace=False):
        ttxt = get_text(text, token, replace)
        return ttxt.upper()


class HtmlFormatter(Formatter):
    """Returns a string containing HTML formatting around the matched terms.

    This formatter wraps matched terms in an HTML element with two class names.
    The first class name (set with the constructor argument ``classname``) is
    the same for each match. The second class name (set with the constructor
    argument ``termclass``) is different depending on which term matched. This
    allows you to give different formatting (for example, different background
    colors) to the different terms in the excerpt.

    >>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
    >>> hf(mytext, myfragments)
    "The <span class="match term0">template</span> <span class="match term1">geometry</span> is..."

    This object maintains a dictionary mapping terms to HTML class names (e.g.
    ``term0`` and ``term1`` above), so that multiple excerpts will use the same
    class for the same term. If you want to re-use the same HtmlFormatter
    object with different searches, you should call ``HtmlFormatter.clean()``
    between searches to clear the mapping.
    """

    template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s</%(tag)s>'

    def __init__(self, tagname="strong", between="...",
                 classname="match", termclass="term", maxclasses=5,
                 attrquote='"'):
        """
        :param tagname: the tag to wrap around matching terms.
        :param between: the text to add between fragments.
        :param classname: the class name to add to the elements wrapped around
            matching terms.
        :param termclass: the class name prefix for the second class which is
            different for each matched term.
        :param maxclasses: the maximum number of term classes to produce. This
            limits the number of classes you have to define in CSS by recycling
            term class names. For example, if you set maxclasses to 3 and have
            5 terms, the 5 terms will use the CSS classes ``term0``, ``term1``,
            ``term2``, ``term0``, ``term1``.
        """

        self.between = between
        self.tagname = tagname
        self.classname = classname
        self.termclass = termclass
        self.attrquote = attrquote
        self.maxclasses = maxclasses
        self.seen = {}
        self.htmlclass = " ".join((self.classname, self.termclass))

    def _text(self, text):
        return htmlescape(text, quote=False)

    def format_token(self, text, token, replace=False):
        seen = self.seen
        ttext = self._text(get_text(text, token, replace))
        if ttext in seen:
            termnum = seen[ttext]
        else:
            termnum = len(seen) % self.maxclasses
            seen[ttext] = termnum

        return self.template % {"tag": self.tagname, "q": self.attrquote,
                                "cls": self.htmlclass, "t": ttext,
                                "tn": termnum}

    def clean(self):
        """Clears the dictionary mapping terms to HTML classnames.
        """

        self.seen = {}


class GenshiFormatter(Formatter):
    """Returns a Genshi event stream containing HTML formatting around the
    matched terms.
    """

    def __init__(self, qname="strong", between="..."):
        """
        :param qname: the QName for the tag to wrap around matched terms.
        :param between: the text to add between fragments.
        """

        self.qname = qname
        self.between = between

        from genshi.core import START, END, TEXT  # @UnresolvedImport
        from genshi.core import Attrs, Stream  # @UnresolvedImport
        self.START, self.END, self.TEXT = START, END, TEXT
        self.Attrs, self.Stream = Attrs, Stream

    def _add_text(self, text, output):
        if output and output[-1][0] == self.TEXT:
            output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
        else:
            output.append((self.TEXT, text, (None, -1, -1)))

    def format_token(self, text, token, replace=False):
        qn = self.qname
        txt = get_text(text, token, replace)
        return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)),
                            (self.TEXT, txt, (None, -1, -1)),
                            (self.END, qn, (None, -1, -1))])

    def format_fragment(self, fragment, replace=False):
        output = []
        index = fragment.startchar
        text = fragment.text

        for t in fragment.matches:
            if t.startchar > index:
                self._add_text(text[index:t.startchar], output)
            # Add the formatted token's events (not a bare tuple, which would
            # not be a valid Genshi event)
            output.extend(self.format_token(text, t, replace))
            index = t.endchar
        if index < len(text):
            self._add_text(text[index:], output)
        return self.Stream(output)

    def format(self, fragments, replace=False):
        output = []
        first = True
        for fragment in fragments:
            if not first:
                self._add_text(self.between, output)
            output += self.format_fragment(fragment, replace=replace)
            first = False
        return self.Stream(output)


# Highlighting

def top_fragments(fragments, count, scorer, order, minscore=1):
    scored_fragments = ((scorer(f), f) for f in fragments)
    scored_fragments = nlargest(count, scored_fragments)
    best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
    best_fragments.sort(key=order)
    return best_fragments


def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
              scorer=None, minscore=1, order=FIRST, mode="query"):
    if scorer is None:
        scorer = BasicFragmentScorer()

    if type(fragmenter) is type:
        fragmenter = fragmenter()
    if type(formatter) is type:
        formatter = formatter()
    if type(scorer) is type:
        scorer = scorer()

    termset = frozenset(terms)
    tokens = analyzer(text, chars=True, mode=mode, removestops=False)
    tokens = set_matched_filter(tokens, termset)
    fragments = fragmenter.fragment_tokens(text, tokens)
    fragments = top_fragments(fragments, top, scorer, order, minscore)
    return formatter(text, fragments)
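
# A sketch of calling highlight() directly on a plain string, outside of a
# search (StandardAnalyzer and the sample text/terms are only for
# illustration):
#
#     from whoosh.analysis import StandardAnalyzer
#
#     excerpt = highlight(u"The quick brown fox jumped over the lazy dog",
#                         frozenset(["fox", "dog"]), StandardAnalyzer(),
#                         ContextFragmenter(), UppercaseFormatter())
#     # excerpt is a string with FOX and DOG uppercased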


class Highlighter(object):
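    """Bundles a :class:`Fragmenter`, a scorer, a :class:`Formatter`, and an
    order function, and uses them to produce highlighted excerpts for search
    hits via :meth:`Highlighter.highlight_hit`.

    A sketch of direct use (the ``hit`` object and the "content" field are
    assumptions for illustration)::

        hl = Highlighter(fragmenter=ContextFragmenter(surround=40),
                         formatter=UppercaseFormatter())
        excerpt = hl.highlight_hit(hit, "content")
    """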

    def __init__(self, fragmenter=None, scorer=None, formatter=None,
                 always_retokenize=False, order=FIRST):
        self.fragmenter = fragmenter or ContextFragmenter()
        self.scorer = scorer or BasicFragmentScorer()
        self.formatter = formatter or HtmlFormatter(tagname="b")
        self.order = order
        self.always_retokenize = always_retokenize

    def can_load_chars(self, results, fieldname):
        # Is it possible to build a mapping between the matched terms/docs and
        # their start and end chars for "pinpoint" highlighting (ie not require
        # re-tokenizing text)?

        if self.always_retokenize:
            # No, we've been configured to always retokenize some text
            return False
        if not results.has_matched_terms():
            # No, we don't know what the matched terms are yet
            return False
        if self.fragmenter.must_retokenize():
            # No, the configured fragmenter doesn't support it
            return False

        # Maybe, if the field was configured to store characters
        field = results.searcher.schema[fieldname]
        return field.supports("characters")

    @staticmethod
    def _load_chars(results, fieldname, texts, to_bytes):
        # For each docnum, create a mapping of text -> [(startchar, endchar)]
        # for the matched terms

        results._char_cache[fieldname] = cache = {}
        sorted_ids = sorted(docnum for _, docnum in results.top_n)

        for docnum in sorted_ids:
            cache[docnum] = {}

        for text in texts:
            btext = to_bytes(text)
            m = results.searcher.postings(fieldname, btext)
            docset = set(results.termdocs[(fieldname, btext)])
            for docnum in sorted_ids:
                if docnum in docset:
                    m.skip_to(docnum)
                    assert m.id() == docnum
                    cache[docnum][text] = m.value_as("characters")

    @staticmethod
    def _merge_matched_tokens(tokens):
        # Merges consecutive matched tokens together, so they are highlighted
        # as one

        token = None

        for t in tokens:
            if not t.matched:
                if token is not None:
                    yield token
                    token = None
                yield t
                continue

            if token is None:
                token = t.copy()
            elif t.startchar <= token.endchar:
                if t.endchar > token.endchar:
                    token.text += t.text[token.endchar - t.endchar:]
                    token.endchar = t.endchar
            else:
                yield token
                token = None
                # t was not merged, also has to be yielded
                yield t

        if token is not None:
            yield token

    def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
        results = hitobj.results
        schema = results.searcher.schema
        field = schema[fieldname]
        to_bytes = field.to_bytes
        from_bytes = field.from_bytes

        if text is None:
            if fieldname not in hitobj:
                raise KeyError("Field %r is not stored." % fieldname)
            text = hitobj[fieldname]

        # Get the terms searched for/matched in this field
        if results.has_matched_terms():
            bterms = (term for term in results.matched_terms()
                      if term[0] == fieldname)
        else:
            bterms = results.query_terms(expand=True, fieldname=fieldname)
        # Convert bytes to unicode
        words = frozenset(from_bytes(term[1]) for term in bterms)

        # If we can do "pinpoint" highlighting...
        if self.can_load_chars(results, fieldname):
            # Build the docnum->[(startchar, endchar),] map
            if fieldname not in results._char_cache:
                self._load_chars(results, fieldname, words, to_bytes)

            hitterms = (from_bytes(term[1]) for term in hitobj.matched_terms()
                        if term[0] == fieldname)

            # Grab the word->[(startchar, endchar)] map for this docnum
            cmap = results._char_cache[fieldname][hitobj.docnum]
            # A list of Token objects for matched words
            tokens = []
            charlimit = self.fragmenter.charlimit
            for word in hitterms:
                chars = cmap[word]
                for pos, startchar, endchar in chars:
                    if charlimit and endchar > charlimit:
                        break
                    tokens.append(Token(text=word, pos=pos,
                                        startchar=startchar, endchar=endchar))
            tokens.sort(key=lambda t: t.startchar)
            tokens = [max(group, key=lambda t: t.endchar - t.startchar)
                      for key, group in groupby(tokens, lambda t: t.startchar)]
            fragments = self.fragmenter.fragment_matches(text, tokens)
        else:
            # Retokenize the text
            analyzer = results.searcher.schema[fieldname].analyzer
            tokens = analyzer(text, positions=True, chars=True, mode="index",
                              removestops=False)
            # Set Token.matched attribute for tokens that match a query term
            tokens = set_matched_filter(tokens, words)
            tokens = self._merge_matched_tokens(tokens)
            fragments = self.fragmenter.fragment_tokens(text, tokens)

        fragments = top_fragments(fragments, top, self.scorer, self.order,
                                  minscore=minscore)
        output = self.formatter.format(fragments)
        return output