- # Copyright 2008 Matt Chaput. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #
- # 1. Redistributions of source code must retain the above copyright notice,
- # this list of conditions and the following disclaimer.
- #
- # 2. Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- #
- # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
- # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
- # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
- # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- #
- # The views and conclusions contained in the software and documentation are
- # those of the authors and should not be interpreted as representing official
- # policies, either expressed or implied, of Matt Chaput.
- """The highlight module contains classes and functions for displaying short
- excerpts from hit documents in the search results you present to the user, with
- query terms highlighted.
- The highlighting system has four main elements.
- * **Fragmenters** chop up the original text into __fragments__, based on the
- locations of matched terms in the text.
- * **Scorers** assign a score to each fragment, allowing the system to rank the
- best fragments by whatever criterion you choose.
- * **Order functions** control in what order the top-scoring fragments are
- presented to the user. For example, you can show the fragments in the order
- they appear in the document (FIRST) or show higher-scoring fragments first
- (SCORE).
- * **Formatters** turn the fragment objects into human-readable output, such as
- an HTML string.
- See :doc:`/highlight` for more information.
- """
- from __future__ import division
- from collections import deque
- from heapq import nlargest
- from itertools import groupby
- from whoosh.compat import htmlescape
- from whoosh.analysis import Token
- # The default value for the maximum chars to examine when fragmenting
- DEFAULT_CHARLIMIT = 2 ** 15
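- # Illustrative sketch (an addition, not part of the original module): a minimal
- # end-to-end use of the elements described in the module docstring. The index
- # directory "indexdir" and the "content" field are assumptions for the example;
- # the Results attributes used here mirror the examples in the WholeFragmenter
- # docstring below.
- def _example_highlighting_workflow():
-     from whoosh import index
-     from whoosh.qparser import QueryParser
-     ix = index.open_dir("indexdir")  # assumed existing index
-     with ix.searcher() as searcher:
-         q = QueryParser("content", ix.schema).parse(u"render shading")
-         results = searcher.search(q)
-         # Swap in a fragmenter and formatter; a scorer and order function can
-         # likewise be supplied through a Highlighter (defined later in this
-         # module)
-         results.fragmenter = ContextFragmenter(maxchars=200, surround=30)
-         results.formatter = HtmlFormatter(tagname="b")
-         for hit in results:
-             print(hit.highlights("content"))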
- # Fragment object
- def mkfrag(text, tokens, startchar=None, endchar=None,
- charsbefore=0, charsafter=0):
- """Returns a :class:`Fragment` object based on the :class:`analysis.Token`
- objects in ``tokens``.
- """
- if startchar is None:
- startchar = tokens[0].startchar if tokens else 0
- if endchar is None:
- endchar = tokens[-1].endchar if tokens else len(text)
- startchar = max(0, startchar - charsbefore)
- endchar = min(len(text), endchar + charsafter)
- return Fragment(text, tokens, startchar, endchar)
- class Fragment(object):
- """Represents a fragment (extract) from a hit document. This object is
- mainly used to keep track of the start and end points of the fragment and
- the "matched" character ranges inside; it does not contain the text of the
- fragment or do much else.
- The useful attributes are:
- ``Fragment.text``
- The entire original text from which this fragment is taken.
- ``Fragment.matches``
- An ordered list of objects representing the matched terms in the
- fragment. These objects have ``startchar`` and ``endchar`` attributes.
- ``Fragment.startchar``
- The index of the first character in the fragment.
- ``Fragment.endchar``
- The index of the character following the last character in the fragment.
- ``Fragment.matched_terms``
- A ``set`` of the ``text`` of the matched terms in the fragment (if
- available).
- """
- def __init__(self, text, matches, startchar=0, endchar=-1):
- """
- :param text: the source text of the fragment.
- :param matches: a list of objects which have ``startchar`` and
- ``endchar`` attributes, and optionally a ``text`` attribute.
- :param startchar: the index into ``text`` at which the fragment starts.
- The default is 0.
- :param endchar: the index into ``text`` at which the fragment ends.
- The default is -1, which is interpreted as the length of ``text``.
- """
- self.text = text
- self.matches = matches
- if endchar == -1:
- endchar = len(text)
- self.startchar = startchar
- self.endchar = endchar
- self.matched_terms = set()
- for t in matches:
- if hasattr(t, "text"):
- self.matched_terms.add(t.text)
- def __repr__(self):
- return "<Fragment %d:%d %d>" % (self.startchar, self.endchar,
- len(self.matches))
- def __len__(self):
- return self.endchar - self.startchar
- def overlaps(self, fragment):
- sc = self.startchar
- ec = self.endchar
- fsc = fragment.startchar
- fec = fragment.endchar
- return (sc < fsc < ec) or (sc < fec < ec)
- def overlapped_length(self, fragment):
- sc = self.startchar
- ec = self.endchar
- fsc = fragment.startchar
- fec = fragment.endchar
- return max(ec, fec) - min(sc, fsc)
- def __lt__(self, other):
- return id(self) < id(other)
- # Tokenizing
- def set_matched_filter(tokens, termset):
- for t in tokens:
- t.matched = t.text in termset
- yield t
- # Fragmenters
- class Fragmenter(object):
- def must_retokenize(self):
- """Returns True if this fragmenter requires retokenized text.
- If this method returns True, the fragmenter's ``fragment_tokens``
- method will be called with an iterator of ALL tokens from the text,
- with the tokens for matched terms having the ``matched`` attribute set
- to True.
- If this method returns False, the fragmenter's ``fragment_matches``
- method will be called with a LIST of matching tokens.
- """
- return True
- def fragment_tokens(self, text, all_tokens):
- """Yields :class:`Fragment` objects based on the tokenized text.
- :param text: the string being highlighted.
- :param all_tokens: an iterator of :class:`analysis.Token`
- objects from the string.
- """
- raise NotImplementedError
- def fragment_matches(self, text, matched_tokens):
- """Yields :class:`Fragment` objects based on the text and the matched
- terms.
- :param text: the string being highlighted.
- :param matched_tokens: a list of :class:`analysis.Token` objects
- representing the term matches in the string.
- """
- raise NotImplementedError
- class WholeFragmenter(Fragmenter):
- """Doesn't fragment the token stream. This object just returns the entire
- entire stream as one "fragment". This is useful if you want to highlight
- the entire text.
- Note that even if you use the `WholeFragmenter`, the highlight code will
- return no fragment if no terms matched in the given field. To return the
- whole fragment even in that case, call `highlights()` with `minscore=0`::
- # Query where no terms match in the "text" field
- q = query.Term("tag", "new")
- r = mysearcher.search(q)
- r.fragmenter = highlight.WholeFragmenter()
- r.formatter = highlight.UppercaseFormatter()
- # Since no terms in the "text" field matched, we get no fragments back
- assert r[0].highlights("text") == ""
- # If we lower the minimum score to 0, we get a fragment even though it
- # has no matching terms
- assert r[0].highlights("text", minscore=0) == "This is the text field."
- """
- def __init__(self, charlimit=DEFAULT_CHARLIMIT):
- self.charlimit = charlimit
- def fragment_tokens(self, text, tokens):
- charlimit = self.charlimit
- matches = []
- for t in tokens:
- if charlimit and t.endchar > charlimit:
- break
- if t.matched:
- matches.append(t.copy())
- return [Fragment(text, matches)]
- # Backwards compatibility
- NullFragmeter = WholeFragmenter
- class SentenceFragmenter(Fragmenter):
- """Breaks the text up on sentence end punctuation characters
- (".", "!", or "?"). This object works by looking in the original text for a
- sentence end as the next character after each token's 'endchar'.
- When highlighting with this fragmenter, you should use an analyzer that
- does NOT remove stop words, for example::
- sa = StandardAnalyzer(stoplist=None)
- """
- def __init__(self, maxchars=200, sentencechars=".!?",
- charlimit=DEFAULT_CHARLIMIT):
- """
- :param maxchars: The maximum number of characters allowed in a
- fragment.
- :param sentencechars: the set of characters that mark the end of a
- sentence.
- :param charlimit: the maximum number of characters of text to examine
- when fragmenting.
- """
- self.maxchars = maxchars
- self.sentencechars = frozenset(sentencechars)
- self.charlimit = charlimit
- def fragment_tokens(self, text, tokens):
- maxchars = self.maxchars
- sentencechars = self.sentencechars
- charlimit = self.charlimit
- textlen = len(text)
- # startchar of first token in the current sentence
- first = None
- # Buffer for matched tokens in the current sentence
- tks = []
- endchar = None
- # Number of chars in the current sentence
- currentlen = 0
- for t in tokens:
- startchar = t.startchar
- endchar = t.endchar
- if charlimit and endchar > charlimit:
- break
- if first is None:
- # Remember the startchar of the first token in a sentence
- first = startchar
- currentlen = 0
- tlength = endchar - startchar
- currentlen += tlength
- if t.matched:
- tks.append(t.copy())
- # If the character after the current token is end-of-sentence
- # punctuation, finish the sentence and reset
- if endchar < textlen and text[endchar] in sentencechars:
- # Don't break for two periods in a row (e.g. ignore "...")
- if endchar + 1 < textlen and text[endchar + 1] in sentencechars:
- continue
- # If the sentence had matches and it's not too long, yield it
- # as a token
- if tks and currentlen <= maxchars:
- yield mkfrag(text, tks, startchar=first, endchar=endchar)
- # Reset the counts
- tks = []
- first = None
- currentlen = 0
- # If we get to the end of the text and there's still a sentence
- # in the buffer, yield it
- if tks:
- yield mkfrag(text, tks, startchar=first, endchar=endchar)
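- # Illustrative sketch (an addition): using SentenceFragmenter on results from
- # a field whose analyzer keeps stop words, e.g.
- # TEXT(analyzer=StandardAnalyzer(stoplist=None), stored=True), as recommended
- # in the docstring above. The `results` argument and "content" field name are
- # assumptions for the example.
- def _example_sentence_fragments(results):
-     results.fragmenter = SentenceFragmenter(maxchars=300)
-     return [hit.highlights("content") for hit in results]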
- class ContextFragmenter(Fragmenter):
- """Looks for matched terms and aggregates them with their surrounding
- context.
- """
- def __init__(self, maxchars=200, surround=20, charlimit=DEFAULT_CHARLIMIT):
- """
- :param maxchars: The maximum number of characters allowed in a
- fragment.
- :param surround: The number of extra characters of context to add both
- before the first matched term and after the last matched term.
- """
- self.maxchars = maxchars
- self.surround = surround
- self.charlimit = charlimit
- def fragment_tokens(self, text, tokens):
- maxchars = self.maxchars
- surround = self.surround
- charlimit = self.charlimit
- # startchar of the first token in the fragment
- first = None
- # Stack of startchars
- firsts = deque()
- # Each time we see a matched token, we reset the countdown to finishing
- # the fragment. This also indicates whether we're currently inside a
- # fragment (< 0 not in fragment, >= 0 in fragment)
- countdown = -1
- # Tokens in current fragment
- tks = []
- endchar = None
- # Number of chars in the current fragment
- currentlen = 0
- for t in tokens:
- startchar = t.startchar
- endchar = t.endchar
- tlength = endchar - startchar
- if charlimit and endchar > charlimit:
- break
- if countdown < 0 and not t.matched:
- # We're not in a fragment currently, so just maintain the
- # "charsbefore" buffer
- firsts.append(startchar)
- while firsts and endchar - firsts[0] > surround:
- firsts.popleft()
- elif currentlen + tlength > maxchars:
- # We're in a fragment, but adding this token would put us past
- # the maximum size. Zero the countdown so the code below will
- # cause the fragment to be emitted
- countdown = 0
- elif t.matched:
- # Start/restart the countdown
- countdown = surround
- # Remember the first char of this fragment
- if first is None:
- if firsts:
- first = firsts[0]
- else:
- first = startchar
- # Add on unused front context
- countdown += surround
- tks.append(t.copy())
- # If we're in a fragment...
- if countdown >= 0:
- # Update the counts
- currentlen += tlength
- countdown -= tlength
- # If the countdown is expired
- if countdown <= 0:
- # Finish the fragment
- yield mkfrag(text, tks, startchar=first, endchar=endchar)
- # Reset the counts
- tks = []
- firsts = deque()
- first = None
- currentlen = 0
- # If there's a fragment left over at the end, yield it
- if tks:
- yield mkfrag(text, tks, startchar=first, endchar=endchar)
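- # Illustrative sketch (an addition): ContextFragmenter is the default
- # fragmenter used by Highlighter; widening `surround` pulls in more context
- # around each match. The `results` argument and "content" field name are
- # assumptions for the example.
- def _example_context_fragments(results):
-     results.fragmenter = ContextFragmenter(maxchars=300, surround=50)
-     return [hit.highlights("content") for hit in results]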
- class PinpointFragmenter(Fragmenter):
- """This is a NON-RETOKENIZING fragmenter. It builds fragments from the
- positions of the matched terms.
- """
- def __init__(self, maxchars=200, surround=20, autotrim=False,
- charlimit=DEFAULT_CHARLIMIT):
- """
- :param maxchars: The maximum number of characters allowed in a
- fragment.
- :param surround: The number of extra characters of context to add both
- before the first matched term and after the last matched term.
- :param autotrim: automatically trims text before the first space and
- after the last space in the fragments, to try to avoid truncated
- words at the start and end. For short fragments or fragments with
- long runs between spaces this may give strange results.
- """
- self.maxchars = maxchars
- self.surround = surround
- self.autotrim = autotrim
- self.charlimit = charlimit
- def must_retokenize(self):
- return False
- def fragment_tokens(self, text, tokens):
- matched = [t for t in tokens if t.matched]
- return self.fragment_matches(text, matched)
- @staticmethod
- def _autotrim(fragment):
- text = fragment.text
- startchar = fragment.startchar
- endchar = fragment.endchar
- firstspace = text.find(" ", startchar, endchar)
- if firstspace > 0:
- startchar = firstspace + 1
- lastspace = text.rfind(" ", startchar, endchar)
- if lastspace > 0:
- endchar = lastspace
- if fragment.matches:
- startchar = min(startchar, fragment.matches[0].startchar)
- endchar = max(endchar, fragment.matches[-1].endchar)
- fragment.startchar = startchar
- fragment.endchar = endchar
- def fragment_matches(self, text, tokens):
- maxchars = self.maxchars
- surround = self.surround
- autotrim = self.autotrim
- charlimit = self.charlimit
- j = -1
- for i, t in enumerate(tokens):
- if j >= i:
- continue
- j = i
- left = t.startchar
- right = t.endchar
- if charlimit and right > charlimit:
- break
- currentlen = right - left
- while j < len(tokens) - 1 and currentlen < maxchars:
- next = tokens[j + 1]
- ec = next.endchar
- if ec - right <= surround and ec - left <= maxchars:
- j += 1
- right = ec
- currentlen += (ec - next.startchar)
- else:
- break
- left = max(0, left - surround)
- right = min(len(text), right + surround)
- fragment = Fragment(text, tokens[i:j + 1], left, right)
- if autotrim:
- self._autotrim(fragment)
- yield fragment
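- # Illustrative sketch (an addition): PinpointFragmenter avoids re-tokenizing,
- # but only when character data is available -- the field must store character
- # offsets (e.g. TEXT(stored=True, chars=True)) and the search must record the
- # matched terms (terms=True). Those schema/search settings, the searcher/query
- # arguments, and the "content" field name are assumptions about the caller's
- # index, not enforced here.
- def _example_pinpoint_fragments(searcher, query):
-     results = searcher.search(query, terms=True)
-     results.fragmenter = PinpointFragmenter(maxchars=200, surround=30,
-                                             autotrim=True)
-     return [hit.highlights("content") for hit in results]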
- # Fragment scorers
- class FragmentScorer(object):
- pass
- class BasicFragmentScorer(FragmentScorer):
- def __call__(self, f):
- # Add up the boosts for the matched terms in this passage
- score = sum(t.boost for t in f.matches)
- # Favor diversity: multiply score by the number of separate
- # terms matched
- score *= (len(f.matched_terms) * 100) or 1
- return score
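- # Illustrative sketch (an addition): a custom scorer is just a callable that
- # takes a Fragment and returns a number, and can be passed to Highlighter
- # (defined later in this module). The weighting and the "content" field name
- # are arbitrary choices for the example.
- class _ExampleTermCountScorer(FragmentScorer):
-     def __call__(self, f):
-         # Rank fragments purely by how many distinct terms they match
-         return len(f.matched_terms)
- def _example_custom_scorer(hit):
-     hi = Highlighter(scorer=_ExampleTermCountScorer())
-     return hi.highlight_hit(hit, "content")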
- # Fragment sorters
- def SCORE(fragment):
- "Sorts higher scored passages first."
- return 1
- def FIRST(fragment):
- "Sorts passages from earlier in the document first."
- return fragment.startchar
- def LONGER(fragment):
- "Sorts longer passages first."
- return 0 - len(fragment)
- def SHORTER(fragment):
- "Sort shorter passages first."
- return len(fragment)
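- # Illustrative sketch (an addition): an order function is passed to
- # Highlighter to control how the selected fragments are arranged. SCORE
- # returns a constant key, so the stable sort in top_fragments() keeps the
- # best-scoring-first order; FIRST re-sorts fragments by position in the
- # document. The "content" field name is an assumption for the example.
- def _example_score_order(hit):
-     hi = Highlighter(order=SCORE, formatter=UppercaseFormatter())
-     return hi.highlight_hit(hit, "content")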
- # Formatters
- def get_text(original, token, replace):
- """Convenience function for getting the text to use for a match when
- formatting.
- If ``replace`` is False, returns the part of ``original`` between
- ``token.startchar`` and ``token.endchar``. If ``replace`` is True, returns
- ``token.text``.
- """
- if replace:
- return token.text
- else:
- return original[token.startchar:token.endchar]
- class Formatter(object):
- """Base class for formatters.
- For highlighters that return strings, it is usually only necessary to
- override :meth:`Formatter.format_token`.
- Use the :func:`get_text` function as a convenience to get the token text::
- class MyFormatter(Formatter):
- def format_token(self, text, token, replace=False):
- ttext = get_text(text, token, replace)
- return "[%s]" % ttext
- """
- between = "..."
- def _text(self, text):
- return text
- def format_token(self, text, token, replace=False):
- """Returns a formatted version of the given "token" object, which
- should have at least ``startchar`` and ``endchar`` attributes, and
- a ``text`` attribute if ``replace`` is True.
- :param text: the original fragment text being highlighted.
- :param token: an object having ``startchar`` and ``endchar`` attributes
- and optionally a ``text`` attribute (if ``replace`` is True).
- :param replace: if True, the original text between the token's
- ``startchar`` and ``endchar`` indices will be replaced with the
- value of the token's ``text`` attribute.
- """
- raise NotImplementedError
- def format_fragment(self, fragment, replace=False):
- """Returns a formatted version of the given text, using the "token"
- objects in the given :class:`Fragment`.
- :param fragment: a :class:`Fragment` object representing a list of
- matches in the text.
- :param replace: if True, the original text corresponding to each
- match will be replaced with the value of the token object's
- ``text`` attribute.
- """
- output = []
- index = fragment.startchar
- text = fragment.text
- for t in fragment.matches:
- if t.startchar is None:
- continue
- if t.startchar < index:
- continue
- if t.startchar > index:
- output.append(self._text(text[index:t.startchar]))
- output.append(self.format_token(text, t, replace))
- index = t.endchar
- output.append(self._text(text[index:fragment.endchar]))
- out_string = "".join(output)
- return out_string
- def format(self, fragments, replace=False):
- """Returns a formatted version of the given text, using a list of
- :class:`Fragment` objects.
- """
- formatted = [self.format_fragment(f, replace=replace)
- for f in fragments]
- return self.between.join(formatted)
- def __call__(self, text, fragments):
- # For backwards compatibility
- return self.format(fragments)
- class NullFormatter(Formatter):
- """Formatter that does not modify the string.
- """
- def format_token(self, text, token, replace=False):
- return get_text(text, token, replace)
- class UppercaseFormatter(Formatter):
- """Returns a string in which the matched terms are in UPPERCASE.
- """
- def __init__(self, between="..."):
- """
- :param between: the text to add between fragments.
- """
- self.between = between
- def format_token(self, text, token, replace=False):
- ttxt = get_text(text, token, replace)
- return ttxt.upper()
- class HtmlFormatter(Formatter):
- """Returns a string containing HTML formatting around the matched terms.
- This formatter wraps matched terms in an HTML element with two class names.
- The first class name (set with the constructor argument ``classname``) is
- the same for each match. The second class name (set with the constructor
- argument ``termclass``) is different depending on which term matched. This
- allows you to give different formatting (for example, different background
- colors) to the different terms in the excerpt.
- >>> hf = HtmlFormatter(tagname="span", classname="match", termclass="term")
- >>> hf(mytext, myfragments)
- "The <span class="match term0">template</span> <span class="match term1">geometry</span> is..."
- This object maintains a dictionary mapping terms to HTML class names (e.g.
- ``term0`` and ``term1`` above), so that multiple excerpts will use the same
- class for the same term. If you want to re-use the same HtmlFormatter
- object with different searches, you should call HtmlFormatter.clean()
- between searches to clear the mapping.
- """
- template = '<%(tag)s class=%(q)s%(cls)s%(tn)s%(q)s>%(t)s</%(tag)s>'
- def __init__(self, tagname="strong", between="...",
- classname="match", termclass="term", maxclasses=5,
- attrquote='"'):
- """
- :param tagname: the tag to wrap around matching terms.
- :param between: the text to add between fragments.
- :param classname: the class name to add to the elements wrapped around
- matching terms.
- :param termclass: the class name prefix for the second class which is
- different for each matched term.
- :param maxclasses: the maximum number of term classes to produce. This
- limits the number of classes you have to define in CSS by recycling
- term class names. For example, if you set maxclasses to 3 and have
- 5 terms, the 5 terms will use the CSS classes ``term0``, ``term1``,
- ``term2``, ``term0``, ``term1``.
- """
- self.between = between
- self.tagname = tagname
- self.classname = classname
- self.termclass = termclass
- self.attrquote = attrquote
- self.maxclasses = maxclasses
- self.seen = {}
- self.htmlclass = " ".join((self.classname, self.termclass))
- def _text(self, text):
- return htmlescape(text, quote=False)
- def format_token(self, text, token, replace=False):
- seen = self.seen
- ttext = self._text(get_text(text, token, replace))
- if ttext in seen:
- termnum = seen[ttext]
- else:
- termnum = len(seen) % self.maxclasses
- seen[ttext] = termnum
- return self.template % {"tag": self.tagname, "q": self.attrquote,
- "cls": self.htmlclass, "t": ttext,
- "tn": termnum}
- def clean(self):
- """Clears the dictionary mapping terms to HTML classnames.
- """
- self.seen = {}
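- # Illustrative sketch (an addition): configuring HtmlFormatter as described in
- # its docstring. The `results` argument and "content" field name are
- # assumptions for the example.
- def _example_html_formatter(results):
-     hf = HtmlFormatter(tagname="span", classname="match", termclass="term",
-                        maxclasses=3)
-     results.formatter = hf
-     fragments = [hit.highlights("content") for hit in results]
-     # Reset the term -> class-name mapping before reusing hf with a new search
-     hf.clean()
-     return fragments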
- class GenshiFormatter(Formatter):
- """Returns a Genshi event stream containing HTML formatting around the
- matched terms.
- """
- def __init__(self, qname="strong", between="..."):
- """
- :param qname: the QName for the tag to wrap around matched terms.
- :param between: the text to add between fragments.
- """
- self.qname = qname
- self.between = between
- from genshi.core import START, END, TEXT # @UnresolvedImport
- from genshi.core import Attrs, Stream # @UnresolvedImport
- self.START, self.END, self.TEXT = START, END, TEXT
- self.Attrs, self.Stream = Attrs, Stream
- def _add_text(self, text, output):
- if output and output[-1][0] == self.TEXT:
- output[-1] = (self.TEXT, output[-1][1] + text, output[-1][2])
- else:
- output.append((self.TEXT, text, (None, -1, -1)))
- def format_token(self, text, token, replace=False):
- qn = self.qname
- txt = get_text(text, token, replace)
- return self.Stream([(self.START, (qn, self.Attrs()), (None, -1, -1)),
- (self.TEXT, txt, (None, -1, -1)),
- (self.END, qn, (None, -1, -1))])
- def format_fragment(self, fragment, replace=False):
- output = []
- index = fragment.startchar
- text = fragment.text
- for t in fragment.matches:
- if t.startchar > index:
- self._add_text(text[index:t.startchar], output)
- # format_token returns a small event stream; add its events to the output
- output.extend(self.format_token(text, t, replace))
- index = t.endchar
- if index < len(text):
- self._add_text(text[index:], output)
- return self.Stream(output)
- def format(self, fragments, replace=False):
- output = []
- first = True
- for fragment in fragments:
- if not first:
- self._add_text(self.between, output)
- output += self.format_fragment(fragment, replace=replace)
- first = False
- return self.Stream(output)
- # Highlighting
- def top_fragments(fragments, count, scorer, order, minscore=1):
- scored_fragments = ((scorer(f), f) for f in fragments)
- scored_fragments = nlargest(count, scored_fragments)
- best_fragments = [sf for score, sf in scored_fragments if score >= minscore]
- best_fragments.sort(key=order)
- return best_fragments
- def highlight(text, terms, analyzer, fragmenter, formatter, top=3,
- scorer=None, minscore=1, order=FIRST, mode="query"):
- if scorer is None:
- scorer = BasicFragmentScorer()
- if type(fragmenter) is type:
- fragmenter = fragmenter()
- if type(formatter) is type:
- formatter = formatter()
- if type(scorer) is type:
- scorer = scorer()
- termset = frozenset(terms)
- tokens = analyzer(text, chars=True, mode=mode, removestops=False)
- tokens = set_matched_filter(tokens, termset)
- fragments = fragmenter.fragment_tokens(text, tokens)
- fragments = top_fragments(fragments, top, scorer, order, minscore)
- return formatter(text, fragments)
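- # Illustrative sketch (an addition): calling the module-level highlight()
- # function directly on a string, without an index. The sample text and terms
- # are made up for the example.
- def _example_direct_highlight():
-     from whoosh.analysis import StandardAnalyzer
-     sample = u"The quick brown fox jumped over the lazy dog"
-     terms = frozenset([u"fox", u"dog"])
-     return highlight(sample, terms, StandardAnalyzer(),
-                      ContextFragmenter(surround=10),
-                      UppercaseFormatter())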
- class Highlighter(object):
- def __init__(self, fragmenter=None, scorer=None, formatter=None,
- always_retokenize=False, order=FIRST):
- self.fragmenter = fragmenter or ContextFragmenter()
- self.scorer = scorer or BasicFragmentScorer()
- self.formatter = formatter or HtmlFormatter(tagname="b")
- self.order = order
- self.always_retokenize = always_retokenize
- def can_load_chars(self, results, fieldname):
- # Is it possible to build a mapping between the matched terms/docs and
- # their start and end chars for "pinpoint" highlighting (i.e. without
- # re-tokenizing the text)?
- if self.always_retokenize:
- # No, we've been configured to always retokenize some text
- return False
- if not results.has_matched_terms():
- # No, we don't know what the matched terms are yet
- return False
- if self.fragmenter.must_retokenize():
- # No, the configured fragmenter doesn't support it
- return False
- # Maybe, if the field was configured to store characters
- field = results.searcher.schema[fieldname]
- return field.supports("characters")
- @staticmethod
- def _load_chars(results, fieldname, texts, to_bytes):
- # For each docnum, create a mapping of text -> [(startchar, endchar)]
- # for the matched terms
- results._char_cache[fieldname] = cache = {}
- sorted_ids = sorted(docnum for _, docnum in results.top_n)
- for docnum in sorted_ids:
- cache[docnum] = {}
- for text in texts:
- btext = to_bytes(text)
- m = results.searcher.postings(fieldname, btext)
- docset = set(results.termdocs[(fieldname, btext)])
- for docnum in sorted_ids:
- if docnum in docset:
- m.skip_to(docnum)
- assert m.id() == docnum
- cache[docnum][text] = m.value_as("characters")
- @staticmethod
- def _merge_matched_tokens(tokens):
- # Merges consecutive matched tokens together, so they are highlighted
- # as one
- token = None
- for t in tokens:
- if not t.matched:
- if token is not None:
- yield token
- token = None
- yield t
- continue
- if token is None:
- token = t.copy()
- elif t.startchar <= token.endchar:
- if t.endchar > token.endchar:
- token.text += t.text[token.endchar-t.endchar:]
- token.endchar = t.endchar
- else:
- yield token
- token = None
- # t was not merged, also has to be yielded
- yield t
- if token is not None:
- yield token
- def highlight_hit(self, hitobj, fieldname, text=None, top=3, minscore=1):
- results = hitobj.results
- schema = results.searcher.schema
- field = schema[fieldname]
- to_bytes = field.to_bytes
- from_bytes = field.from_bytes
- if text is None:
- if fieldname not in hitobj:
- raise KeyError("Field %r is not stored." % fieldname)
- text = hitobj[fieldname]
- # Get the terms searched for/matched in this field
- if results.has_matched_terms():
- bterms = (term for term in results.matched_terms()
- if term[0] == fieldname)
- else:
- bterms = results.query_terms(expand=True, fieldname=fieldname)
- # Convert bytes to unicode
- words = frozenset(from_bytes(term[1]) for term in bterms)
- # If we can do "pinpoint" highlighting...
- if self.can_load_chars(results, fieldname):
- # Build the docnum->[(startchar, endchar),] map
- if fieldname not in results._char_cache:
- self._load_chars(results, fieldname, words, to_bytes)
- hitterms = (from_bytes(term[1]) for term in hitobj.matched_terms()
- if term[0] == fieldname)
- # Grab the word->[(startchar, endchar)] map for this docnum
- cmap = results._char_cache[fieldname][hitobj.docnum]
- # A list of Token objects for matched words
- tokens = []
- charlimit = self.fragmenter.charlimit
- for word in hitterms:
- chars = cmap[word]
- for pos, startchar, endchar in chars:
- if charlimit and endchar > charlimit:
- break
- tokens.append(Token(text=word, pos=pos,
- startchar=startchar, endchar=endchar))
- tokens.sort(key=lambda t: t.startchar)
- tokens = [max(group, key=lambda t: t.endchar - t.startchar)
- for key, group in groupby(tokens, lambda t: t.startchar)]
- fragments = self.fragmenter.fragment_matches(text, tokens)
- else:
- # Retokenize the text
- analyzer = results.searcher.schema[fieldname].analyzer
- tokens = analyzer(text, positions=True, chars=True, mode="index",
- removestops=False)
- # Set Token.matched attribute for tokens that match a query term
- tokens = set_matched_filter(tokens, words)
- tokens = self._merge_matched_tokens(tokens)
- fragments = self.fragmenter.fragment_tokens(text, tokens)
- fragments = top_fragments(fragments, top, self.scorer, self.order,
- minscore=minscore)
- output = self.formatter.format(fragments)
- return output
|