acore.py

# Copyright 2007 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

from whoosh.compat import iteritems


# Exceptions

class CompositionError(Exception):
    pass


# Utility functions

def unstopped(tokenstream):
    """Removes tokens from a token stream where token.stopped = True.
    """

    return (t for t in tokenstream if not t.stopped)
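

# A short illustrative sketch (not part of the original module): unstopped()
# is typically applied to a stream in which a stop filter has marked tokens
# as stopped rather than removing them (e.g. when removestops is False).
#
#     >>> tokens = [Token(), Token(), Token()]
#     >>> tokens[1].stopped = True
#     >>> len(list(unstopped(iter(tokens))))
#     2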


def entoken(textstream, positions=False, chars=False, start_pos=0,
            start_char=0, **kwargs):
    """Takes a sequence of unicode strings and yields a series of Token
    objects (actually the same Token object over and over, for performance
    reasons), with the attributes filled in with reasonable values: if
    ``positions`` is True the tokens are numbered consecutively, and if
    ``chars`` is True the character offsets are computed as if the tokens
    were adjacent in the source string (with no separators between them).
    """

    pos = start_pos
    char = start_char
    t = Token(positions=positions, chars=chars, **kwargs)

    for text in textstream:
        t.text = text

        if positions:
            t.pos = pos
            pos += 1

        if chars:
            t.startchar = char
            char = char + len(text)
            t.endchar = char

        yield t
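

# An illustrative sketch (not in the original source). Because the same Token
# instance is re-yielded, attributes must be read before the generator is
# advanced; copying the values out on each iteration, as below, is safe.
#
#     >>> gen = entoken(["alfa", "bravo"], positions=True, chars=True)
#     >>> [(t.text, t.pos, t.startchar, t.endchar) for t in gen]
#     [('alfa', 0, 0, 4), ('bravo', 1, 4, 9)]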


# Token object

class Token(object):
    """
    Represents a "token" (usually a word) extracted from the source text being
    indexed.

    See "Advanced analysis" in the user guide for more information.

    Because object instantiation in Python is slow, tokenizers should create
    ONE SINGLE Token object and YIELD IT OVER AND OVER, changing the
    attributes each time.

    This trick means that consumers of tokens (i.e. filters) must never try to
    hold onto the token object between loop iterations, or convert the token
    generator into a list. Instead, save the attributes between iterations,
    not the object::

        def RemoveDuplicatesFilter(self, stream):
            # Removes duplicate words.
            lasttext = None
            for token in stream:
                # Only yield the token if its text doesn't
                # match the previous token.
                if lasttext != token.text:
                    yield token
                lasttext = token.text

    ...or, call token.copy() to get a copy of the token object.
    """

    def __init__(self, positions=False, chars=False, removestops=True,
                 mode='', **kwargs):
        """
        :param positions: Whether tokens should have the token position in
            the 'pos' attribute.
        :param chars: Whether tokens should have character offsets in the
            'startchar' and 'endchar' attributes.
        :param removestops: Whether to remove stop words from the stream (if
            the tokens pass through a stop filter).
        :param mode: Contains a string describing the purpose for which the
            analyzer is being called, e.g. 'index' or 'query'.
        """

        self.positions = positions
        self.chars = chars
        self.stopped = False
        self.boost = 1.0
        self.removestops = removestops
        self.mode = mode
        self.__dict__.update(kwargs)

    def __repr__(self):
        parms = ", ".join("%s=%r" % (name, value)
                          for name, value in iteritems(self.__dict__))
        return "%s(%s)" % (self.__class__.__name__, parms)

    def copy(self):
        # This is faster than using the copy module
        return Token(**self.__dict__)
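

# An illustrative sketch (not in the original source) of why copy() matters:
# a filter that buffers tokens, for example to look ahead in the stream, must
# store copies, since the tokenizer keeps mutating one shared Token instance.
# BufferedFilter is a hypothetical name, not part of whoosh.analysis.
#
#     def BufferedFilter(stream):
#         buffered = [t.copy() for t in stream]  # copies, not the live token
#         for t in buffered:
#             yield t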


# Composition support

class Composable(object):
    is_morph = False

    def __or__(self, other):
        from whoosh.analysis.analyzers import CompositeAnalyzer

        if not isinstance(other, Composable):
            raise TypeError("%r is not composable with %r" % (self, other))
        return CompositeAnalyzer(self, other)

    def __repr__(self):
        attrs = ""
        if self.__dict__:
            attrs = ", ".join("%s=%r" % (key, value)
                              for key, value
                              in iteritems(self.__dict__))
        return self.__class__.__name__ + "(%s)" % attrs

    def has_morph(self):
        return self.is_morph
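

# An illustrative sketch (not in the original source): the tokenizers and
# filters in whoosh.analysis inherit from Composable, so __or__ lets you
# chain them into a single analyzer with the | operator.
#
#     >>> from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
#     >>> analyzer = RegexTokenizer() | LowercaseFilter() | StopFilter()
#     >>> [t.text for t in analyzer(u"The QUICK brown fox")]
#     ['quick', 'brown', 'fox']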