text.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. # Copyright 2007 Matt Chaput. All rights reserved.
  2. #
  3. # Redistribution and use in source and binary forms, with or without
  4. # modification, are permitted provided that the following conditions are met:
  5. #
  6. # 1. Redistributions of source code must retain the above copyright notice,
  7. # this list of conditions and the following disclaimer.
  8. #
  9. # 2. Redistributions in binary form must reproduce the above copyright
  10. # notice, this list of conditions and the following disclaimer in the
  11. # documentation and/or other materials provided with the distribution.
  12. #
  13. # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
  14. # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  15. # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
  16. # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  17. # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  18. # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
  19. # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  20. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  21. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
  22. # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  23. #
  24. # The views and conclusions contained in the software and documentation are
  25. # those of the authors and should not be interpreted as representing official
  26. # policies, either expressed or implied, of Matt Chaput.
  27. import codecs, re
  28. from whoosh.compat import string_type, u, byte
  29. # Note: these functions return a tuple of (text, length), so when you call
  30. # them, you have to add [0] on the end, e.g. str = utf8encode(unicode)[0]
  31. utf8encode = codecs.getencoder("utf-8")
  32. utf8decode = codecs.getdecoder("utf-8")
  33. # Prefix encoding functions
  34. def first_diff(a, b):
  35. """
  36. Returns the position of the first differing character in the sequences a
  37. and b. For example, first_diff('render', 'rending') == 4. This function
  38. limits the return value to 255 so the difference can be encoded in a single
  39. byte.
  40. """
  41. i = 0
  42. while i <= 255 and i < len(a) and i < len(b) and a[i] == b[i]:
  43. i += 1
  44. return i
  45. def prefix_encode(a, b):
  46. """
  47. Compresses bytestring b as a byte representing the prefix it shares with a,
  48. followed by the suffix bytes.
  49. """
  50. i = first_diff(a, b)
  51. return byte(i) + b[i:]
  52. def prefix_encode_all(ls):
  53. """Compresses the given list of (unicode) strings by storing each string
  54. (except the first one) as an integer (encoded in a byte) representing
  55. the prefix it shares with its predecessor, followed by the suffix encoded
  56. as UTF-8.
  57. """
  58. last = u('')
  59. for w in ls:
  60. i = first_diff(last, w)
  61. yield chr(i) + w[i:].encode("utf-8")
  62. last = w
  63. def prefix_decode_all(ls):
  64. """Decompresses a list of strings compressed by prefix_encode().
  65. """
  66. last = u('')
  67. for w in ls:
  68. i = ord(w[0])
  69. decoded = last[:i] + w[1:].decode("utf-8")
  70. yield decoded
  71. last = decoded
  72. # Natural key sorting function
  73. _nkre = re.compile(r"\D+|\d+", re.UNICODE)
  74. def _nkconv(i):
  75. try:
  76. return int(i)
  77. except ValueError:
  78. return i.lower()
  79. def natural_key(s):
  80. """Converts string ``s`` into a tuple that will sort "naturally" (i.e.,
  81. ``name5`` will come before ``name10`` and ``1`` will come before ``A``).
  82. This function is designed to be used as the ``key`` argument to sorting
  83. functions.
  84. :param s: the str/unicode string to convert.
  85. :rtype: tuple
  86. """
  87. # Use _nkre to split the input string into a sequence of
  88. # digit runs and non-digit runs. Then use _nkconv() to convert
  89. # the digit runs into ints and the non-digit runs to lowercase.
  90. return tuple(_nkconv(m) for m in _nkre.findall(s))
  91. # Regular expression functions
  92. def rcompile(pattern, flags=0, verbose=False):
  93. """A wrapper for re.compile that checks whether "pattern" is a regex object
  94. or a string to be compiled, and automatically adds the re.UNICODE flag.
  95. """
  96. if not isinstance(pattern, string_type):
  97. # If it's not a string, assume it's already a compiled pattern
  98. return pattern
  99. if verbose:
  100. flags |= re.VERBOSE
  101. return re.compile(pattern, re.UNICODE | flags)