charmap.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. # coding=utf-8
  2. #
  3. # This file is part of Hypothesis, which may be found at
  4. # https://github.com/HypothesisWorks/hypothesis-python
  5. #
  6. # Most of this work is copyright (C) 2013-2018 David R. MacIver
  7. # (david@drmaciver.com), but it contains contributions by others. See
  8. # CONTRIBUTING.rst for a full list of people who may hold copyright, and
  9. # consult the git log if you need to determine who owns an individual
  10. # contribution.
  11. #
  12. # This Source Code Form is subject to the terms of the Mozilla Public License,
  13. # v. 2.0. If a copy of the MPL was not distributed with this file, You can
  14. # obtain one at http://mozilla.org/MPL/2.0/.
  15. #
  16. # END HEADER
  17. from __future__ import division, print_function, absolute_import
  18. import os
  19. import sys
  20. import gzip
  21. import pickle
  22. import tempfile
  23. import unicodedata
  24. from hypothesis.configuration import tmpdir, storage_directory
  25. from hypothesis.internal.compat import hunichr
  26. def charmap_file():
  27. return os.path.join(
  28. storage_directory('unicodedata', unicodedata.unidata_version),
  29. 'charmap.pickle.gz'
  30. )
  31. _charmap = None
  32. def charmap():
  33. """Return a dict that maps a Unicode category, to a tuple of 2-tuples
  34. covering the codepoint intervals for characters in that category.
  35. >>> charmap()['Co']
  36. ((57344, 63743), (983040, 1048573), (1048576, 1114109))
  37. """
  38. global _charmap
  39. # Best-effort caching in the face of missing files and/or unwritable
  40. # filesystems is fairly simple: check if loaded, else try loading,
  41. # else calculate and try writing the cache.
  42. if _charmap is None:
  43. f = charmap_file()
  44. try:
  45. with gzip.GzipFile(f, 'rb') as i:
  46. _charmap = dict(pickle.load(i))
  47. except Exception:
  48. tmp_charmap = {}
  49. for i in range(0, sys.maxunicode + 1):
  50. cat = unicodedata.category(hunichr(i))
  51. rs = tmp_charmap.setdefault(cat, [])
  52. if rs and rs[-1][-1] == i - 1:
  53. rs[-1][-1] += 1
  54. else:
  55. rs.append([i, i])
  56. _charmap = {k: tuple(tuple(pair) for pair in pairs)
  57. for k, pairs in tmp_charmap.items()}
  58. try:
  59. # Write the Unicode table atomically
  60. fd, tmpfile = tempfile.mkstemp(dir=tmpdir())
  61. os.close(fd)
  62. # Explicitly set the mtime to get reproducible output
  63. with gzip.GzipFile(tmpfile, 'wb', mtime=1) as o:
  64. pickle.dump(sorted(_charmap.items()), o,
  65. pickle.HIGHEST_PROTOCOL)
  66. os.rename(tmpfile, f)
  67. except Exception: # pragma: no cover
  68. pass
  69. assert _charmap is not None
  70. return _charmap
  71. _categories = None
  72. def categories():
  73. """Return a tuple of Unicode categories in a normalised order.
  74. >>> categories() # doctest: +ELLIPSIS
  75. ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
  76. """
  77. global _categories
  78. if _categories is None:
  79. cm = charmap()
  80. _categories = sorted(
  81. cm.keys(), key=lambda c: len(cm[c])
  82. )
  83. _categories.remove('Cc') # Other, Control
  84. _categories.remove('Cs') # Other, Surrogate
  85. _categories.append('Cc')
  86. _categories.append('Cs')
  87. return tuple(_categories)
  88. def _union_intervals(x, y):
  89. """Merge two sequences of intervals into a single tuple of intervals.
  90. Any integer bounded by `x` or `y` is also bounded by the result.
  91. >>> _union_intervals([(3, 10)], [(1, 2), (5, 17)])
  92. ((1, 17),)
  93. """
  94. if not x:
  95. return tuple((u, v) for u, v in y)
  96. if not y:
  97. return tuple((u, v) for u, v in x)
  98. intervals = sorted(x + y, reverse=True)
  99. result = [intervals.pop()]
  100. while intervals:
  101. # 1. intervals is in descending order
  102. # 2. pop() takes from the RHS.
  103. # 3. (a, b) was popped 1st, then (u, v) was popped 2nd
  104. # 4. Therefore: a <= u
  105. # 5. We assume that u <= v and a <= b
  106. # 6. So we need to handle 2 cases of overlap, and one disjoint case
  107. # | u--v | u----v | u--v |
  108. # | a----b | a--b | a--b |
  109. u, v = intervals.pop()
  110. a, b = result[-1]
  111. if u <= b + 1:
  112. # Overlap cases
  113. result[-1] = (a, max(v, b))
  114. else:
  115. # Disjoint case
  116. result.append((u, v))
  117. return tuple(result)
  118. def _subtract_intervals(x, y):
  119. """Set difference for lists of intervals. That is, returns a list of
  120. intervals that bounds all values bounded by x that are not also bounded by
  121. y. x and y are expected to be in sorted order.
  122. For example _subtract_intervals([(1, 10)], [(2, 3), (9, 15)]) would
  123. return [(1, 1), (4, 8)], removing the values 2, 3, 9 and 10 from the
  124. interval.
  125. """
  126. if not y:
  127. return tuple(x)
  128. x = list(map(list, x))
  129. i = 0
  130. j = 0
  131. result = []
  132. while i < len(x) and j < len(y):
  133. # Iterate in parallel over x and y. j stays pointing at the smallest
  134. # interval in the left hand side that could still overlap with some
  135. # element of x at index >= i.
  136. # Similarly, i is not incremented until we know that it does not
  137. # overlap with any element of y at index >= j.
  138. xl, xr = x[i]
  139. assert xl <= xr
  140. yl, yr = y[j]
  141. assert yl <= yr
  142. if yr < xl:
  143. # The interval at y[j] is strictly to the left of the interval at
  144. # x[i], so will not overlap with it or any later interval of x.
  145. j += 1
  146. elif yl > xr:
  147. # The interval at y[j] is strictly to the right of the interval at
  148. # x[i], so all of x[i] goes into the result as no further intervals
  149. # in y will intersect it.
  150. result.append(x[i])
  151. i += 1
  152. elif yl <= xl:
  153. if yr >= xr:
  154. # x[i] is contained entirely in y[j], so we just skip over it
  155. # without adding it to the result.
  156. i += 1
  157. else:
  158. # The beginning of x[i] is contained in y[j], so we update the
  159. # left endpoint of x[i] to remove this, and increment j as we
  160. # now have moved past it. Note that this is not added to the
  161. # result as is, as more intervals from y may intersect it so it
  162. # may need updating further.
  163. x[i][0] = yr + 1
  164. j += 1
  165. else:
  166. # yl > xl, so the left hand part of x[i] is not contained in y[j],
  167. # so there are some values we should add to the result.
  168. result.append((xl, yl - 1))
  169. if yr + 1 <= xr:
  170. # If y[j] finishes before x[i] does, there may be some values
  171. # in x[i] left that should go in the result (or they may be
  172. # removed by a later interval in y), so we update x[i] to
  173. # reflect that and increment j because it no longer overlaps
  174. # with any remaining element of x.
  175. x[i][0] = yr + 1
  176. j += 1
  177. else:
  178. # Every element of x[i] other than the initial part we have
  179. # already added is contained in y[j], so we move to the next
  180. # interval.
  181. i += 1
  182. # Any remaining intervals in x do not overlap with any of y, as if they did
  183. # we would not have incremented j to the end, so can be added to the result
  184. # as they are.
  185. result.extend(x[i:])
  186. return tuple(map(tuple, result))
  187. def _intervals(s):
  188. """Return a tuple of intervals, covering the codepoints of characters in
  189. `s`.
  190. >>> _intervals('abcdef0123456789')
  191. ((48, 57), (97, 102))
  192. """
  193. intervals = tuple((ord(c), ord(c)) for c in sorted(s))
  194. return _union_intervals(intervals, intervals)
  195. category_index_cache = {
  196. (): (),
  197. }
  198. def _category_key(exclude, include):
  199. """Return a normalised tuple of all Unicode categories that are in
  200. `include`, but not in `exclude`.
  201. If include is None then default to including all categories.
  202. Any item in include that is not a unicode character will be excluded.
  203. >>> _category_key(exclude=['So'], include=['Lu', 'Me', 'Cs', 'So', 'Xx'])
  204. ('Me', 'Lu', 'Cs')
  205. """
  206. cs = categories()
  207. if include is None:
  208. include = set(cs)
  209. else:
  210. include = set(include)
  211. exclude = set(exclude or ())
  212. include -= exclude
  213. result = tuple(c for c in cs if c in include)
  214. return result
  215. def _query_for_key(key):
  216. """Return a tuple of codepoint intervals covering characters that match one
  217. or more categories in the tuple of categories `key`.
  218. >>> _query_for_key(categories())
  219. ((0, 1114111),)
  220. >>> _query_for_key(('Zl', 'Zp', 'Co'))
  221. ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
  222. """
  223. try:
  224. return category_index_cache[key]
  225. except KeyError:
  226. pass
  227. assert key
  228. if set(key) == set(categories()):
  229. result = ((0, sys.maxunicode),)
  230. else:
  231. result = _union_intervals(
  232. _query_for_key(key[:-1]), charmap()[key[-1]]
  233. )
  234. category_index_cache[key] = result
  235. return result
  236. limited_category_index_cache = {}
  237. def query(
  238. exclude_categories=(), include_categories=None,
  239. min_codepoint=None,
  240. max_codepoint=None,
  241. include_characters='',
  242. exclude_characters='',
  243. ):
  244. """Return a tuple of intervals covering the codepoints for all characters
  245. that meet the critera (min_codepoint <= codepoint(c) <= max_codepoint and
  246. any(cat in include_categories for cat in categories(c)) and all(cat not in
  247. exclude_categories for cat in categories(c)) or (c in include_characters)
  248. >>> query()
  249. ((0, 1114111),)
  250. >>> query(min_codepoint=0, max_codepoint=128)
  251. ((0, 128),)
  252. >>> query(min_codepoint=0, max_codepoint=128, include_categories=['Lu'])
  253. ((65, 90),)
  254. >>> query(min_codepoint=0, max_codepoint=128, include_categories=['Lu'],
  255. ... include_characters=u'☃')
  256. ((65, 90), (9731, 9731))
  257. """
  258. if min_codepoint is None:
  259. min_codepoint = 0
  260. if max_codepoint is None:
  261. max_codepoint = sys.maxunicode
  262. catkey = _category_key(exclude_categories, include_categories)
  263. character_intervals = _intervals(include_characters or '')
  264. exclude_intervals = _intervals(exclude_characters or '')
  265. qkey = (
  266. catkey, min_codepoint, max_codepoint,
  267. character_intervals, exclude_intervals
  268. )
  269. try:
  270. return limited_category_index_cache[qkey]
  271. except KeyError:
  272. pass
  273. base = _query_for_key(catkey)
  274. result = []
  275. for u, v in base:
  276. if v >= min_codepoint and u <= max_codepoint:
  277. result.append((
  278. max(u, min_codepoint), min(v, max_codepoint)
  279. ))
  280. result = tuple(result)
  281. result = _union_intervals(result, character_intervals)
  282. result = _subtract_intervals(result, exclude_intervals)
  283. limited_category_index_cache[qkey] = result
  284. return result