# coding=utf-8
#
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis-python
#
# Most of this work is copyright (C) 2013-2018 David R. MacIver
# (david@drmaciver.com), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at http://mozilla.org/MPL/2.0/.
#
# END HEADER

from __future__ import division, print_function, absolute_import

import enum
import math
from collections import Sequence, OrderedDict

from hypothesis._settings import note_deprecation
from hypothesis.internal.compat import floor, hbytes, bit_length, \
    int_from_bytes
from hypothesis.internal.floats import int_to_float


def integer_range(data, lower, upper, center=None):
    assert lower <= upper
    if lower == upper:
        return int(lower)

    if center is None:
        center = lower
    center = min(max(center, lower), upper)
    if center == upper:
        above = False
    elif center == lower:
        above = True
    else:
        above = boolean(data)

    if above:
        gap = upper - center
    else:
        gap = center - lower

    assert gap > 0

    bits = bit_length(gap)
    probe = gap + 1
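
    # Draw bit_length(gap) bits at a time and reject any probe that
    # overshoots the gap. Rejected draws are wrapped in a discarded example
    # so the shrinker can delete them cleanly.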
    while probe > gap:
        data.start_example()
        probe = data.draw_bits(bits)
        data.stop_example(discard=probe > gap)

    if above:
        result = center + probe
    else:
        result = center - probe

    assert lower <= result <= upper
    return int(result)


def centered_integer_range(data, lower, upper, center):
    return integer_range(
        data, lower, upper, center=center
    )
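
# Illustrative usage (a hypothetical harness, not part of this module): each
# helper here takes a ConjectureData-style `data` object exposing draw_bits /
# draw_bytes / start_example / stop_example, e.g.
#
#   roll = integer_range(data, 1, 6, center=1)  # a d6 that shrinks towards 1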


def check_sample(values):
    try:
        from numpy import ndarray
        if isinstance(values, ndarray):
            if values.ndim != 1:
                note_deprecation((
                    'Only one-dimensional arrays are supported for sampling, '
                    'and the given value has {ndim} dimensions (shape '
                    '{shape}). This array would give samples of array slices '
                    'instead of elements! Use np.ravel(values) to convert '
                    'to a one-dimensional array, or tuple(values) if you '
                    'want to sample slices. Sampling a multi-dimensional '
                    'array will be an error in a future version of Hypothesis.'
                ).format(ndim=values.ndim, shape=values.shape))
            return tuple(values)
    except ImportError:  # pragma: no cover
        pass
    if not isinstance(values, (OrderedDict, Sequence, enum.EnumMeta)):
        note_deprecation(
            ('Cannot sample from %r, not a sequence. ' % (values,)) +
            'Hypothesis goes to some length to ensure that sampling an '
            'element from a collection (with `sampled_from` or `choices`) is '
            'replayable and can be minimised. To replay a saved example, '
            'the sampled values must have the same iteration order on every '
            'run - ruling out sets, dicts, etc due to hash randomisation. '
            'Most cases can simply use `sorted(values)`, but mixed types or '
            'special values such as math.nan require careful handling - and '
            'note that when simplifying an example, Hypothesis treats '
            'earlier values as simpler.')
    return tuple(values)


def choice(data, values):
    return values[integer_range(data, 0, len(values) - 1)]


def getrandbits(data, n):
    n_bytes = n // 8
    if n % 8 != 0:
        n_bytes += 1
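    # Draw ceil(n / 8) bytes, then mask off any excess high bits so the
    # result is uniform over [0, 2 ** n).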
    return int_from_bytes(data.draw_bytes(n_bytes)) & ((1 << n) - 1)
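

# FLOAT_PREFIX clears the sign bit and sets the exponent field to the bias
# (1023), so OR-ing in any 52-bit mantissa yields a float in [1, 2).
# Subtracting 1 maps it into [0, 1), and FULL_FLOAT (the largest value that
# subtraction can produce, 1 - 2 ** -52) rescales the result onto [0, 1].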
FLOAT_PREFIX = 0b1111111111 << 52
FULL_FLOAT = int_to_float(FLOAT_PREFIX | ((2 << 53) - 1)) - 1


def fractional_float(data):
    return (
        int_to_float(FLOAT_PREFIX | getrandbits(data, 52)) - 1
    ) / FULL_FLOAT
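

# Inverse-CDF sampling: for u drawn uniformly from [0, 1),
# floor(log(1 - u) / log(1 - p)) follows a geometric distribution with
# parameter p. The loop below retries in the (rare) case that
# fractional_float returns exactly 1.0, where the formula diverges.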
def geometric(data, p):
    denom = math.log1p(-p)
    data.start_example()
    while True:
        probe = fractional_float(data)
        if probe < 1.0:
            result = int(math.log1p(-probe) / denom)
            assert result >= 0, (probe, p, result)
            data.stop_example()
            return result


def boolean(data):
    return bool(data.draw_bits(1))


def biased_coin(data, p):
    """Return True with probability p (assuming a uniform generator),
    shrinking towards False."""
    data.start_example()
    while True:
        # The logic here is a bit complicated and special cased to make it
        # play better with the shrinker.

        # We imagine partitioning the real interval [0, 1] into 256 equal
        # parts and looking at each part and whether its interior is wholly
        # <= p or wholly >= p. At most one part can be neither.

        # We then pick a random part. If it's wholly on one side or the other
        # of p then we use that as the answer. If p is contained in the
        # interval then we start again with a new probability that is given
        # by the fraction of that interval that was <= our previous p.

        # We then take advantage of the fact that we have control of the
        # labelling to make this shrink better, using the following tricks:

        # If p is <= 0 or >= 1 the result of this coin is certain. We make
        # sure to write a byte to the data stream anyway so that these don't
        # cause difficulties when shrinking.
        if p <= 0:
            data.write(hbytes([0]))
            result = False
        elif p >= 1:
            data.write(hbytes([1]))
            result = True
        else:
            falsey = floor(256 * (1 - p))
            truthy = floor(256 * p)
            remainder = 256 * p - truthy

            if falsey + truthy == 256:
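                # 256 * p is an integer here, so p is an exact multiple of
                # 1/256 and the coin can be decided from bit_length(n) - 1
                # bits, with no remainder and no chance of repeating the loop.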
                m, n = p.as_integer_ratio()
                assert n & (n - 1) == 0, n  # n is a power of 2
                assert n > m > 0
                truthy = m
                falsey = n - m
                bits = bit_length(n) - 1
                partial = False
            else:
                bits = 8
                partial = True

            i = data.draw_bits(bits)

            # We always label the region that causes us to repeat the loop as
            # 255 so that shrinking this byte never causes us to need to draw
            # more data.
            if partial and i == 255:
                p = remainder
                continue
            if falsey == 0:
                # Every other partition is truthy, so the result is true.
                result = True
            elif truthy == 0:
                # Every other partition is falsey, so the result is false.
                result = False
            elif i <= 1:
                # We special case so that zero is always false and 1 is always
                # true which makes shrinking easier because we can always
                # replace a truthy block with 1. This has the slightly weird
                # property that shrinking from 2 to 1 can cause the result to
                # grow, but the shrinker always tries 0 and 1 first anyway, so
                # this will usually be fine.
                result = bool(i)
            else:
                # Originally everything in the region 0 <= i < falsey was
                # false and everything above was true. We swapped one truthy
                # element into this region, so the region becomes
                # 0 <= i <= falsey except for i = 1. We know i > 1 here, so
                # the test for truth becomes i > falsey.
                result = i > falsey
        break
    data.stop_example()
    return result


class Sampler(object):
    """Sampler based on "Succinct Sampling from Discrete Distributions" by
    Bringmann and Larsen. In general it has some advantages and disadvantages
    compared to the more common alias method, but its big advantage for us is
    that it plays well with shrinking: the values are laid out in their
    natural order, so they shrink in that order.

    Its big disadvantage is that for heavily biased distributions it can use
    a lot of memory. The solution is some mix of "don't do that then" and not
    worrying about it, because Hypothesis is something of a memory hog anyway.
    """

    def __init__(self, weights):
        # We consider each weight expressed in terms of the average weight,
        # say t. We write the weight of i as nt + f, where n is an integer
        # and 0 <= f < 1. We then store n items for this weight, which
        # correspond to drawing i unconditionally, and if f > 0 we store an
        # additional item that corresponds to drawing i with probability f.
        # This ensures that (under a uniform model) we draw i with
        # probability proportionate to its weight.

        # We then rearrange things to shrink better. The table with the whole
        # weights is kept in sorted order so that shrinking still corresponds
        # to shrinking leftwards. The fractional weights however are put in
        # a second table that is logically "to the right" of the whole
        # weights and are sorted in order of decreasing acceptance
        # probability. This ensures that shrinking lexicographically always
        # results in drawing less data.
        self.table = []
        self.extras = []
        self.acceptance = []
        total = sum(weights)
        n = len(weights)
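        # For example (illustrative): weights [2, 1, 1] give t = 4 / 3 and
        # scaled weights [1.5, 0.75, 0.75], so table = [0] and extras, once
        # sorted by descending acceptance probability, is [1, 2, 0].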
        for i, x in enumerate(weights):
            # Scale x into units of the average weight t = total / n, then
            # split it into its whole and fractional parts.
            scaled = x * n / total
            whole_occurrences = floor(scaled)
            acceptance = scaled - whole_occurrences
            self.acceptance.append(acceptance)
            for _ in range(whole_occurrences):
                self.table.append(i)
            if acceptance > 0:
                self.extras.append(i)
        self.extras.sort(key=self.acceptance.__getitem__, reverse=True)

    def sample(self, data):
        while True:
            data.start_example()
            # We always draw the acceptance data even if we don't need it,
            # because that way we keep the amount of data we use stable.
            i = integer_range(data, 0, len(self.table) + len(self.extras) - 1)
            if i < len(self.table):
                result = self.table[i]
                data.stop_example()
                return result
            else:
                result = self.extras[i - len(self.table)]
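                # biased_coin shrinks towards False, so accepting via
                # `not biased_coin(data, 1 - f)` still accepts with
                # probability f while making acceptance the outcome the
                # shrinker prefers.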
                accept = not biased_coin(data, 1 - self.acceptance[result])
                data.stop_example()
                if accept:
                    return result


class many(object):
    """Utility class for collections. Bundles up the logic we use for "should
    I keep drawing more values?" and handles starting and stopping examples
    in the right place.

    Intended usage is something like:

        elements = many(data, ...)
        while elements.more():
            add_stuff_to_result()
    """

    def __init__(self, data, min_size, max_size, average_size):
        self.min_size = min_size
        self.max_size = max_size
        self.data = data
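        # Continuing with probability p yields a geometric number of
        # elements with mean p / (1 - p); solving p / (1 - p) = average_size
        # gives p = 1 - 1 / (1 + average_size).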
        self.stopping_value = 1 - 1.0 / (1 + average_size)
        self.count = 0
        self.rejections = 0
        self.drawn = False
        self.force_stop = False
        self.rejected = False

    def more(self):
        """Should I draw another element to add to the collection?"""
        if self.drawn:
            self.data.stop_example(discard=self.rejected)

        self.drawn = True
        self.rejected = False

        if self.min_size == self.max_size:
            should_continue = self.count < self.min_size
        elif self.force_stop:
            should_continue = False
        else:
            if self.count < self.min_size:
                p_continue = 1.0
            elif self.count >= self.max_size:
                p_continue = 0.0
            else:
                p_continue = self.stopping_value
            should_continue = biased_coin(self.data, p_continue)

        if should_continue:
            self.data.start_example()
            self.count += 1
            return True
        else:
            return False

    def reject(self):
        """Reject the last example (i.e. don't count it towards our budget
        of elements because it's not going to go in the final collection)."""
        assert self.count > 0
        self.count -= 1
        self.rejections += 1
        self.rejected = True
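        # If we've rejected more than twice as many elements as we've kept,
        # this generation looks unproductive: give up entirely if we can't
        # yet reach min_size, otherwise stop the collection where it is.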
        if self.rejections > 2 * self.count:
            if self.count < self.min_size:
                self.data.mark_invalid()
            else:
                self.force_stop = True