cache.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228
  1. # coding=utf-8
  2. #
  3. # This file is part of Hypothesis, which may be found at
  4. # https://github.com/HypothesisWorks/hypothesis-python
  5. #
  6. # Most of this work is copyright (C) 2013-2018 David R. MacIver
  7. # (david@drmaciver.com), but it contains contributions by others. See
  8. # CONTRIBUTING.rst for a full list of people who may hold copyright, and
  9. # consult the git log if you need to determine who owns an individual
  10. # contribution.
  11. #
  12. # This Source Code Form is subject to the terms of the Mozilla Public License,
  13. # v. 2.0. If a copy of the MPL was not distributed with this file, You can
  14. # obtain one at http://mozilla.org/MPL/2.0/.
  15. #
  16. # END HEADER
  17. from __future__ import division, print_function, absolute_import
  18. import attr
  19. @attr.s(slots=True)
  20. class Entry(object):
  21. key = attr.ib()
  22. value = attr.ib()
  23. score = attr.ib()
  24. class GenericCache(object):
  25. """Generic supertype for cache implementations.
  26. Defines a dict-like mapping with a maximum size, where as well as mapping
  27. to a value, each key also maps to a score. When a write would cause the
  28. dict to exceed its maximum size, it first evicts the existing key with
  29. the smallest score, then adds the new key to the map.
  30. A key has the following lifecycle:
  31. 1. key is written for the first time, the key is given the score
  32. self.new_entry(key, value)
  33. 2. whenever an existing key is read or written, self.on_access(key, value,
  34. score) is called. This returns a new score for the key.
  35. 3. When a key is evicted, self.on_evict(key, value, score) is called.
  36. The cache will be in a valid state in all of these cases.
  37. Implementations are expected to implement new_entry and optionally
  38. on_access and on_evict to implement a specific scoring strategy.
  39. """
  40. __slots__ = ('keys_to_indices', 'data', 'max_size')
  41. def __init__(self, max_size):
  42. self.max_size = max_size
  43. # Implementation: We store a binary heap of Entry objects in self.data,
  44. # with the heap property requiring that a parent's score is <= that of
  45. # its children. keys_to_index then maps keys to their index in the
  46. # heap. We keep these two in sync automatically - the heap is never
  47. # reordered without updating the index.
  48. self.keys_to_indices = {}
  49. self.data = []
  50. def __len__(self):
  51. assert len(self.keys_to_indices) == len(self.data)
  52. return len(self.data)
  53. def __getitem__(self, key):
  54. i = self.keys_to_indices[key]
  55. result = self.data[i]
  56. self.on_access(result.key, result.value, result.score)
  57. self.__balance(i)
  58. return result.value
  59. def __setitem__(self, key, value):
  60. if self.max_size == 0:
  61. return
  62. evicted = None
  63. try:
  64. i = self.keys_to_indices[key]
  65. except KeyError:
  66. entry = Entry(key, value, self.new_entry(key, value))
  67. if len(self.data) >= self.max_size:
  68. evicted = self.data[0]
  69. del self.keys_to_indices[evicted.key]
  70. i = 0
  71. self.data[0] = entry
  72. else:
  73. i = len(self.data)
  74. self.data.append(entry)
  75. self.keys_to_indices[key] = i
  76. else:
  77. entry = self.data[i]
  78. assert entry.key == key
  79. entry.value = value
  80. entry.score = self.on_access(entry.key, entry.value, entry.score)
  81. self.__balance(i)
  82. if evicted is not None:
  83. if self.data[0] is not entry:
  84. assert evicted.score <= self.data[0].score
  85. self.on_evict(evicted.key, evicted.value, evicted.score)
  86. def clear(self):
  87. del self.data[:]
  88. self.keys_to_indices.clear()
  89. def __repr__(self):
  90. return '{%s}' % (', '.join(
  91. '%r: %r' % (e.key, e.value) for e in self.data),)
  92. def new_entry(self, key, value):
  93. """Called when a key is written that does not currently appear in the
  94. map.
  95. Returns the score to associate with the key.
  96. """
  97. raise NotImplementedError()
  98. def on_access(self, key, value, score):
  99. """Called every time a key that is already in the map is read or
  100. written.
  101. Returns the new score for the key.
  102. """
  103. return score
  104. def on_evict(self, key, value, score):
  105. """Called after a key has been evicted, with the score it had had at
  106. the point of eviction."""
  107. pass
  108. def check_valid(self):
  109. """Debugging method for use in tests.
  110. Asserts that all of the cache's invariants hold. When everything
  111. is working correctly this should be an expensive no-op.
  112. """
  113. for i, e in enumerate(self.data):
  114. assert self.keys_to_indices[e.key] == i
  115. for j in [i * 2 + 1, i * 2 + 2]:
  116. if j < len(self.data):
  117. assert e.score <= self.data[j].score, self.data
  118. def __swap(self, i, j):
  119. assert i < j
  120. assert self.data[j].score < self.data[i].score
  121. self.data[i], self.data[j] = self.data[j], self.data[i]
  122. self.keys_to_indices[self.data[i].key] = i
  123. self.keys_to_indices[self.data[j].key] = j
  124. def __balance(self, i):
  125. """When we have made a modification to the heap such that means that
  126. the heap property has been violated locally around i but previously
  127. held for all other indexes (and no other values have been modified),
  128. this fixes the heap so that the heap property holds everywhere."""
  129. while i > 0:
  130. parent = (i - 1) // 2
  131. if self.__out_of_order(parent, i):
  132. self.__swap(parent, i)
  133. i = parent
  134. else:
  135. break
  136. while True:
  137. children = [
  138. j for j in (2 * i + 1, 2 * i + 2)
  139. if j < len(self.data)
  140. ]
  141. if len(children) == 2:
  142. children.sort(key=lambda j: self.data[j].score)
  143. for j in children:
  144. if self.__out_of_order(i, j):
  145. self.__swap(i, j)
  146. i = j
  147. break
  148. else:
  149. break
  150. def __out_of_order(self, i, j):
  151. """Returns True if the indices i, j are in the wrong order.
  152. i must be the parent of j.
  153. """
  154. assert i == (j - 1) // 2
  155. return self.data[j].score < self.data[i].score
  156. class LRUReusedCache(GenericCache):
  157. """The only concrete implementation of GenericCache we use outside of tests
  158. currently.
  159. Adopts a modified least-frequently used eviction policy: It evicts the key
  160. that has been used least recently, but it will always preferentially evict
  161. keys that have only ever been accessed once. Among keys that have been
  162. accessed more than once, it ignores the number of accesses.
  163. This retains most of the benefits of an LRU cache, but adds an element of
  164. scan-resistance to the process: If we end up scanning through a large
  165. number of keys without reusing them, this does not evict the existing
  166. entries in preference for the new ones.
  167. """
  168. __slots__ = ('__tick',)
  169. def __init__(self, max_size, ):
  170. super(LRUReusedCache, self).__init__(max_size)
  171. self.__tick = 0
  172. def tick(self):
  173. self.__tick += 1
  174. return self.__tick
  175. def new_entry(self, key, value):
  176. return [1, self.tick()]
  177. def on_access(self, key, value, score):
  178. score[0] = 2
  179. score[1] = self.tick()
  180. return score