# httpcache.py
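"""HTTP cache policies and storage backends used by Scrapy's HttpCacheMiddleware.

DummyPolicy and RFC2616Policy decide whether a request/response pair is cached
and whether a cached copy is still fresh, while the storage classes
(DbmCacheStorage, FilesystemCacheStorage, LeveldbCacheStorage) decide where
cached responses live.

A minimal usage sketch, assuming the standard Scrapy setting names and module
path (check the Scrapy docs for your version), placed in a project's
settings.py::

    HTTPCACHE_ENABLED = True
    HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
    HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    HTTPCACHE_EXPIRATION_SECS = 0   # 0 means cached entries never expire
    HTTPCACHE_GZIP = True           # gzip-compress filesystem cache files
"""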

from __future__ import print_function
import gzip
import logging
import os
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from warnings import warn
from weakref import WeakKeyDictionary

from six.moves import cPickle as pickle
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode, garbage_collect
from scrapy.utils.request import request_fingerprint

logger = logging.getLogger(__name__)


class DummyPolicy(object):
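    """Cache policy that stores and reuses every response, ignoring HTTP
    cache headers. Only the HTTPCACHE_IGNORE_SCHEMES and
    HTTPCACHE_IGNORE_HTTP_CODES settings are taken into account.
    """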
    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, cachedresponse, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True


class RFC2616Policy(object):
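    """Cache policy that follows the RFC 2616 (HTTP/1.1) caching rules:
    it honours Cache-Control and Expires, computes freshness lifetime and
    current age, and adds conditional validators (If-Modified-Since /
    If-None-Match) to requests whose cached response has gone stale.
    """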
    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_response_cache_controls = [
            to_bytes(cc) for cc in
            settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
        self._cc_parsed = WeakKeyDictionary()

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get(b'Cache-Control', b'')
            parsed = parse_cachecontrol(cch)
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Anything else is eligible for caching
        return True
    def should_cache_response(self, response, request):
        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
        # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
        # Status code 206 is not included because the cache cannot deal with partial contents
        cc = self._parse_cachecontrol(response)
        # obey directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Never cache 304 (Not Modified) responses
        elif response.status == 304:
            return False
        # Cache unconditionally if configured to do so
        elif self.always_store:
            return True
        # Any hint on response expiration is good
        elif b'max-age' in cc or b'Expires' in response.headers:
            return True
        # Firefox falls back to a one-year expiration for these statuses if none is set
        elif response.status in (300, 301, 308):
            return True
        # Other statuses without expiration info require at least one validator
        elif response.status in (200, 203, 401):
            return b'Last-Modified' in response.headers or b'ETag' in response.headers
        # Anything else is probably not eligible for caching:
        # it makes no sense to cache responses that carry no expiration
        # info and cannot be revalidated
        else:
            return False
    def is_cached_response_fresh(self, cachedresponse, request):
        cc = self._parse_cachecontrol(cachedresponse)
        ccreq = self._parse_cachecontrol(request)
        if b'no-cache' in cc or b'no-cache' in ccreq:
            return False

        now = time()
        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
        currentage = self._compute_current_age(cachedresponse, request, now)

        reqmaxage = self._get_max_age(ccreq)
        if reqmaxage is not None:
            freshnesslifetime = min(freshnesslifetime, reqmaxage)

        if currentage < freshnesslifetime:
            return True

        if b'max-stale' in ccreq and b'must-revalidate' not in cc:
            # From RFC2616: "Indicates that the client is willing to
            # accept a response that has exceeded its expiration time.
            # If max-stale is assigned a value, then the client is
            # willing to accept a response that has exceeded its
            # expiration time by no more than the specified number of
            # seconds. If no value is assigned to max-stale, then the
            # client is willing to accept a stale response of any age."
            staleage = ccreq[b'max-stale']
            if staleage is None:
                return True

            try:
                if currentage < freshnesslifetime + max(0, int(staleage)):
                    return True
            except ValueError:
                pass

        # Cached response is stale, try to set validators if any
        self._set_conditional_validators(request, cachedresponse)
        return False
    def is_cached_response_valid(self, cachedresponse, response, request):
        # Use the cached response if the new response is a server error,
        # as long as the old response didn't specify must-revalidate.
        if response.status >= 500:
            cc = self._parse_cachecontrol(cachedresponse)
            if b'must-revalidate' not in cc:
                return True

        # Use the cached response if the server says it hasn't changed.
        return response.status == 304

    def _set_conditional_validators(self, request, cachedresponse):
        if b'Last-Modified' in cachedresponse.headers:
            request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']

        if b'ETag' in cachedresponse.headers:
            request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']
    def _get_max_age(self, cc):
        try:
            return max(0, int(cc[b'max-age']))
        except (KeyError, ValueError):
            return None

    def _compute_freshness_lifetime(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeFreshnessLifetime
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
        cc = self._parse_cachecontrol(response)
        maxage = self._get_max_age(cc)
        if maxage is not None:
            return maxage

        # Parse the Date header or synthesize it if none exists
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now

        # Try the HTTP/1.0 Expires header
        if b'Expires' in response.headers:
            expires = rfc1123_to_epoch(response.headers[b'Expires'])
            # When parsing the Expires header fails, RFC 2616 section 14.21
            # says we should treat it as an expiration time in the past.
            return max(0, expires - date) if expires else 0

        # Fall back to a heuristic using the Last-Modified header.
        # This is not in the RFC, but it is what Firefox's caching implementation does.
        lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
        if lastmodified and lastmodified <= date:
            return (date - lastmodified) / 10

        # These responses can be cached indefinitely
        if response.status in (300, 301, 308):
            return self.MAXAGE

        # Insufficient information to compute the freshness lifetime
        return 0
    def _compute_current_age(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeCurrentAge
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
        currentage = 0
        # If the Date header is not set we assume it is a fast connection, and
        # the clock is in sync with the server
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
        if now > date:
            currentage = now - date

        if b'Age' in response.headers:
            try:
                age = int(response.headers[b'Age'])
                currentage = max(currentage, age)
            except ValueError:
                pass

        return currentage


class DbmCacheStorage(object):
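    """Cache storage backend that keeps responses in a DBM database, one
    database file per spider, using the module named by the
    HTTPCACHE_DBM_MODULE setting.
    """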
    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, '%s.db' % spider.name)
        self.db = self.dbmodule.open(dbpath, 'c')
        logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})

    def close_spider(self, spider):
        self.db.close()

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
        self.db['%s_time' % key] = str(time())

    def _read_data(self, spider, request):
        key = self._request_key(request)
        db = self.db
        tkey = '%s_time' % key
        if tkey not in db:
            return  # not found
        ts = db[tkey]
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired
        return pickle.loads(db['%s_data' % key])

    def _request_key(self, request):
        return request_fingerprint(request)


class FilesystemCacheStorage(object):
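    """Cache storage backend that stores each response as a set of files
    under HTTPCACHE_DIR/<spider name>/<first two fingerprint chars>/<fingerprint>,
    optionally gzip-compressing them (HTTPCACHE_GZIP).
    """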
    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        self._open = gzip.open if self.use_gzip else open

    def open_spider(self, spider):
        logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
                     extra={'spider': spider})

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not os.path.exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)
        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = os.path.join(rpath, 'pickled_meta')
        if not os.path.exists(metapath):
            return  # not found
        mtime = os.stat(metapath).st_mtime
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with self._open(metapath, 'rb') as f:
            return pickle.load(f)


class LeveldbCacheStorage(object):
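    """Deprecated cache storage backend that keeps responses in a LevelDB
    database, one database per spider.
    """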
    def __init__(self, settings):
        warn("The LevelDB storage backend is deprecated.",
             ScrapyDeprecationWarning, stacklevel=2)
        import leveldb
        self._leveldb = leveldb
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, '%s.leveldb' % spider.name)
        self.db = self._leveldb.LevelDB(dbpath)
        logger.debug("Using LevelDB cache storage in %(cachepath)s" % {'cachepath': dbpath}, extra={'spider': spider})

    def close_spider(self, spider):
        # Run compaction each time to save space and also recreate files to
        # avoid them being removed in storages with timestamp-based autoremoval.
        self.db.CompactRange()
        del self.db
        garbage_collect()

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        batch = self._leveldb.WriteBatch()
        batch.Put(key + b'_data', pickle.dumps(data, protocol=2))
        batch.Put(key + b'_time', to_bytes(str(time())))
        self.db.Write(batch)

    def _read_data(self, spider, request):
        key = self._request_key(request)
        try:
            ts = self.db.Get(key + b'_time')
        except KeyError:
            return  # not found or invalid entry
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired
        try:
            data = self.db.Get(key + b'_data')
        except KeyError:
            return  # invalid entry
        else:
            return pickle.loads(data)

    def _request_key(self, request):
        return to_bytes(request_fingerprint(request))


def parse_cachecontrol(header):
    """Parse a Cache-Control header

    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9

    >>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
    ...                                                 b'max-age': b'3600'}
    True
    >>> parse_cachecontrol(b'') == {}
    True

    """
    directives = {}
    for directive in header.split(b','):
        key, sep, val = directive.strip().partition(b'=')
        if key:
            directives[key.lower()] = val if sep else None
    return directives


def rfc1123_to_epoch(date_str):
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None