from __future__ import print_function
import gzip
import logging
import os
from email.utils import mktime_tz, parsedate_tz
from importlib import import_module
from time import time
from warnings import warn
from weakref import WeakKeyDictionary

from six.moves import cPickle as pickle
from w3lib.http import headers_raw_to_dict, headers_dict_to_raw

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Headers, Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode, garbage_collect
from scrapy.utils.request import request_fingerprint

logger = logging.getLogger(__name__)


class DummyPolicy(object):
    """Cache policy that caches every request/response pair except those
    matching the ignored schemes or HTTP status codes, and always treats
    cached responses as fresh."""

    def __init__(self, settings):
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_http_codes = [int(x) for x in settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES')]

    def should_cache_request(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

    def should_cache_response(self, response, request):
        return response.status not in self.ignore_http_codes

    def is_cached_response_fresh(self, cachedresponse, request):
        return True

    def is_cached_response_valid(self, cachedresponse, response, request):
        return True
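
# Usage sketch (an assumption, not part of this module): DummyPolicy is selected
# through the standard Scrapy HTTPCACHE_* settings read above, assuming the module
# is importable as scrapy.extensions.httpcache. A settings.py fragment could read:
#
#   HTTPCACHE_ENABLED = True
#   HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
#   HTTPCACHE_IGNORE_SCHEMES = ['file']
#   HTTPCACHE_IGNORE_HTTP_CODES = [500, 502, 503]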


class RFC2616Policy(object):
    """RFC 2616 compliant cache policy: honours the Cache-Control and Expires
    headers and the Last-Modified/ETag validators instead of caching
    unconditionally."""

    MAXAGE = 3600 * 24 * 365  # one year

    def __init__(self, settings):
        self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
        self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        self.ignore_response_cache_controls = [
            to_bytes(cc) for cc in
            settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
        self._cc_parsed = WeakKeyDictionary()

    def _parse_cachecontrol(self, r):
        if r not in self._cc_parsed:
            cch = r.headers.get(b'Cache-Control', b'')
            parsed = parse_cachecontrol(cch)
            if isinstance(r, Response):
                for key in self.ignore_response_cache_controls:
                    parsed.pop(key, None)
            self._cc_parsed[r] = parsed
        return self._cc_parsed[r]

    def should_cache_request(self, request):
        if urlparse_cached(request).scheme in self.ignore_schemes:
            return False
        cc = self._parse_cachecontrol(request)
        # obey user-agent directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Any other request is eligible for caching
        return True

    def should_cache_response(self, response, request):
        # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
        # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
        # Status code 206 is not included because the cache cannot deal with partial contents
        cc = self._parse_cachecontrol(response)
        # obey directive "Cache-Control: no-store"
        if b'no-store' in cc:
            return False
        # Never cache 304 (Not Modified) responses
        elif response.status == 304:
            return False
        # Cache unconditionally if configured to do so
        elif self.always_store:
            return True
        # Any hint on response expiration is good
        elif b'max-age' in cc or b'Expires' in response.headers:
            return True
        # Firefox falls back to a one-year expiration for these statuses if none is set
        elif response.status in (300, 301, 308):
            return True
        # Other statuses without expiration info require at least one validator
        elif response.status in (200, 203, 401):
            return b'Last-Modified' in response.headers or b'ETag' in response.headers
        # Anything else is probably not eligible for caching: it makes no sense
        # to cache responses that carry no expiration info and cannot be revalidated
        else:
            return False

    def is_cached_response_fresh(self, cachedresponse, request):
        cc = self._parse_cachecontrol(cachedresponse)
        ccreq = self._parse_cachecontrol(request)
        if b'no-cache' in cc or b'no-cache' in ccreq:
            return False

        now = time()
        freshnesslifetime = self._compute_freshness_lifetime(cachedresponse, request, now)
        currentage = self._compute_current_age(cachedresponse, request, now)

        reqmaxage = self._get_max_age(ccreq)
        if reqmaxage is not None:
            freshnesslifetime = min(freshnesslifetime, reqmaxage)

        if currentage < freshnesslifetime:
            return True

        if b'max-stale' in ccreq and b'must-revalidate' not in cc:
            # From RFC2616: "Indicates that the client is willing to
            # accept a response that has exceeded its expiration time.
            # If max-stale is assigned a value, then the client is
            # willing to accept a response that has exceeded its
            # expiration time by no more than the specified number of
            # seconds. If no value is assigned to max-stale, then the
            # client is willing to accept a stale response of any age."
            staleage = ccreq[b'max-stale']
            if staleage is None:
                return True

            try:
                if currentage < freshnesslifetime + max(0, int(staleage)):
                    return True
            except ValueError:
                pass

        # Cached response is stale, try to set validators if any
        self._set_conditional_validators(request, cachedresponse)
        return False

    def is_cached_response_valid(self, cachedresponse, response, request):
        # Use the cached response if the new response is a server error,
        # as long as the old response didn't specify must-revalidate.
        if response.status >= 500:
            cc = self._parse_cachecontrol(cachedresponse)
            if b'must-revalidate' not in cc:
                return True

        # Use the cached response if the server says it hasn't changed.
        return response.status == 304

    def _set_conditional_validators(self, request, cachedresponse):
        if b'Last-Modified' in cachedresponse.headers:
            request.headers[b'If-Modified-Since'] = cachedresponse.headers[b'Last-Modified']

        if b'ETag' in cachedresponse.headers:
            request.headers[b'If-None-Match'] = cachedresponse.headers[b'ETag']

    def _get_max_age(self, cc):
        try:
            return max(0, int(cc[b'max-age']))
        except (KeyError, ValueError):
            return None

    def _compute_freshness_lifetime(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeFreshnessLifetime
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
        cc = self._parse_cachecontrol(response)
        maxage = self._get_max_age(cc)
        if maxage is not None:
            return maxage

        # Parse date header or synthesize it if none exists
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now

        # Try HTTP/1.0 Expires header
        if b'Expires' in response.headers:
            expires = rfc1123_to_epoch(response.headers[b'Expires'])
            # When parsing the Expires header fails, RFC 2616 section 14.21 says
            # we should treat it as an expiration time in the past.
            return max(0, expires - date) if expires else 0

        # Fall back to a heuristic based on the Last-Modified header.
        # This is not in the RFC but it mirrors Firefox's caching implementation.
        lastmodified = rfc1123_to_epoch(response.headers.get(b'Last-Modified'))
        if lastmodified and lastmodified <= date:
            return (date - lastmodified) / 10

        # These responses can be cached indefinitely
        if response.status in (300, 301, 308):
            return self.MAXAGE

        # Insufficient information to compute the freshness lifetime
        return 0

    def _compute_current_age(self, response, request, now):
        # Reference nsHttpResponseHead::ComputeCurrentAge
        # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
        currentage = 0
        # If the Date header is not set, we assume the connection is fast and
        # the clock is in sync with the server
        date = rfc1123_to_epoch(response.headers.get(b'Date')) or now
        if now > date:
            currentage = now - date

        if b'Age' in response.headers:
            try:
                age = int(response.headers[b'Age'])
                currentage = max(currentage, age)
            except ValueError:
                pass

        return currentage
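
# Usage sketch (assuming this module is importable as scrapy.extensions.httpcache):
# switching to the RFC 2616 policy is a settings change; the extra settings below
# are the ones consumed in RFC2616Policy.__init__ above:
#
#   HTTPCACHE_ENABLED = True
#   HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.RFC2616Policy'
#   HTTPCACHE_ALWAYS_STORE = True  # also cache responses lacking freshness info
#   HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = ['no-cache']  # strip before deciding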


class DbmCacheStorage(object):
    """Cache storage backend that keeps one DBM database per spider, storing
    pickled response data keyed by request fingerprint."""

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, '%s.db' % spider.name)
        self.db = self.dbmodule.open(dbpath, 'c')

        logger.debug("Using DBM cache storage in %(cachepath)s" % {'cachepath': dbpath},
                     extra={'spider': spider})

    def close_spider(self, spider):
        self.db.close()

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        self.db['%s_data' % key] = pickle.dumps(data, protocol=2)
        self.db['%s_time' % key] = str(time())

    def _read_data(self, spider, request):
        key = self._request_key(request)
        db = self.db
        tkey = '%s_time' % key
        if tkey not in db:
            return  # not found

        ts = db[tkey]
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired

        return pickle.loads(db['%s_data' % key])

    def _request_key(self, request):
        return request_fingerprint(request)
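
# Usage sketch (same import-path assumption as above): the DBM backend is picked
# via HTTPCACHE_STORAGE, and HTTPCACHE_DBM_MODULE names the dbm module handed to
# import_module() in __init__:
#
#   HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.DbmCacheStorage'
#   HTTPCACHE_DBM_MODULE = 'anydbm'    # e.g. 'dbm' on Python 3
#   HTTPCACHE_EXPIRATION_SECS = 0      # 0 disables expiration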


class FilesystemCacheStorage(object):
    """Cache storage backend that stores each response in its own directory
    on the filesystem, optionally gzip-compressing the cache files."""

    def __init__(self, settings):
        self.cachedir = data_path(settings['HTTPCACHE_DIR'])
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
        self._open = gzip.open if self.use_gzip else open

    def open_spider(self, spider):
        logger.debug("Using filesystem cache storage in %(cachedir)s" % {'cachedir': self.cachedir},
                     extra={'spider': spider})

    def close_spider(self, spider):
        pass

    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        metadata = self._read_meta(spider, request)
        if metadata is None:
            return  # not cached
        rpath = self._get_request_path(spider, request)
        with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
            body = f.read()
        with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
            rawheaders = f.read()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        """Store the given response in the cache."""
        rpath = self._get_request_path(spider, request)
        if not os.path.exists(rpath):
            os.makedirs(rpath)
        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        with self._open(os.path.join(rpath, 'meta'), 'wb') as f:
            f.write(to_bytes(repr(metadata)))
        with self._open(os.path.join(rpath, 'pickled_meta'), 'wb') as f:
            pickle.dump(metadata, f, protocol=2)
        with self._open(os.path.join(rpath, 'response_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(response.headers))
        with self._open(os.path.join(rpath, 'response_body'), 'wb') as f:
            f.write(response.body)
        with self._open(os.path.join(rpath, 'request_headers'), 'wb') as f:
            f.write(headers_dict_to_raw(request.headers))
        with self._open(os.path.join(rpath, 'request_body'), 'wb') as f:
            f.write(request.body)

    def _get_request_path(self, spider, request):
        key = request_fingerprint(request)
        return os.path.join(self.cachedir, spider.name, key[0:2], key)

    def _read_meta(self, spider, request):
        rpath = self._get_request_path(spider, request)
        metapath = os.path.join(rpath, 'pickled_meta')
        if not os.path.exists(metapath):
            return  # not found
        mtime = os.stat(metapath).st_mtime
        if 0 < self.expiration_secs < time() - mtime:
            return  # expired
        with self._open(metapath, 'rb') as f:
            return pickle.load(f)
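
# Usage sketch (same import-path assumption): the filesystem backend writes one
# directory per request fingerprint under
# <HTTPCACHE_DIR>/<spider.name>/<first two fingerprint chars>/<fingerprint>/,
# holding the meta, pickled_meta, request_headers, request_body, response_headers
# and response_body files produced by store_response() above:
#
#   HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
#   HTTPCACHE_DIR = 'httpcache'
#   HTTPCACHE_GZIP = True  # gzip-compress every cache file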


class LeveldbCacheStorage(object):
    """Deprecated cache storage backend built on LevelDB."""

    def __init__(self, settings):
        warn("The LevelDB storage backend is deprecated.",
             ScrapyDeprecationWarning, stacklevel=2)
        import leveldb
        self._leveldb = leveldb
        self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
        self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self.db = None

    def open_spider(self, spider):
        dbpath = os.path.join(self.cachedir, '%s.leveldb' % spider.name)
        self.db = self._leveldb.LevelDB(dbpath)

        logger.debug("Using LevelDB cache storage in %(cachepath)s" % {'cachepath': dbpath},
                     extra={'spider': spider})

    def close_spider(self, spider):
        # Do compaction each time to save space and also recreate files to
        # avoid them being removed in storages with timestamp-based autoremoval.
        self.db.CompactRange()
        del self.db
        garbage_collect()

    def retrieve_response(self, spider, request):
        data = self._read_data(spider, request)
        if data is None:
            return  # not cached
        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response

    def store_response(self, spider, request, response):
        key = self._request_key(request)
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        batch = self._leveldb.WriteBatch()
        batch.Put(key + b'_data', pickle.dumps(data, protocol=2))
        batch.Put(key + b'_time', to_bytes(str(time())))
        self.db.Write(batch)

    def _read_data(self, spider, request):
        key = self._request_key(request)
        try:
            ts = self.db.Get(key + b'_time')
        except KeyError:
            return  # not found or invalid entry
        if 0 < self.expiration_secs < time() - float(ts):
            return  # expired
        try:
            data = self.db.Get(key + b'_data')
        except KeyError:
            return  # invalid entry
        else:
            return pickle.loads(data)

    def _request_key(self, request):
        return to_bytes(request_fingerprint(request))


def parse_cachecontrol(header):
    """Parse Cache-Control header

    https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9

    >>> parse_cachecontrol(b'public, max-age=3600') == {b'public': None,
    ...                                                 b'max-age': b'3600'}
    True
    >>> parse_cachecontrol(b'') == {}
    True

    """
    directives = {}
    for directive in header.split(b','):
        key, sep, val = directive.strip().partition(b'=')
        if key:
            directives[key.lower()] = val if sep else None
    return directives


def rfc1123_to_epoch(date_str):
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        return None
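
# Illustrative values (a sketch, not executed here): rfc1123_to_epoch() accepts the
# RFC 1123 dates used in HTTP headers, as bytes or text, and returns None for
# anything it cannot parse. Using the example date from RFC 2616:
#
#   rfc1123_to_epoch(b'Sun, 06 Nov 1994 08:49:37 GMT')  -> 784111777
#   rfc1123_to_epoch(None)                              -> None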