httpcache.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. from email.utils import formatdate
  2. from twisted.internet import defer
  3. from twisted.internet.error import TimeoutError, DNSLookupError, \
  4. ConnectionRefusedError, ConnectionDone, ConnectError, \
  5. ConnectionLost, TCPTimedOutError
  6. from twisted.web.client import ResponseFailed
  7. from scrapy import signals
  8. from scrapy.exceptions import NotConfigured, IgnoreRequest
  9. from scrapy.utils.misc import load_object
  10. class HttpCacheMiddleware(object):
  11. DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
  12. ConnectionRefusedError, ConnectionDone, ConnectError,
  13. ConnectionLost, TCPTimedOutError, ResponseFailed,
  14. IOError)
  15. def __init__(self, settings, stats):
  16. if not settings.getbool('HTTPCACHE_ENABLED'):
  17. raise NotConfigured
  18. self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
  19. self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
  20. self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
  21. self.stats = stats
  22. @classmethod
  23. def from_crawler(cls, crawler):
  24. o = cls(crawler.settings, crawler.stats)
  25. crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
  26. crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
  27. return o
  28. def spider_opened(self, spider):
  29. self.storage.open_spider(spider)
  30. def spider_closed(self, spider):
  31. self.storage.close_spider(spider)
  32. def process_request(self, request, spider):
  33. if request.meta.get('dont_cache', False):
  34. return
  35. # Skip uncacheable requests
  36. if not self.policy.should_cache_request(request):
  37. request.meta['_dont_cache'] = True # flag as uncacheable
  38. return
  39. # Look for cached response and check if expired
  40. cachedresponse = self.storage.retrieve_response(spider, request)
  41. if cachedresponse is None:
  42. self.stats.inc_value('httpcache/miss', spider=spider)
  43. if self.ignore_missing:
  44. self.stats.inc_value('httpcache/ignore', spider=spider)
  45. raise IgnoreRequest("Ignored request not in cache: %s" % request)
  46. return # first time request
  47. # Return cached response only if not expired
  48. cachedresponse.flags.append('cached')
  49. if self.policy.is_cached_response_fresh(cachedresponse, request):
  50. self.stats.inc_value('httpcache/hit', spider=spider)
  51. return cachedresponse
  52. # Keep a reference to cached response to avoid a second cache lookup on
  53. # process_response hook
  54. request.meta['cached_response'] = cachedresponse
  55. def process_response(self, request, response, spider):
  56. if request.meta.get('dont_cache', False):
  57. return response
  58. # Skip cached responses and uncacheable requests
  59. if 'cached' in response.flags or '_dont_cache' in request.meta:
  60. request.meta.pop('_dont_cache', None)
  61. return response
  62. # RFC2616 requires origin server to set Date header,
  63. # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
  64. if 'Date' not in response.headers:
  65. response.headers['Date'] = formatdate(usegmt=1)
  66. # Do not validate first-hand responses
  67. cachedresponse = request.meta.pop('cached_response', None)
  68. if cachedresponse is None:
  69. self.stats.inc_value('httpcache/firsthand', spider=spider)
  70. self._cache_response(spider, response, request, cachedresponse)
  71. return response
  72. if self.policy.is_cached_response_valid(cachedresponse, response, request):
  73. self.stats.inc_value('httpcache/revalidate', spider=spider)
  74. return cachedresponse
  75. self.stats.inc_value('httpcache/invalidate', spider=spider)
  76. self._cache_response(spider, response, request, cachedresponse)
  77. return response
  78. def process_exception(self, request, exception, spider):
  79. cachedresponse = request.meta.pop('cached_response', None)
  80. if cachedresponse is not None and isinstance(exception, self.DOWNLOAD_EXCEPTIONS):
  81. self.stats.inc_value('httpcache/errorrecovery', spider=spider)
  82. return cachedresponse
  83. def _cache_response(self, spider, response, request, cachedresponse):
  84. if self.policy.should_cache_response(response, request):
  85. self.stats.inc_value('httpcache/store', spider=spider)
  86. self.storage.store_response(spider, request, response)
  87. else:
  88. self.stats.inc_value('httpcache/uncacheable', spider=spider)