retry.py

  1. """
  2. An extension to retry failed requests that are potentially caused by temporary
  3. problems such as a connection timeout or HTTP 500 error.
  4. You can change the behaviour of this middleware by modifing the scraping settings:
  5. RETRY_TIMES - how many times to retry a failed page
  6. RETRY_HTTP_CODES - which HTTP response codes to retry
  7. Failed pages are collected on the scraping process and rescheduled at the end,
  8. once the spider has finished crawling all regular (non failed) pages.
  9. """
import logging

from twisted.internet import defer
from twisted.internet.error import TimeoutError, DNSLookupError, \
        ConnectionRefusedError, ConnectionDone, ConnectError, \
        ConnectionLost, TCPTimedOutError
from twisted.web.client import ResponseFailed

from scrapy.exceptions import NotConfigured
from scrapy.utils.response import response_status_message
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.utils.python import global_object_name

logger = logging.getLogger(__name__)


class RetryMiddleware(object):

    # IOError is raised by the HttpCompression middleware when trying to
    # decompress an empty response
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError)

    def __init__(self, settings):
        if not settings.getbool('RETRY_ENABLED'):
            raise NotConfigured
        self.max_retry_times = settings.getint('RETRY_TIMES')
        self.retry_http_codes = set(int(x) for x in settings.getlist('RETRY_HTTP_CODES'))
        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        # Retry responses whose status code is listed in RETRY_HTTP_CODES,
        # unless the request opted out via the 'dont_retry' meta key.
        if request.meta.get('dont_retry', False):
            return response
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        # Retry requests that failed with a (likely transient) network-level
        # exception, unless the request opted out via 'dont_retry'.
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            return self._retry(request, exception, spider)

    def _retry(self, request, reason, spider):
        # Return a copy of the request scheduled for another attempt, or None
        # once the retry budget for this request has been exhausted.
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times
        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
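
# --- Example per-request usage (a sketch; not part of the original module) ---
# The meta keys read above ('dont_retry', 'max_retry_times', 'retry_times') can
# be set on individual requests from a spider. The URLs and callback name below
# are hypothetical:
#
#   yield scrapy.Request(
#       'https://example.com/flaky-page',
#       callback=self.parse_page,        # hypothetical callback
#       meta={'max_retry_times': 5},     # override RETRY_TIMES for this request
#   )
#   yield scrapy.Request(
#       'https://example.com/no-retry',
#       meta={'dont_retry': True},       # never retry this request
#   )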