httperror.py

  1. """
  2. HttpError Spider Middleware
  3. See documentation in docs/topics/spider-middleware.rst
  4. """
  5. import logging
  6. from scrapy.exceptions import IgnoreRequest
  7. logger = logging.getLogger(__name__)
  8. class HttpError(IgnoreRequest):
  9. """A non-200 response was filtered"""
  10. def __init__(self, response, *args, **kwargs):
  11. self.response = response
  12. super(HttpError, self).__init__(*args, **kwargs)
  13. class HttpErrorMiddleware(object):
  14. @classmethod
  15. def from_crawler(cls, crawler):
  16. return cls(crawler.settings)
  17. def __init__(self, settings):
  18. self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
  19. self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')
  20. def process_spider_input(self, response, spider):
  21. if 200 <= response.status < 300: # common case
  22. return
  23. meta = response.meta
  24. if 'handle_httpstatus_all' in meta:
  25. return
  26. if 'handle_httpstatus_list' in meta:
  27. allowed_statuses = meta['handle_httpstatus_list']
  28. elif self.handle_httpstatus_all:
  29. return
  30. else:
  31. allowed_statuses = getattr(spider, 'handle_httpstatus_list', self.handle_httpstatus_list)
  32. if response.status in allowed_statuses:
  33. return
  34. raise HttpError(response, 'Ignoring non-200 response')
  35. def process_spider_exception(self, response, exception, spider):
  36. if isinstance(exception, HttpError):
  37. spider.crawler.stats.inc_value('httperror/response_ignored_count')
  38. spider.crawler.stats.inc_value(
  39. 'httperror/response_ignored_status_count/%s' % response.status
  40. )
  41. logger.info(
  42. "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
  43. {'response': response}, extra={'spider': spider},
  44. )
  45. return []
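
A minimal usage sketch of the opt-in hooks this middleware checks (the spider name and URL are hypothetical; the settings, attribute, and meta keys are the ones read in the code above). By default any non-2xx response is dropped with HttpError; a spider can allow specific statuses project-wide via HTTPERROR_ALLOWED_CODES, spider-wide via the handle_httpstatus_list attribute, or per request via the handle_httpstatus_list / handle_httpstatus_all meta keys, which take precedence.

    import scrapy

    # Project-wide alternatives (settings.py):
    #   HTTPERROR_ALLOWED_CODES = [404]   # allow-list for every spider
    #   HTTPERROR_ALLOW_ALL = True        # disable filtering entirely

    class StatusAwareSpider(scrapy.Spider):
        name = 'status_aware'  # hypothetical spider

        # Spider-wide allow-list: 404 responses reach parse()
        # instead of being dropped with HttpError.
        handle_httpstatus_list = [404]

        def start_requests(self):
            # Per-request override via meta: also allow 403 and 500,
            # but only for this request.
            yield scrapy.Request(
                'https://example.com/maybe-forbidden',  # hypothetical URL
                meta={'handle_httpstatus_list': [403, 500]},
            )

        def parse(self, response):
            if response.status != 200:
                self.logger.info('Got allowed non-200 status: %d',
                                 response.status)

Because process_spider_exception returns an empty list for HttpError, filtered responses are silently discarded (with a log line and stats increment) rather than propagating the exception to other middlewares.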