123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
- """
- HttpError Spider Middleware
- See documentation in docs/topics/spider-middleware.rst
- """
- import logging
- from scrapy.exceptions import IgnoreRequest
- logger = logging.getLogger(__name__)
class HttpError(IgnoreRequest):
    """Signals that a response was dropped because of its HTTP status.

    The offending response is kept on the exception as ``response`` so
    that handlers can inspect it.
    """

    def __init__(self, response, *args, **kwargs):
        # Remaining positional/keyword arguments are forwarded untouched
        # to the parent exception.
        super(HttpError, self).__init__(*args, **kwargs)
        # Keep a handle on the filtered response for later inspection.
        self.response = response
class HttpErrorMiddleware(object):
    """Spider middleware that filters out responses with a non-2xx status.

    A response whose status is outside the 200-299 range is rejected with
    :class:`HttpError` unless it is explicitly allowed through the response
    meta, a spider attribute, or the middleware settings.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler.settings``."""
        return cls(crawler.settings)

    def __init__(self, settings):
        """Read the middleware configuration from ``settings``.

        ``HTTPERROR_ALLOW_ALL`` is read as a boolean and
        ``HTTPERROR_ALLOWED_CODES`` as a list.
        """
        # When true, responses are let through regardless of status,
        # unless the response meta supplies its own
        # ``handle_httpstatus_list``.
        self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
        # Fallback list of allowed statuses, used when neither the
        # response meta nor the spider defines ``handle_httpstatus_list``.
        self.handle_httpstatus_list = settings.getlist('HTTPERROR_ALLOWED_CODES')

    def process_spider_input(self, response, spider):
        """Let ``response`` through or reject it based on its status.

        Resolution order for a non-2xx response:

        1. a truthy ``handle_httpstatus_all`` in ``response.meta`` allows it;
        2. ``handle_httpstatus_list`` in ``response.meta`` is the allowed list;
        3. otherwise a truthy middleware-level ``handle_httpstatus_all``
           allows it;
        4. otherwise the spider's ``handle_httpstatus_list`` attribute is
           the allowed list, falling back to the middleware-level list.

        Returns ``None`` when the response is accepted.

        Raises:
            HttpError: if the status is not 2xx and is not allowed.
        """
        if 200 <= response.status < 300:  # common case
            return

        meta = response.meta
        # Check the value, not mere key presence: an explicit
        # ``handle_httpstatus_all: False`` must not allow every status.
        if meta.get('handle_httpstatus_all', False):
            return

        if 'handle_httpstatus_list' in meta:
            allowed_statuses = meta['handle_httpstatus_list']
        elif self.handle_httpstatus_all:
            return
        else:
            allowed_statuses = getattr(
                spider, 'handle_httpstatus_list', self.handle_httpstatus_list
            )

        if response.status in allowed_statuses:
            return
        raise HttpError(response, 'Ignoring non-200 response')

    def process_spider_exception(self, response, exception, spider):
        """Record and swallow an :class:`HttpError` raised for ``response``.

        For an ``HttpError`` two stats counters are incremented (a global
        one and a per-status one), an info message is logged, and an empty
        list is returned. Any other exception yields ``None``.
        """
        if not isinstance(exception, HttpError):
            return None

        stats = spider.crawler.stats
        stats.inc_value('httperror/response_ignored_count')
        stats.inc_value(
            'httperror/response_ignored_status_count/%s' % response.status
        )
        logger.info(
            "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
            {'response': response}, extra={'spider': spider},
        )
        return []
|