robotstxt.py

  1. """
  2. This is a middleware to respect robots.txt policies. To activate it you must
  3. enable this middleware and enable the ROBOTSTXT_OBEY setting.
  4. """
  5. import logging
  6. import sys
  7. import re
  8. from twisted.internet.defer import Deferred, maybeDeferred
  9. from scrapy.exceptions import NotConfigured, IgnoreRequest
  10. from scrapy.http import Request
  11. from scrapy.utils.httpobj import urlparse_cached
  12. from scrapy.utils.log import failure_to_exc_info
  13. from scrapy.utils.python import to_native_str
  14. from scrapy.utils.misc import load_object
  15. logger = logging.getLogger(__name__)

class RobotsTxtMiddleware(object):
    DOWNLOAD_PRIORITY = 1000

    def __init__(self, crawler):
        if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
            raise NotConfigured
        self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
        self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT', None)
        self.crawler = crawler
        self._parsers = {}
        self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER'))

        # Check if the parser dependencies are met; this should raise an error otherwise.
        self._parserimpl.from_crawler(self.crawler, b'')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

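    # Downloader-middleware hook: requests wait (via a Deferred) until a
    # robots.txt parser for their host is available, then are checked in
    # process_request_2().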
    def process_request(self, request, spider):
        if request.meta.get('dont_obey_robotstxt'):
            return
        d = maybeDeferred(self.robot_parser, request, spider)
        d.addCallback(self.process_request_2, request, spider)
        return d

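    # Once a parser is available, drop the request with IgnoreRequest if the
    # effective user agent is not allowed to fetch the URL.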
    def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        useragent = self._robotstxt_useragent
        if not useragent:
            useragent = request.headers.get(b'User-Agent', self._default_useragent)
        if not rp.allowed(request.url, useragent):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            self.crawler.stats.inc_value('robotstxt/forbidden')
            raise IgnoreRequest("Forbidden by robots.txt")

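    # Return the robots.txt parser for the request's host, downloading
    # robots.txt on first use. While the download is in flight the cache holds
    # a Deferred placeholder, so concurrent requests for the same host wait on
    # it instead of triggering extra downloads.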
    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={'dont_obey_robotstxt': True}
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc, spider)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)
            self.crawler.stats.inc_value('robotstxt/request_count')

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()

            def cb(result):
                d.callback(result)
                return result

            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]

    def _logerror(self, failure, request, spider):
        if failure.type is not IgnoreRequest:
            logger.error("Error downloading %(request)s: %(f_exception)s",
                         {'request': request, 'f_exception': failure.value},
                         exc_info=failure_to_exc_info(failure),
                         extra={'spider': spider})
        return failure

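    # Build the parser from the downloaded robots.txt body and swap it into
    # the cache in place of the Deferred placeholder, firing any waiting
    # requests.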
    def _parse_robots(self, response, netloc, spider):
        self.crawler.stats.inc_value('robotstxt/response_count')
        self.crawler.stats.inc_value(
            'robotstxt/response_status_count/{}'.format(response.status))
        rp = self._parserimpl.from_crawler(self.crawler, response.body)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = rp
        rp_dfd.callback(rp)

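    # If robots.txt could not be downloaded, cache None for the host so that
    # all requests to it are allowed (see process_request_2).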
    def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = 'robotstxt/exception_count/{}'.format(failure.type)
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None)
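

# --- Usage sketch (not part of the middleware) -------------------------------
# A minimal settings.py fragment, assuming a standard Scrapy project. The
# middleware is part of Scrapy's default DOWNLOADER_MIDDLEWARES, so enabling
# ROBOTSTXT_OBEY is usually enough; the parser path below is an assumption and
# may differ between Scrapy versions.
#
#     ROBOTSTXT_OBEY = True
#     # Optional: dedicated user agent to match against robots.txt rules;
#     # when unset, the request's User-Agent header (or USER_AGENT) is used.
#     # ROBOTSTXT_USER_AGENT = 'MyCrawler (+https://example.com)'
#     # Optional: robots.txt parser implementation, loaded via load_object().
#     # ROBOTSTXT_PARSER = 'scrapy.robotstxt.ProtegoRobotParser'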