# robotstxt.py

import sys
import logging

from abc import ABCMeta, abstractmethod

from six import with_metaclass

from scrapy.utils.python import to_native_str, to_unicode

logger = logging.getLogger(__name__)


def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    try:
        if to_native_str_type:
            robotstxt_body = to_native_str(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
        logger.warning("Failure while parsing robots.txt. "
                       "File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                       exc_info=sys.exc_info(),
                       extra={'spider': spider})
        robotstxt_body = ''
    return robotstxt_body
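
# Behaviour sketch for decode_robotstxt (the argument values are illustrative,
# not part of this module): a UTF-8 body decodes normally, while an
# undecodable body is logged and replaced with an empty string, i.e. an
# "allow all" robots.txt.
#
#   decode_robotstxt(b"User-agent: *\nDisallow: /private\n", spider=None)
#   # -> "User-agent: *\nDisallow: /private\n"
#   decode_robotstxt(b"\xff\xfe not valid UTF-8", spider=None)
#   # -> ""  (a warning is logged, with the spider passed via `extra`)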


class RobotParser(with_metaclass(ABCMeta)):
    @classmethod
    @abstractmethod
    def from_crawler(cls, crawler, robotstxt_body):
        """Parse the content of a robots.txt_ file as bytes. This must be a class method.
        It must return a new instance of the parser backend.

        :param crawler: crawler which made the request
        :type crawler: :class:`~scrapy.crawler.Crawler` instance

        :param robotstxt_body: content of a robots.txt_ file.
        :type robotstxt_body: bytes
        """
        pass

    @abstractmethod
    def allowed(self, url, user_agent):
        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.

        :param url: Absolute URL
        :type url: string

        :param user_agent: User agent
        :type user_agent: string
        """
        pass
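
# Every backend below follows the same pattern. A hypothetical third-party
# backend would subclass RobotParser and implement both abstract methods,
# roughly as in this sketch (MyRobotParser and its body are illustrative
# names, not part of Scrapy):
#
#   class MyRobotParser(RobotParser):
#       def __init__(self, robotstxt_body, spider):
#           self.spider = spider
#           self.body = decode_robotstxt(robotstxt_body, spider)
#
#       @classmethod
#       def from_crawler(cls, crawler, robotstxt_body):
#           spider = None if not crawler else crawler.spider
#           return cls(robotstxt_body, spider)
#
#       def allowed(self, url, user_agent):
#           return True  # delegate to a real robots.txt parser here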


class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
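
# Usage sketch: selecting this stdlib-based backend from project settings
# (assumes Scrapy's ROBOTSTXT_OBEY and ROBOTSTXT_PARSER settings; check the
# docs for your Scrapy version):
#
#   # settings.py
#   ROBOTSTXT_OBEY = True
#   ROBOTSTXT_PARSER = 'scrapy.robotstxt.PythonRobotParser'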


class ReppyRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from reppy.robots import Robots
        self.spider = spider
        self.rp = Robots.parse('', robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        return self.rp.allowed(url, user_agent)
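
# Standalone usage sketch for the reppy backend (requires the third-party
# `reppy` package; the robots.txt body, URL and user agent are illustrative):
#
#   parser = ReppyRobotParser.from_crawler(None, b"User-agent: *\nDisallow: /admin\n")
#   parser.allowed("https://example.com/admin/login", "mybot")  # -> False
#   parser.allowed("https://example.com/index.html", "mybot")   # -> True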


class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
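
# Standalone usage sketch for the rerp backend (requires the third-party
# `robotexclusionrulesparser` package; values are illustrative):
#
#   parser = RerpRobotParser.from_crawler(None, b"User-agent: *\nDisallow: /private\n")
#   parser.allowed("https://example.com/private", "mybot")  # -> False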


class ProtegoRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from protego import Protego
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp = Protego.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(url, user_agent)
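
# End-to-end sketch for the Protego backend, Scrapy's default parser since
# 2.0 (the robots.txt body and URLs are illustrative; note that Protego's
# can_fetch takes the URL first, unlike the stdlib parser):
#
#   body = b"User-agent: *\nDisallow: /private\nAllow: /private/public-page\n"
#   parser = ProtegoRobotParser.from_crawler(None, body)
#   parser.allowed("https://example.com/private/other", "mybot")        # -> False
#   parser.allowed("https://example.com/private/public-page", "mybot")  # -> True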