# robotstxt.py

import sys
import logging

from abc import ABCMeta, abstractmethod

from six import with_metaclass

from scrapy.utils.python import to_native_str, to_unicode

logger = logging.getLogger(__name__)


def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    try:
        if to_native_str_type:
            robotstxt_body = to_native_str(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
        logger.warning("Failure while parsing robots.txt. "
                       "File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
                       exc_info=sys.exc_info(),
                       extra={'spider': spider})
        robotstxt_body = ''
    return robotstxt_body
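
# Behaviour sketch for decode_robotstxt (the argument values are illustrative,
# not part of this module): a UTF-8 body decodes normally, while an
# undecodable body is logged and replaced with an empty string, i.e. an
# "allow all" robots.txt.
#
#   decode_robotstxt(b"User-agent: *\nDisallow: /private\n", spider=None)
#   # -> "User-agent: *\nDisallow: /private\n"
#   decode_robotstxt(b"\xff\xfe not valid UTF-8", spider=None)
#   # -> ""  (a warning is logged, with the spider passed via `extra`)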


class RobotParser(with_metaclass(ABCMeta)):
    @classmethod
    @abstractmethod
    def from_crawler(cls, crawler, robotstxt_body):
        """Parse the content of a robots.txt_ file as bytes. This must be a class method.
        It must return a new instance of the parser backend.

        :param crawler: crawler which made the request
        :type crawler: :class:`~scrapy.crawler.Crawler` instance

        :param robotstxt_body: content of a robots.txt_ file.
        :type robotstxt_body: bytes
        """
        pass

    @abstractmethod
    def allowed(self, url, user_agent):
        """Return ``True`` if ``user_agent`` is allowed to crawl ``url``, otherwise return ``False``.

        :param url: Absolute URL
        :type url: string

        :param user_agent: User agent
        :type user_agent: string
        """
        pass
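
# Every backend below follows the same pattern. A hypothetical third-party
# backend would subclass RobotParser and implement both abstract methods,
# roughly as in this sketch (MyRobotParser and its body are illustrative
# names, not part of Scrapy):
#
#   class MyRobotParser(RobotParser):
#       def __init__(self, robotstxt_body, spider):
#           self.spider = spider
#           self.body = decode_robotstxt(robotstxt_body, spider)
#
#       @classmethod
#       def from_crawler(cls, crawler, robotstxt_body):
#           spider = None if not crawler else crawler.spider
#           return cls(robotstxt_body, spider)
#
#       def allowed(self, url, user_agent):
#           return True  # delegate to a real robots.txt parser here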


class PythonRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from six.moves.urllib_robotparser import RobotFileParser
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider, to_native_str_type=True)
        self.rp = RobotFileParser()
        self.rp.parse(robotstxt_body.splitlines())

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_native_str(user_agent)
        url = to_native_str(url)
        return self.rp.can_fetch(user_agent, url)
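
# Usage sketch: selecting this stdlib-based backend from project settings
# (assumes Scrapy's ROBOTSTXT_OBEY and ROBOTSTXT_PARSER settings; check the
# docs for your Scrapy version):
#
#   # settings.py
#   ROBOTSTXT_OBEY = True
#   ROBOTSTXT_PARSER = 'scrapy.robotstxt.PythonRobotParser'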


class ReppyRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from reppy.robots import Robots
        self.spider = spider
        self.rp = Robots.parse('', robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        return self.rp.allowed(url, user_agent)
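
# Standalone usage sketch for the reppy backend (requires the third-party
# `reppy` package; the robots.txt body, URL and user agent are illustrative):
#
#   parser = ReppyRobotParser.from_crawler(None, b"User-agent: *\nDisallow: /admin\n")
#   parser.allowed("https://example.com/admin/login", "mybot")  # -> False
#   parser.allowed("https://example.com/index.html", "mybot")   # -> True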


class RerpRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from robotexclusionrulesparser import RobotExclusionRulesParser
        self.spider = spider
        self.rp = RobotExclusionRulesParser()
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.is_allowed(user_agent, url)
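
# Standalone usage sketch for the rerp backend (requires the third-party
# `robotexclusionrulesparser` package; values are illustrative):
#
#   parser = RerpRobotParser.from_crawler(None, b"User-agent: *\nDisallow: /private\n")
#   parser.allowed("https://example.com/private", "mybot")  # -> False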


class ProtegoRobotParser(RobotParser):
    def __init__(self, robotstxt_body, spider):
        from protego import Protego
        self.spider = spider
        robotstxt_body = decode_robotstxt(robotstxt_body, spider)
        self.rp = Protego.parse(robotstxt_body)

    @classmethod
    def from_crawler(cls, crawler, robotstxt_body):
        spider = None if not crawler else crawler.spider
        o = cls(robotstxt_body, spider)
        return o

    def allowed(self, url, user_agent):
        user_agent = to_unicode(user_agent)
        url = to_unicode(url)
        return self.rp.can_fetch(url, user_agent)
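
# End-to-end sketch for the Protego backend, Scrapy's default parser since
# 2.0 (the robots.txt body and URLs are illustrative; note that Protego's
# can_fetch takes the URL first, unlike the stdlib parser):
#
#   body = b"User-agent: *\nDisallow: /private\nAllow: /private/public-page\n"
#   parser = ProtegoRobotParser.from_crawler(None, body)
#   parser.allowed("https://example.com/private/other", "mybot")        # -> False
#   parser.allowed("https://example.com/private/public-page", "mybot")  # -> True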