- import six
- import signal
- import logging
- import warnings
- import sys
- from twisted.internet import reactor, defer
- from zope.interface.verify import verifyClass, DoesNotImplement
- from scrapy import Spider
- from scrapy.core.engine import ExecutionEngine
- from scrapy.resolver import CachingThreadedResolver
- from scrapy.interfaces import ISpiderLoader
- from scrapy.extension import ExtensionManager
- from scrapy.settings import overridden_settings, Settings
- from scrapy.signalmanager import SignalManager
- from scrapy.exceptions import ScrapyDeprecationWarning
- from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
- from scrapy.utils.misc import load_object
- from scrapy.utils.log import (
- LogCounterHandler, configure_logging, log_scrapy_info,
- get_scrapy_root_handler, install_scrapy_root_handler)
- from scrapy import signals
- logger = logging.getLogger(__name__)
- class Crawler(object):
- def __init__(self, spidercls, settings=None):
- if isinstance(spidercls, Spider):
- raise ValueError(
- 'The spidercls argument must be a class, not an object')
- if isinstance(settings, dict) or settings is None:
- settings = Settings(settings)
- self.spidercls = spidercls
- self.settings = settings.copy()
- self.spidercls.update_settings(self.settings)
- self.signals = SignalManager(self)
- self.stats = load_object(self.settings['STATS_CLASS'])(self)
- handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
- logging.root.addHandler(handler)
- d = dict(overridden_settings(self.settings))
- logger.info("Overridden settings: %(settings)r", {'settings': d})
- if get_scrapy_root_handler() is not None:
- # scrapy root handler already installed: update it with new settings
- install_scrapy_root_handler(self.settings)
- # The lambda is kept as a Crawler attribute so that it is not
- # garbage collected after leaving the __init__ scope
- self.__remove_handler = lambda: logging.root.removeHandler(handler)
- self.signals.connect(self.__remove_handler, signals.engine_stopped)
- lf_cls = load_object(self.settings['LOG_FORMATTER'])
- self.logformatter = lf_cls.from_crawler(self)
- self.extensions = ExtensionManager.from_crawler(self)
- self.settings.freeze()
- self.crawling = False
- self.spider = None
- self.engine = None
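- # A Crawler is what gets passed to ``from_crawler`` of extensions,
- # middlewares and spiders. A minimal, hypothetical extension might use it
- # roughly like this (``MyExtension`` is illustrative, not part of Scrapy):
- #
- #     class MyExtension(object):
- #         @classmethod
- #         def from_crawler(cls, crawler):
- #             ext = cls()
- #             crawler.signals.connect(ext.spider_closed,
- #                                     signal=signals.spider_closed)
- #             return ext
- #
- #         def spider_closed(self, spider):
- #             spider.logger.info('Spider closed: %s', spider.name)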
- @property
- def spiders(self):
- if not hasattr(self, '_spiders'):
- warnings.warn("Crawler.spiders is deprecated, use "
- "CrawlerRunner.spider_loader or instantiate "
- "scrapy.spiderloader.SpiderLoader with your "
- "settings.",
- category=ScrapyDeprecationWarning, stacklevel=2)
- self._spiders = _get_spider_loader(self.settings.frozencopy())
- return self._spiders
- @defer.inlineCallbacks
- def crawl(self, *args, **kwargs):
- assert not self.crawling, "Crawling already taking place"
- self.crawling = True
- try:
- self.spider = self._create_spider(*args, **kwargs)
- self.engine = self._create_engine()
- start_requests = iter(self.spider.start_requests())
- yield self.engine.open_spider(self.spider, start_requests)
- yield defer.maybeDeferred(self.engine.start)
- except Exception:
- # In Python 2 reraising an exception after yield discards
- # the original traceback (see https://bugs.python.org/issue7563),
- # so a sys.exc_info() workaround is used.
- # This workaround also works in Python 3, but it is not needed,
- # and it is slower, so in Python 3 we use native `raise`.
- if six.PY2:
- exc_info = sys.exc_info()
- self.crawling = False
- if self.engine is not None:
- yield self.engine.close()
- if six.PY2:
- six.reraise(*exc_info)
- raise
- def _create_spider(self, *args, **kwargs):
- return self.spidercls.from_crawler(self, *args, **kwargs)
- def _create_engine(self):
- return ExecutionEngine(self, lambda _: self.stop())
- @defer.inlineCallbacks
- def stop(self):
- """Starts a graceful stop of the crawler and returns a deferred that is
- fired when the crawler is stopped."""
- if self.crawling:
- self.crawling = False
- yield defer.maybeDeferred(self.engine.stop)
- class CrawlerRunner(object):
- """
- This is a convenient helper class that keeps track of, manages and runs
- crawlers inside an already set up Twisted `reactor`_.
- The CrawlerRunner object must be instantiated with a
- :class:`~scrapy.settings.Settings` object.
- This class shouldn't be needed (since Scrapy is responsible for using it
- accordingly) unless writing scripts that manually handle the crawling
- process. See :ref:`run-from-script` for an example.
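- A minimal usage sketch, assuming the reactor is managed by the caller and
- ``MySpider`` is a hypothetical spider class::
-
-     from twisted.internet import reactor
-     from scrapy.crawler import CrawlerRunner
-     from scrapy.utils.log import configure_logging
-
-     configure_logging()
-     runner = CrawlerRunner()
-     d = runner.crawl(MySpider)
-     d.addBoth(lambda _: reactor.stop())
-     reactor.run()  # the script blocks here until the crawl finishes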
- """
- crawlers = property(
- lambda self: self._crawlers,
- doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
- ":meth:`crawl` and managed by this class."
- )
- def __init__(self, settings=None):
- if isinstance(settings, dict) or settings is None:
- settings = Settings(settings)
- self.settings = settings
- self.spider_loader = _get_spider_loader(settings)
- self._crawlers = set()
- self._active = set()
- self.bootstrap_failed = False
- @property
- def spiders(self):
- warnings.warn("CrawlerRunner.spiders attribute is renamed to "
- "CrawlerRunner.spider_loader.",
- category=ScrapyDeprecationWarning, stacklevel=2)
- return self.spider_loader
- def crawl(self, crawler_or_spidercls, *args, **kwargs):
- """
- Run a crawler with the provided arguments.
- It will call the given Crawler's :meth:`~Crawler.crawl` method, while
- keeping track of it so it can be stopped later.
- If ``crawler_or_spidercls`` isn't a :class:`~scrapy.crawler.Crawler`
- instance, this method will try to create one, using this parameter as
- the spider class or spider name to create it from.
- Returns a deferred that is fired when the crawling is finished.
- :param crawler_or_spidercls: an already created crawler, or a spider class
- or spider name inside the project, used to create it
- :type crawler_or_spidercls: :class:`~scrapy.crawler.Crawler` instance,
- :class:`~scrapy.spiders.Spider` subclass or string
- :param list args: arguments to initialize the spider
- :param dict kwargs: keyword arguments to initialize the spider
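- For instance (``MySpider`` and its ``category`` argument are hypothetical)::
-
-     runner.crawl(MySpider, category='electronics')
-     runner.crawl('my_spider_name', category='books')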
- """
- if isinstance(crawler_or_spidercls, Spider):
- raise ValueError(
- 'The crawler_or_spidercls argument cannot be a spider object, '
- 'it must be a spider class (or a Crawler object)')
- crawler = self.create_crawler(crawler_or_spidercls)
- return self._crawl(crawler, *args, **kwargs)
- def _crawl(self, crawler, *args, **kwargs):
- self.crawlers.add(crawler)
- d = crawler.crawl(*args, **kwargs)
- self._active.add(d)
- def _done(result):
- self.crawlers.discard(crawler)
- self._active.discard(d)
- self.bootstrap_failed |= not getattr(crawler, 'spider', None)
- return result
- return d.addBoth(_done)
- def create_crawler(self, crawler_or_spidercls):
- """
- Return a :class:`~scrapy.crawler.Crawler` object.
- * If ``crawler_or_spidercls`` is a Crawler, it is returned as-is.
- * If ``crawler_or_spidercls`` is a Spider subclass, a new Crawler
- is constructed for it.
- * If ``crawler_or_spidercls`` is a string, this function finds
- a spider with this name in a Scrapy project (using spider loader),
- then creates a Crawler instance for it.
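- For example (``MySpider`` is hypothetical, and ``'myspider'`` assumes a
- project spider registered under that name)::
-
-     crawler = runner.create_crawler(MySpider)
-     crawler = runner.create_crawler('myspider')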
- """
- if isinstance(crawler_or_spidercls, Spider):
- raise ValueError(
- 'The crawler_or_spidercls argument cannot be a spider object, '
- 'it must be a spider class (or a Crawler object)')
- if isinstance(crawler_or_spidercls, Crawler):
- return crawler_or_spidercls
- return self._create_crawler(crawler_or_spidercls)
- def _create_crawler(self, spidercls):
- if isinstance(spidercls, six.string_types):
- spidercls = self.spider_loader.load(spidercls)
- return Crawler(spidercls, self.settings)
- def stop(self):
- """
- Simultaneously stops all the crawling jobs currently taking place.
- Returns a deferred that is fired when they all have ended.
- """
- return defer.DeferredList([c.stop() for c in list(self.crawlers)])
- @defer.inlineCallbacks
- def join(self):
- """
- join()
- Returns a deferred that is fired when all managed :attr:`crawlers` have
- completed their execution.
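- A sketch of crawling several spiders and waiting for all of them, assuming
- ``reactor`` is imported from ``twisted.internet`` and ``Spider1`` and
- ``Spider2`` are hypothetical spider classes::
-
-     runner.crawl(Spider1)
-     runner.crawl(Spider2)
-     d = runner.join()
-     d.addBoth(lambda _: reactor.stop())
-     reactor.run()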
- """
- while self._active:
- yield defer.DeferredList(self._active)
- class CrawlerProcess(CrawlerRunner):
- """
- A class to run multiple Scrapy crawlers in a single process simultaneously.
- This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
- for starting a Twisted `reactor`_ and handling shutdown signals, like the
- keyboard interrupt command Ctrl-C. It also configures top-level logging.
- This utility should be a better fit than
- :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
- Twisted `reactor`_ within your application.
- The CrawlerProcess object must be instantiated with a
- :class:`~scrapy.settings.Settings` object.
- :param install_root_handler: whether to install root logging handler
- (default: True)
- This class shouldn't be needed (since Scrapy is responsible for using it
- accordingly) unless writing scripts that manually handle the crawling
- process. See :ref:`run-from-script` for an example.
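- Example, with a hypothetical spider class ``MySpider``::
-
-     from scrapy.crawler import CrawlerProcess
-
-     process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0 (compatible; MyBot)'})
-     process.crawl(MySpider)
-     process.start()  # blocks here until all crawling jobs are finished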
- """
- def __init__(self, settings=None, install_root_handler=True):
- super(CrawlerProcess, self).__init__(settings)
- install_shutdown_handlers(self._signal_shutdown)
- configure_logging(self.settings, install_root_handler)
- log_scrapy_info(self.settings)
- def _signal_shutdown(self, signum, _):
- install_shutdown_handlers(self._signal_kill)
- signame = signal_names[signum]
- logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
- {'signame': signame})
- reactor.callFromThread(self._graceful_stop_reactor)
- def _signal_kill(self, signum, _):
- install_shutdown_handlers(signal.SIG_IGN)
- signame = signal_names[signum]
- logger.info('Received %(signame)s twice, forcing unclean shutdown',
- {'signame': signame})
- reactor.callFromThread(self._stop_reactor)
- def start(self, stop_after_crawl=True):
- """
- This method starts a Twisted `reactor`_, adjusts its pool size to
- :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
- on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.
- If ``stop_after_crawl`` is True, the reactor will be stopped after all
- crawlers have finished, using :meth:`join`.
- :param boolean stop_after_crawl: whether to stop the reactor when all
- crawlers have finished
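- To keep the reactor running once the crawls are done (for example to
- schedule further crawls later), pass ``stop_after_crawl=False`` and stop
- the reactor yourself from a callback::
-
-     process.start(stop_after_crawl=False)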
- """
- if stop_after_crawl:
- d = self.join()
- # Don't start the reactor if the deferreds are already fired
- if d.called:
- return
- d.addBoth(self._stop_reactor)
- reactor.installResolver(self._get_dns_resolver())
- tp = reactor.getThreadPool()
- tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
- reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
- reactor.run(installSignalHandlers=False) # blocking call
- def _get_dns_resolver(self):
- if self.settings.getbool('DNSCACHE_ENABLED'):
- cache_size = self.settings.getint('DNSCACHE_SIZE')
- else:
- cache_size = 0
- return CachingThreadedResolver(
- reactor=reactor,
- cache_size=cache_size,
- timeout=self.settings.getfloat('DNS_TIMEOUT')
- )
- def _graceful_stop_reactor(self):
- d = self.stop()
- d.addBoth(self._stop_reactor)
- return d
- def _stop_reactor(self, _=None):
- try:
- reactor.stop()
- except RuntimeError: # raised if already stopped or in shutdown stage
- pass
- def _get_spider_loader(settings):
- """ Get SpiderLoader instance from settings """
- cls_path = settings.get('SPIDER_LOADER_CLASS')
- loader_cls = load_object(cls_path)
- try:
- verifyClass(ISpiderLoader, loader_cls)
- except DoesNotImplement:
- warnings.warn(
- 'SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does '
- 'not fully implement scrapy.interfaces.ISpiderLoader interface. '
- 'Please add all missing methods to avoid unexpected runtime errors.',
- category=ScrapyDeprecationWarning, stacklevel=2
- )
- return loader_cls.from_settings(settings.frozencopy())
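- # A custom loader pointed to by SPIDER_LOADER_CLASS has to provide the
- # ISpiderLoader methods. A rough, hypothetical sketch (DictSpiderLoader and
- # its spider dict are illustrative, not part of Scrapy):
- #
- #     from zope.interface import implementer
- #     from scrapy.interfaces import ISpiderLoader
- #     from scrapy.utils.url import url_is_from_spider
- #
- #     @implementer(ISpiderLoader)
- #     class DictSpiderLoader(object):
- #         def __init__(self, spiders):
- #             self._spiders = spiders  # {spider name: spider class}
- #
- #         @classmethod
- #         def from_settings(cls, settings):
- #             return cls({})  # populate from wherever your spiders live
- #
- #         def load(self, spider_name):
- #             return self._spiders[spider_name]
- #
- #         def list(self):
- #             return list(self._spiders)
- #
- #         def find_by_request(self, request):
- #             return [name for name, cls in self._spiders.items()
- #                     if url_is_from_spider(request.url, cls)]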