```python
import logging

from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class AutoThrottle(object):
    """Extension that dynamically adjusts download delays, per downloader
    slot, based on the latencies observed in downloaded responses."""

    def __init__(self, crawler):
        self.crawler = crawler
        # The extension is opt-in: bail out unless explicitly enabled.
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def _spider_opened(self, spider):
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        # Seed the spider with the configured start delay.
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
        # A spider-level download_delay takes precedence over the setting.
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        # download_latency is stamped on the request by the downloader.
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)

    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""

        # If a server needs `latency` seconds to respond, then we should send
        # a request every `latency/N` seconds to have N requests processed in
        # parallel.
        target_delay = latency / self.target_concurrency

        # Adjust the delay to make it closer to target_delay.
        new_delay = (slot.delay + target_delay) / 2.0

        # If the target delay is bigger than the old delay, use it instead of
        # the mean; this works better with problematic sites.
        new_delay = max(target_delay, new_delay)

        # Make sure self.mindelay <= new_delay <= self.maxdelay.
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)

        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirections)
        # are usually small and so tend to reduce latency, provoking a
        # positive feedback loop that reduces the delay instead of
        # increasing it.
        if response.status != 200 and new_delay <= slot.delay:
            return

        slot.delay = new_delay
```
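To make the policy concrete, here is a minimal, self-contained sketch of the arithmetic in `_adjust_delay`. The `Slot` class and `adjust_delay` function are hypothetical stand-ins for illustration, not part of Scrapy's API; the default arguments mirror Scrapy's documented setting defaults.

```python
# Standalone sketch of the adjustment arithmetic; `Slot` is a hypothetical
# stand-in for Scrapy's downloader slot, not the real class.
class Slot:
    def __init__(self, delay):
        self.delay = delay

def adjust_delay(slot, latency, status, target_concurrency=1.0,
                 mindelay=0.0, maxdelay=60.0):
    target_delay = latency / target_concurrency        # ideal inter-request gap
    new_delay = (slot.delay + target_delay) / 2.0      # move halfway toward it
    new_delay = max(target_delay, new_delay)           # never undershoot the target
    new_delay = min(max(mindelay, new_delay), maxdelay)  # clamp to [min, max]
    if status != 200 and new_delay <= slot.delay:      # errors may only raise delay
        return
    slot.delay = new_delay

slot = Slot(delay=5.0)   # 5 s, the default AUTOTHROTTLE_START_DELAY
adjust_delay(slot, latency=0.5, status=200)
print(slot.delay)        # 2.75  -- halfway toward target_delay = 0.5
adjust_delay(slot, latency=0.5, status=200)
print(slot.delay)        # 1.625 -- converges toward 0.5 on each 200 response
```

Because each 200 response moves the delay halfway toward `latency / target_concurrency`, the delay converges geometrically toward the target, while non-200 responses can only ever push it upward.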
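Since the extension reads all of its knobs from the crawler settings, a `settings.py` sketch like the following would activate it. The setting names are exactly the ones the code above reads; the values are illustrative, not recommendations.

```python
# settings.py -- illustrative values; the setting names match those read by
# AutoThrottle.__init__ and the _min/_max/_start_delay helpers above.
AUTOTHROTTLE_ENABLED = True            # otherwise __init__ raises NotConfigured
AUTOTHROTTLE_START_DELAY = 5.0         # initial spider.download_delay (seconds)
AUTOTHROTTLE_MAX_DELAY = 60.0          # upper clamp applied in _adjust_delay
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # desired average parallel requests per slot
AUTOTHROTTLE_DEBUG = True              # log per-response throttling stats
DOWNLOAD_DELAY = 0.0                   # lower clamp when the spider sets no download_delay
```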