throttle.py

import logging

from scrapy.exceptions import NotConfigured
from scrapy import signals

logger = logging.getLogger(__name__)


class AutoThrottle(object):

    def __init__(self, crawler):
        self.crawler = crawler
        if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
            raise NotConfigured

        self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
        self.target_concurrency = crawler.settings.getfloat("AUTOTHROTTLE_TARGET_CONCURRENCY")
        crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(self._response_downloaded, signal=signals.response_downloaded)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def _spider_opened(self, spider):
        self.mindelay = self._min_delay(spider)
        self.maxdelay = self._max_delay(spider)
        spider.download_delay = self._start_delay(spider)

    def _min_delay(self, spider):
        s = self.crawler.settings
        return getattr(spider, 'download_delay', s.getfloat('DOWNLOAD_DELAY'))

    def _max_delay(self, spider):
        return self.crawler.settings.getfloat('AUTOTHROTTLE_MAX_DELAY')

    def _start_delay(self, spider):
        return max(self.mindelay, self.crawler.settings.getfloat('AUTOTHROTTLE_START_DELAY'))

    def _response_downloaded(self, response, request, spider):
        key, slot = self._get_slot(request, spider)
        latency = request.meta.get('download_latency')
        if latency is None or slot is None:
            return

        olddelay = slot.delay
        self._adjust_delay(slot, latency, response)
        if self.debug:
            diff = slot.delay - olddelay
            size = len(response.body)
            conc = len(slot.transferring)
            logger.info(
                "slot: %(slot)s | conc:%(concurrency)2d | "
                "delay:%(delay)5d ms (%(delaydiff)+d) | "
                "latency:%(latency)5d ms | size:%(size)6d bytes",
                {
                    'slot': key, 'concurrency': conc,
                    'delay': slot.delay * 1000, 'delaydiff': diff * 1000,
                    'latency': latency * 1000, 'size': size
                },
                extra={'spider': spider}
            )

    def _get_slot(self, request, spider):
        key = request.meta.get('download_slot')
        return key, self.crawler.engine.downloader.slots.get(key)
    def _adjust_delay(self, slot, latency, response):
        """Define delay adjustment policy"""
        # If a server needs `latency` seconds to respond then
        # we should send a request each `latency/N` seconds
        # to have N requests processed in parallel
        target_delay = latency / self.target_concurrency

        # Adjust the delay to make it closer to target_delay
        new_delay = (slot.delay + target_delay) / 2.0

        # If the target delay is bigger than the old delay, use it instead of
        # the mean. It works better with problematic sites.
        new_delay = max(target_delay, new_delay)

        # Make sure self.mindelay <= new_delay <= self.maxdelay
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)

        # Don't adjust the delay if the response status != 200 and the new
        # delay is smaller than the old one, as error pages (and redirections)
        # are usually small and so tend to reduce latency, provoking positive
        # feedback by reducing the delay instead of increasing it.
        if response.status != 200 and new_delay <= slot.delay:
            return

        slot.delay = new_delay
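
To make the policy in _adjust_delay concrete, here is a minimal standalone sketch of the same arithmetic; the adjusted_delay function and the sample numbers are illustrative, not part of the extension:

def adjusted_delay(current_delay, latency, target_concurrency,
                   min_delay, max_delay, status=200):
    # Restatement of the policy above, for illustration only.
    target_delay = latency / target_concurrency        # delay that yields N parallel requests
    new_delay = (current_delay + target_delay) / 2.0   # move halfway toward the target
    new_delay = max(target_delay, new_delay)           # never undershoot the target
    new_delay = min(max(min_delay, new_delay), max_delay)  # clamp to [min, max]
    if status != 200 and new_delay <= current_delay:   # don't speed up on error pages
        return current_delay
    return new_delay

# A 0.5 s latency at target concurrency 2.0 gives target_delay = 0.25 s;
# starting from a 0.75 s delay: max(0.25, (0.75 + 0.25) / 2) = 0.5 s.
assert adjusted_delay(0.75, 0.5, 2.0, 0.0, 60.0) == 0.5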
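
The extension is driven entirely by settings. Below is a minimal sketch of a settings.py fragment using the setting names read above; the EXTENSIONS entry is only needed if this file is a local copy rather than the built-in extension, and 'myproject.throttle' is a placeholder module path:

# settings.py (sketch)
AUTOTHROTTLE_ENABLED = True            # without this, __init__ raises NotConfigured
AUTOTHROTTLE_DEBUG = True              # log one line per adjusted response
AUTOTHROTTLE_START_DELAY = 5.0         # seconds; used by _start_delay()
AUTOTHROTTLE_MAX_DELAY = 60.0          # seconds; upper clamp in _adjust_delay()
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # desired average parallel requests per slot
DOWNLOAD_DELAY = 0.25                  # seconds; lower bound via _min_delay()

EXTENSIONS = {'myproject.throttle.AutoThrottle': 0}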