# logstats.py
  1. import logging
  2. from twisted.internet import task
  3. from scrapy.exceptions import NotConfigured
  4. from scrapy import signals
  5. logger = logging.getLogger(__name__)
  6. class LogStats(object):
  7. """Log basic scraping stats periodically"""
  8. def __init__(self, stats, interval=60.0):
  9. self.stats = stats
  10. self.interval = interval
  11. self.multiplier = 60.0 / self.interval
  12. self.task = None
  13. @classmethod
  14. def from_crawler(cls, crawler):
  15. interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
  16. if not interval:
  17. raise NotConfigured
  18. o = cls(crawler.stats, interval)
  19. crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
  20. crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
  21. return o
  22. def spider_opened(self, spider):
  23. self.pagesprev = 0
  24. self.itemsprev = 0
  25. self.task = task.LoopingCall(self.log, spider)
  26. self.task.start(self.interval)
  27. def log(self, spider):
  28. items = self.stats.get_value('item_scraped_count', 0)
  29. pages = self.stats.get_value('response_received_count', 0)
  30. irate = (items - self.itemsprev) * self.multiplier
  31. prate = (pages - self.pagesprev) * self.multiplier
  32. self.pagesprev, self.itemsprev = pages, items
  33. msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
  34. "scraped %(items)d items (at %(itemrate)d items/min)")
  35. log_args = {'pages': pages, 'pagerate': prate,
  36. 'items': items, 'itemrate': irate}
  37. logger.info(msg, log_args, extra={'spider': spider})
  38. def spider_closed(self, spider, reason):
  39. if self.task and self.task.running:
  40. self.task.stop()