12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- import logging
- from twisted.internet import task
- from scrapy.exceptions import NotConfigured
- from scrapy import signals
- logger = logging.getLogger(__name__)
class LogStats(object):
    """Log basic scraping stats periodically.

    While a spider is open, an INFO record is emitted every ``interval``
    seconds with the cumulative page and item counts read from ``stats``
    and the matching per-minute rates since the previous record.
    """

    def __init__(self, stats, interval=60.0):
        """Store the stats collector and the logging period.

        :param stats: object exposing ``get_value(key, default)``; it is
            queried for ``response_received_count`` and
            ``item_scraped_count``.
        :param interval: seconds between two log records. Must be non-zero
            because it is used as a divisor.
        """
        self.stats = stats
        self.interval = interval
        # Scales a per-interval delta to a per-minute rate.
        self.multiplier = 60.0 / self.interval
        # Counts seen at the previous tick. Set here as well as in
        # spider_opened() so that log() works on a fresh instance instead of
        # raising AttributeError.
        self.pagesprev = 0
        self.itemsprev = 0
        # LoopingCall that drives log(); created in spider_opened().
        self.task = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from ``crawler`` and hook it to spider signals.

        The period is read from the ``LOGSTATS_INTERVAL`` setting.

        :raises NotConfigured: if ``LOGSTATS_INTERVAL`` evaluates to a false
            value (for example ``0``).
        """
        interval = crawler.settings.getfloat('LOGSTATS_INTERVAL')
        if not interval:
            raise NotConfigured
        o = cls(crawler.stats, interval)
        crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
        return o

    def spider_opened(self, spider):
        """Reset the previous-tick counters and start the periodic logger."""
        # Reset on every open so rates are relative to this spider run.
        self.pagesprev = 0
        self.itemsprev = 0
        self.task = task.LoopingCall(self.log, spider)
        self.task.start(self.interval)

    def log(self, spider):
        """Emit one INFO record with current totals and per-minute rates.

        Rates are the growth of each counter since the previous call,
        multiplied by ``self.multiplier``.
        """
        items = self.stats.get_value('item_scraped_count', 0)
        pages = self.stats.get_value('response_received_count', 0)
        irate = (items - self.itemsprev) * self.multiplier
        prate = (pages - self.pagesprev) * self.multiplier
        # Remember the totals so the next call reports only the new delta.
        self.pagesprev, self.itemsprev = pages, items

        msg = ("Crawled %(pages)d pages (at %(pagerate)d pages/min), "
               "scraped %(items)d items (at %(itemrate)d items/min)")
        log_args = {'pages': pages, 'pagerate': prate,
                    'items': items, 'itemrate': irate}
        logger.info(msg, log_args, extra={'spider': spider})

    def spider_closed(self, spider, reason):
        """Stop the periodic logger if it was started and is still running."""
        if self.task and self.task.running:
            self.task.stop()
|