closespider.py

  1. """CloseSpider is an extension that forces spiders to be closed after certain
  2. conditions are met.
  3. See documentation in docs/topics/extensions.rst
  4. """
  5. from collections import defaultdict
  6. from twisted.internet import reactor
  7. from scrapy import signals
  8. from scrapy.exceptions import NotConfigured
  9. class CloseSpider(object):
  10. def __init__(self, crawler):
  11. self.crawler = crawler
  12. self.close_on = {
  13. 'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
  14. 'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
  15. 'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
  16. 'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
  17. }
  18. if not any(self.close_on.values()):
  19. raise NotConfigured
  20. self.counter = defaultdict(int)
  21. if self.close_on.get('errorcount'):
  22. crawler.signals.connect(self.error_count, signal=signals.spider_error)
  23. if self.close_on.get('pagecount'):
  24. crawler.signals.connect(self.page_count, signal=signals.response_received)
  25. if self.close_on.get('timeout'):
  26. crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
  27. if self.close_on.get('itemcount'):
  28. crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
  29. crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
  30. @classmethod
  31. def from_crawler(cls, crawler):
  32. return cls(crawler)
  33. def error_count(self, failure, response, spider):
  34. self.counter['errorcount'] += 1
  35. if self.counter['errorcount'] == self.close_on['errorcount']:
  36. self.crawler.engine.close_spider(spider, 'closespider_errorcount')
  37. def page_count(self, response, request, spider):
  38. self.counter['pagecount'] += 1
  39. if self.counter['pagecount'] == self.close_on['pagecount']:
  40. self.crawler.engine.close_spider(spider, 'closespider_pagecount')
  41. def spider_opened(self, spider):
  42. self.task = reactor.callLater(self.close_on['timeout'], \
  43. self.crawler.engine.close_spider, spider, \
  44. reason='closespider_timeout')
  45. def item_scraped(self, item, spider):
  46. self.counter['itemcount'] += 1
  47. if self.counter['itemcount'] == self.close_on['itemcount']:
  48. self.crawler.engine.close_spider(spider, 'closespider_itemcount')
  49. def spider_closed(self, spider):
  50. task = getattr(self, 'task', False)
  51. if task and task.active():
  52. task.cancel()
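
The extension raises NotConfigured (and is skipped) unless at least one of the
four CLOSESPIDER_* settings is non-zero. A minimal sketch of driving it from a
project's settings.py follows; the setting names come straight from the code
above, while the limit values are purely illustrative:

    # settings.py (illustrative values)
    CLOSESPIDER_TIMEOUT = 3600      # seconds after spider_opened; 0 disables
    CLOSESPIDER_ITEMCOUNT = 500     # stop after this many scraped items
    CLOSESPIDER_PAGECOUNT = 0       # 0 leaves the page-count condition off
    CLOSESPIDER_ERRORCOUNT = 10     # stop after this many spider errors

Whichever limit trips first determines the reason string
('closespider_timeout', 'closespider_itemcount', and so on) passed to
engine.close_spider, which Scrapy records as the crawl's finish reason.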