# spiderstate.py
import os

from six.moves import cPickle as pickle

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.job import job_dir
  6. class SpiderState(object):
  7. """Store and load spider state during a scraping job"""
  8. def __init__(self, jobdir=None):
  9. self.jobdir = jobdir
  10. @classmethod
  11. def from_crawler(cls, crawler):
  12. jobdir = job_dir(crawler.settings)
  13. if not jobdir:
  14. raise NotConfigured
  15. obj = cls(jobdir)
  16. crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
  17. crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
  18. return obj
  19. def spider_closed(self, spider):
  20. if self.jobdir:
  21. with open(self.statefn, 'wb') as f:
  22. pickle.dump(spider.state, f, protocol=2)
  23. def spider_opened(self, spider):
  24. if self.jobdir and os.path.exists(self.statefn):
  25. with open(self.statefn, 'rb') as f:
  26. spider.state = pickle.load(f)
  27. else:
  28. spider.state = {}
  29. @property
  30. def statefn(self):
  31. return os.path.join(self.jobdir, 'spider.state')