# depth.py
  1. """
  2. Depth Spider Middleware
  3. See documentation in docs/topics/spider-middleware.rst
  4. """
  5. import logging
  6. from scrapy.http import Request
  7. logger = logging.getLogger(__name__)
  8. class DepthMiddleware(object):
  9. def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
  10. self.maxdepth = maxdepth
  11. self.stats = stats
  12. self.verbose_stats = verbose_stats
  13. self.prio = prio
  14. @classmethod
  15. def from_crawler(cls, crawler):
  16. settings = crawler.settings
  17. maxdepth = settings.getint('DEPTH_LIMIT')
  18. verbose = settings.getbool('DEPTH_STATS_VERBOSE')
  19. prio = settings.getint('DEPTH_PRIORITY')
  20. return cls(maxdepth, crawler.stats, verbose, prio)
  21. def process_spider_output(self, response, result, spider):
  22. def _filter(request):
  23. if isinstance(request, Request):
  24. depth = response.meta['depth'] + 1
  25. request.meta['depth'] = depth
  26. if self.prio:
  27. request.priority -= depth * self.prio
  28. if self.maxdepth and depth > self.maxdepth:
  29. logger.debug(
  30. "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
  31. {'maxdepth': self.maxdepth, 'requrl': request.url},
  32. extra={'spider': spider}
  33. )
  34. return False
  35. else:
  36. if self.verbose_stats:
  37. self.stats.inc_value('request_depth_count/%s' % depth,
  38. spider=spider)
  39. self.stats.max_value('request_depth_max', depth,
  40. spider=spider)
  41. return True
  42. # base case (depth=0)
  43. if 'depth' not in response.meta:
  44. response.meta['depth'] = 0
  45. if self.verbose_stats:
  46. self.stats.inc_value('request_depth_count/0', spider=spider)
  47. return (r for r in result or () if _filter(r))