- """
- Depth Spider Middleware
- See documentation in docs/topics/spider-middleware.rst
- """
- import logging
- from scrapy.http import Request
- logger = logging.getLogger(__name__)
class DepthMiddleware(object):

    def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
        self.maxdepth = maxdepth
        self.stats = stats
        self.verbose_stats = verbose_stats
        self.prio = prio
    @classmethod
    def from_crawler(cls, crawler):
        # Build the middleware from the crawler's settings and stats collector.
        settings = crawler.settings
        maxdepth = settings.getint('DEPTH_LIMIT')
        verbose = settings.getbool('DEPTH_STATS_VERBOSE')
        prio = settings.getint('DEPTH_PRIORITY')
        return cls(maxdepth, crawler.stats, verbose, prio)
    def process_spider_output(self, response, result, spider):
        def _filter(request):
            # Only Request objects carry a depth; scraped items pass through untouched.
            if isinstance(request, Request):
                depth = response.meta['depth'] + 1
                request.meta['depth'] = depth
                if self.prio:
                    # Deeper requests get a lower priority, biasing the crawl breadth-first.
                    request.priority -= depth * self.prio
                if self.maxdepth and depth > self.maxdepth:
                    logger.debug(
                        "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
                        {'maxdepth': self.maxdepth, 'requrl': request.url},
                        extra={'spider': spider}
                    )
                    return False
                else:
                    if self.verbose_stats:
                        self.stats.inc_value('request_depth_count/%s' % depth,
                                             spider=spider)
                    self.stats.max_value('request_depth_max', depth,
                                         spider=spider)
            return True

        # base case (depth=0)
        if 'depth' not in response.meta:
            response.meta['depth'] = 0
            if self.verbose_stats:
                self.stats.inc_value('request_depth_count/0', spider=spider)

        return (r for r in result or () if _filter(r))
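
The three settings read in from_crawler control the behaviour above and are set in a project's settings.py. A minimal sketch follows; the values are illustrative, not Scrapy's defaults (the middleware itself ships enabled as a default spider middleware):

# settings.py (illustrative values)
DEPTH_LIMIT = 3             # drop requests more than 3 links away from the start URLs
DEPTH_PRIORITY = 1          # positive value lowers priority of deeper requests (breadth-first bias)
DEPTH_STATS_VERBOSE = True  # also collect per-level request_depth_count/<depth> stats

Inside a spider callback, the depth assigned by this middleware is available as response.meta['depth'].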