urllength.py 1.0 KB

12345678910111213141516171819202122232425262728293031323334353637
  1. """
  2. Url Length Spider Middleware
  3. See documentation in docs/topics/spider-middleware.rst
  4. """
  5. import logging
  6. from scrapy.http import Request
  7. from scrapy.exceptions import NotConfigured
  8. logger = logging.getLogger(__name__)
  class UrlLengthMiddleware(object):
      """Spider middleware that drops requests whose URL is too long.

      Every ``Request`` found in a spider's output whose URL has more
      characters than ``maxlength`` is discarded (and logged at debug
      level); everything else is passed through untouched.
      """

      def __init__(self, maxlength):
          """Store the maximum URL length that is allowed through."""
          self.maxlength = maxlength

      @classmethod
      def from_settings(cls, settings):
          """Build the middleware from the ``URLLENGTH_LIMIT`` setting.

          Raises ``NotConfigured`` when the setting reads as a falsy
          integer (e.g. ``0``).
          """
          limit = settings.getint('URLLENGTH_LIMIT')
          if limit:
              return cls(limit)
          raise NotConfigured

      def process_spider_output(self, response, result, spider):
          """Return a lazy generator over ``result`` minus over-long requests.

          A falsy ``result`` (such as ``None``) is treated as an empty
          iterable.
          """
          def _within_limit(item):
              # Only Request objects are subject to the length check.
              if not isinstance(item, Request):
                  return True
              if len(item.url) <= self.maxlength:
                  return True
              logger.debug("Ignoring link (url length > %(maxlength)d): %(url)s ",
                           {'maxlength': self.maxlength, 'url': item.url},
                           extra={'spider': spider})
              return False

          return (item for item in result or () if _within_limit(item))