"""
Offsite Spider Middleware

See documentation in docs/topics/spider-middleware.rst
"""
import re
import logging
import warnings

from scrapy import signals
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached

logger = logging.getLogger(__name__)
  12. class OffsiteMiddleware(object):
  13. def __init__(self, stats):
  14. self.stats = stats
  15. @classmethod
  16. def from_crawler(cls, crawler):
  17. o = cls(crawler.stats)
  18. crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
  19. return o
  20. def process_spider_output(self, response, result, spider):
  21. for x in result:
  22. if isinstance(x, Request):
  23. if x.dont_filter or self.should_follow(x, spider):
  24. yield x
  25. else:
  26. domain = urlparse_cached(x).hostname
  27. if domain and domain not in self.domains_seen:
  28. self.domains_seen.add(domain)
  29. logger.debug(
  30. "Filtered offsite request to %(domain)r: %(request)s",
  31. {'domain': domain, 'request': x}, extra={'spider': spider})
  32. self.stats.inc_value('offsite/domains', spider=spider)
  33. self.stats.inc_value('offsite/filtered', spider=spider)
  34. else:
  35. yield x
  36. def should_follow(self, request, spider):
  37. regex = self.host_regex
  38. # hostname can be None for wrong urls (like javascript links)
  39. host = urlparse_cached(request).hostname or ''
  40. return bool(regex.search(host))
  41. def get_host_regex(self, spider):
  42. """Override this method to implement a different offsite policy"""
  43. allowed_domains = getattr(spider, 'allowed_domains', None)
  44. if not allowed_domains:
  45. return re.compile('') # allow all by default
  46. url_pattern = re.compile("^https?://.*$")
  47. for domain in allowed_domains:
  48. if url_pattern.match(domain):
  49. message = ("allowed_domains accepts only domains, not URLs. "
  50. "Ignoring URL entry %s in allowed_domains." % domain)
  51. warnings.warn(message, URLWarning)
  52. domains = [re.escape(d) for d in allowed_domains if d is not None]
  53. regex = r'^(.*\.)?(%s)$' % '|'.join(domains)
  54. return re.compile(regex)
  55. def spider_opened(self, spider):
  56. self.host_regex = self.get_host_regex(spider)
  57. self.domains_seen = set()
  58. class URLWarning(Warning):
  59. pass