crawl.py

  1. """
  2. This modules implements the CrawlSpider which is the recommended spider to use
  3. for scraping typical web sites that requires crawling pages.
  4. See documentation in docs/topics/spiders.rst
  5. """

import copy
import warnings

import six

from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request, HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy.utils.python import get_func_args
from scrapy.utils.spider import iterate_spider_output


def _identity(request, response):
    # Default process_request: pass the request through unchanged.
    return request


def _get_method(method, spider):
    # Resolve a callable given either directly or as the name of a spider
    # method; returns None if the name cannot be resolved.
    if callable(method):
        return method
    elif isinstance(method, six.string_types):
        return getattr(spider, method, None)


_default_link_extractor = LinkExtractor()


class Rule(object):

    def __init__(self, link_extractor=None, callback=None, cb_kwargs=None,
                 follow=None, process_links=None, process_request=None):
        self.link_extractor = link_extractor or _default_link_extractor
        self.callback = callback
        self.cb_kwargs = cb_kwargs or {}
        self.process_links = process_links
        self.process_request = process_request or _identity
        self.process_request_argcount = None
        # By default, only follow links extracted by rules without a callback.
        self.follow = follow if follow is not None else not callback

    def _compile(self, spider):
        # Resolve callbacks given as method names against the spider instance.
        self.callback = _get_method(self.callback, spider)
        self.process_links = _get_method(self.process_links, spider)
        self.process_request = _get_method(self.process_request, spider)
        self.process_request_argcount = len(get_func_args(self.process_request))
        if self.process_request_argcount == 1:
            msg = ('Rule.process_request should accept two arguments '
                   '(request, response), accepting only one is deprecated')
            warnings.warn(msg, category=ScrapyDeprecationWarning, stacklevel=2)

    def _process_request(self, request, response):
        """
        Wrapper around the request processing function to maintain backward
        compatibility with functions that do not take a Response object
        """
        args = [request] if self.process_request_argcount == 1 else [request, response]
        return self.process_request(*args)
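

# Illustrative sketch, not part of the original module: with the current
# two-argument signature, a Rule.process_request callable receives both the
# request about to be scheduled and the response the link was extracted from.
# A one-argument callable still works, but Rule._compile above emits a
# ScrapyDeprecationWarning for it. The function name and meta key below are
# hypothetical examples, not Scrapy API.
def _example_process_request(request, response):
    # Record the URL of the page each followed link was found on.
    request.meta['found_on'] = response.url
    return request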


class CrawlSpider(Spider):

    rules = ()

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)
        self._compile_rules()

    def parse(self, response):
        return self._parse_response(response, self.parse_start_url, cb_kwargs={}, follow=True)

    def parse_start_url(self, response):
        # Override to process responses for the start URLs.
        return []

    def process_results(self, response, results):
        # Override to post-process the results returned by rule callbacks.
        return results

    def _build_request(self, rule, link):
        # `rule` is the index of the rule in self._rules; it is stored in the
        # request meta so _response_downloaded can look the rule up again.
        r = Request(url=link.url, callback=self._response_downloaded)
        r.meta.update(rule=rule, link_text=link.text)
        return r

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                request = self._build_request(n, link)
                yield rule._process_request(request, response)

    def _response_downloaded(self, response):
        rule = self._rules[response.meta['rule']]
        return self._parse_response(response, rule.callback, rule.cb_kwargs, rule.follow)

    def _parse_response(self, response, callback, cb_kwargs, follow=True):
        if callback:
            cb_res = callback(response, **cb_kwargs) or ()
            cb_res = self.process_results(response, cb_res)
            for requests_or_item in iterate_spider_output(cb_res):
                yield requests_or_item

        if follow and self._follow_links:
            for request_or_item in self._requests_to_follow(response):
                yield request_or_item

    def _compile_rules(self):
        self._rules = [copy.copy(r) for r in self.rules]
        for rule in self._rules:
            rule._compile(self)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CrawlSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._follow_links = crawler.settings.getbool(
            'CRAWLSPIDER_FOLLOW_LINKS', True)
        return spider
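

# Minimal usage sketch, not part of the original module: a concrete spider
# wiring Rule and LinkExtractor into CrawlSpider. The spider name, domain,
# URL patterns and parse_item callback are hypothetical.
class _ExampleCrawlSpider(CrawlSpider):
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # No callback, so follow defaults to True: keep crawling category pages.
        Rule(LinkExtractor(allow=r'/category/')),
        # Callback given, so follow defaults to False: just extract the items.
        Rule(LinkExtractor(allow=r'/item/'), callback='parse_item'),
    )

    def parse_item(self, response):
        # link_text is stored in request.meta by CrawlSpider._build_request.
        yield {
            'url': response.url,
            'link_text': response.meta.get('link_text'),
        }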