ajaxcrawl.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
import logging

import six
from w3lib import html

from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse

logger = logging.getLogger(__name__)


class AjaxCrawlMiddleware(object):
    """
    Handle 'AJAX crawlable' pages marked as crawlable via meta tag.
    For more info see https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
    """

    def __init__(self, settings):
        if not settings.getbool('AJAXCRAWL_ENABLED'):
            raise NotConfigured

        # XXX: Google parses at least first 100k bytes; scrapy's redirect
        # middleware parses first 4k. 4k turns out to be insufficient
        # for this middleware, and parsing 100k could be slow.
        # We use something in between (32K) by default.
        self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
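
    # Example override (a sketch; AJAXCRAWL_MAXSIZE is the setting read
    # above): scanning more of the body trades speed for recall on pages
    # with a large <head>. In a project's settings.py:
    #
    #   AJAXCRAWL_ENABLED = True
    #   AJAXCRAWL_MAXSIZE = 65536  # scan the first 64K instead of 32K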

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_response(self, request, response, spider):
        if not isinstance(response, HtmlResponse) or response.status != 200:
            return response

        if request.method != 'GET':
            # other HTTP methods are either not safe or don't have a body
            return response

        if 'ajax_crawlable' in request.meta:  # prevent loops
            return response

        if not self._has_ajax_crawlable_variant(response):
            return response

        # scrapy already handles #! links properly
        ajax_crawl_request = request.replace(url=request.url + '#!')
        logger.debug("Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
                     {'ajax_crawl_request': ajax_crawl_request, 'request': request},
                     extra={'spider': spider})

        ajax_crawl_request.meta['ajax_crawlable'] = True
        return ajax_crawl_request
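
    # Resulting flow (a sketch, assuming Scrapy 1.x behaviour where
    # scrapy.utils.url.escape_ajax is applied whenever a Request URL is
    # set, which is what "scrapy already handles #! links" refers to):
    #
    #   http://example.com/page    --replace(url + '#!')-->
    #   http://example.com/page#!  --escape_ajax-->
    #   http://example.com/page?_escaped_fragment_=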

    def _has_ajax_crawlable_variant(self, response):
        """
        Return True if a page without hash fragment could be "AJAX crawlable"
        according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
        """
        body = response.text[:self.lookup_bytes]
        return _has_ajaxcrawlable_meta(body)


# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(six.u(r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'))


def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
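

# Minimal usage sketch (assumptions: the middleware is built directly from a
# Settings object instead of via from_crawler, and spider is only used for
# logging here, so None is passed for illustration):
#
#   from scrapy.http import HtmlResponse, Request
#   from scrapy.settings import Settings
#
#   mw = AjaxCrawlMiddleware(Settings({'AJAXCRAWL_ENABLED': True}))
#   request = Request('http://example.com/')
#   response = HtmlResponse(request.url, body=b'<html><head>'
#                           b'<meta name="fragment" content="!">'
#                           b'</head><body></body></html>')
#   out = mw.process_response(request, response, spider=None)
#   # `out` is a new Request for the '#!' variant of the page, with
#   # out.meta['ajax_crawlable'] set to True to prevent a second rewrite.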