# sitemap.py
  1. import re
  2. import logging
  3. import six
  4. from scrapy.spiders import Spider
  5. from scrapy.http import Request, XmlResponse
  6. from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
  7. from scrapy.utils.gz import gunzip, gzip_magic_number
  8. logger = logging.getLogger(__name__)
  9. class SitemapSpider(Spider):
  10. sitemap_urls = ()
  11. sitemap_rules = [('', 'parse')]
  12. sitemap_follow = ['']
  13. sitemap_alternate_links = False
  14. def __init__(self, *a, **kw):
  15. super(SitemapSpider, self).__init__(*a, **kw)
  16. self._cbs = []
  17. for r, c in self.sitemap_rules:
  18. if isinstance(c, six.string_types):
  19. c = getattr(self, c)
  20. self._cbs.append((regex(r), c))
  21. self._follow = [regex(x) for x in self.sitemap_follow]
  22. def start_requests(self):
  23. for url in self.sitemap_urls:
  24. yield Request(url, self._parse_sitemap)
  25. def sitemap_filter(self, entries):
  26. """This method can be used to filter sitemap entries by their
  27. attributes, for example, you can filter locs with lastmod greater
  28. than a given date (see docs).
  29. """
  30. for entry in entries:
  31. yield entry
  32. def _parse_sitemap(self, response):
  33. if response.url.endswith('/robots.txt'):
  34. for url in sitemap_urls_from_robots(response.text, base_url=response.url):
  35. yield Request(url, callback=self._parse_sitemap)
  36. else:
  37. body = self._get_sitemap_body(response)
  38. if body is None:
  39. logger.warning("Ignoring invalid sitemap: %(response)s",
  40. {'response': response}, extra={'spider': self})
  41. return
  42. s = Sitemap(body)
  43. it = self.sitemap_filter(s)
  44. if s.type == 'sitemapindex':
  45. for loc in iterloc(it, self.sitemap_alternate_links):
  46. if any(x.search(loc) for x in self._follow):
  47. yield Request(loc, callback=self._parse_sitemap)
  48. elif s.type == 'urlset':
  49. for loc in iterloc(it, self.sitemap_alternate_links):
  50. for r, c in self._cbs:
  51. if r.search(loc):
  52. yield Request(loc, callback=c)
  53. break
  54. def _get_sitemap_body(self, response):
  55. """Return the sitemap body contained in the given response,
  56. or None if the response is not a sitemap.
  57. """
  58. if isinstance(response, XmlResponse):
  59. return response.body
  60. elif gzip_magic_number(response):
  61. return gunzip(response.body)
  62. # actual gzipped sitemap files are decompressed above ;
  63. # if we are here (response body is not gzipped)
  64. # and have a response for .xml.gz,
  65. # it usually means that it was already gunzipped
  66. # by HttpCompression middleware,
  67. # the HTTP response being sent with "Content-Encoding: gzip"
  68. # without actually being a .xml.gz file in the first place,
  69. # merely XML gzip-compressed on the fly,
  70. # in other word, here, we have plain XML
  71. elif response.url.endswith('.xml') or response.url.endswith('.xml.gz'):
  72. return response.body
  73. def regex(x):
  74. if isinstance(x, six.string_types):
  75. return re.compile(x)
  76. return x
  77. def iterloc(it, alt=False):
  78. for d in it:
  79. yield d['loc']
  80. # Also consider alternate URLs (xhtml:link rel="alternate")
  81. if alt and 'alternate' in d:
  82. for l in d['alternate']:
  83. yield l