123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101 |
- import re
- import logging
- import six
- from scrapy.spiders import Spider
- from scrapy.http import Request, XmlResponse
- from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
- from scrapy.utils.gz import gunzip, gzip_magic_number
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
class SitemapSpider(Spider):
    """Spider that discovers pages to crawl by walking sitemap documents.

    Starts from ``sitemap_urls`` (plain sitemaps, sitemap indexes, or
    robots.txt files advertising sitemaps), follows nested sitemaps whose
    URL matches any pattern in ``sitemap_follow``, and dispatches each
    page URL to the first callback whose ``sitemap_rules`` regex matches.
    """

    # Sitemap (or robots.txt) URLs the crawl starts from.
    sitemap_urls = ()
    # (regex, callback) pairs; a callback may be given as a method name.
    sitemap_rules = [('', 'parse')]
    # Regexes selecting which nested sitemaps to follow ('' matches all).
    sitemap_follow = ['']
    # Whether to also yield xhtml:link rel="alternate" URLs.
    sitemap_alternate_links = False

    def __init__(self, *a, **kw):
        super(SitemapSpider, self).__init__(*a, **kw)
        # Compile rule patterns and resolve string callbacks to bound
        # methods once, at construction time.
        self._cbs = []
        for pattern, callback in self.sitemap_rules:
            if isinstance(callback, six.string_types):
                callback = getattr(self, callback)
            self._cbs.append((regex(pattern), callback))
        self._follow = [regex(pattern) for pattern in self.sitemap_follow]

    def start_requests(self):
        """Issue one request per configured sitemap URL."""
        for url in self.sitemap_urls:
            yield Request(url, self._parse_sitemap)

    def sitemap_filter(self, entries):
        """This method can be used to filter sitemap entries by their
        attributes, for example, you can filter locs with lastmod greater
        than a given date (see docs).
        """
        for entry in entries:
            yield entry

    def _parse_sitemap(self, response):
        # A robots.txt response only advertises sitemap URLs: request
        # each of them and let this same callback handle the results.
        if response.url.endswith('/robots.txt'):
            for url in sitemap_urls_from_robots(response.text, base_url=response.url):
                yield Request(url, callback=self._parse_sitemap)
            return

        body = self._get_sitemap_body(response)
        if body is None:
            logger.warning("Ignoring invalid sitemap: %(response)s",
                           {'response': response}, extra={'spider': self})
            return

        sitemap = Sitemap(body)
        entries = self.sitemap_filter(sitemap)
        if sitemap.type == 'sitemapindex':
            # A sitemap index points at further sitemaps: follow only
            # those whose URL matches some sitemap_follow pattern.
            for loc in iterloc(entries, self.sitemap_alternate_links):
                if any(follow.search(loc) for follow in self._follow):
                    yield Request(loc, callback=self._parse_sitemap)
        elif sitemap.type == 'urlset':
            # A urlset lists page URLs: first matching rule wins.
            for loc in iterloc(entries, self.sitemap_alternate_links):
                for pattern, callback in self._cbs:
                    if pattern.search(loc):
                        yield Request(loc, callback=callback)
                        break

    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response,
        or None if the response is not a sitemap.
        """
        if isinstance(response, XmlResponse):
            return response.body
        if gzip_magic_number(response):
            return gunzip(response.body)
        # actual gzipped sitemap files are decompressed above ;
        # if we are here (response body is not gzipped)
        # and have a response for .xml.gz,
        # it usually means that it was already gunzipped
        # by HttpCompression middleware,
        # the HTTP response being sent with "Content-Encoding: gzip"
        # without actually being a .xml.gz file in the first place,
        # merely XML gzip-compressed on the fly,
        # in other word, here, we have plain XML
        if response.url.endswith(('.xml', '.xml.gz')):
            return response.body
def regex(x):
    """Compile *x* into a regex pattern object if it is a string.

    Already-compiled patterns (or any non-string value) are returned
    untouched.
    """
    if not isinstance(x, six.string_types):
        return x
    return re.compile(x)
def iterloc(it, alt=False):
    """Yield the 'loc' URL of every sitemap entry in *it*.

    When *alt* is true, each entry's alternate URLs (coming from
    xhtml:link rel="alternate" elements) are yielded right after its
    primary loc.
    """
    for entry in it:
        yield entry['loc']
        if alt:
            for alternate_url in entry.get('alternate', ()):
                yield alternate_url
|