sitemap.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. """
  2. Module for processing Sitemaps.
  3. Note: The main purpose of this module is to provide support for the
  4. SitemapSpider; its API is subject to change without notice.
  5. """
  6. import lxml.etree
  7. from six.moves.urllib.parse import urljoin
  8. class Sitemap(object):
  9. """Class to parse Sitemap (type=urlset) and Sitemap Index
  10. (type=sitemapindex) files"""
  11. def __init__(self, xmltext):
  12. xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True, resolve_entities=False)
  13. self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
  14. rt = self._root.tag
  15. self.type = self._root.tag.split('}', 1)[1] if '}' in rt else rt
  16. def __iter__(self):
  17. for elem in self._root.getchildren():
  18. d = {}
  19. for el in elem.getchildren():
  20. tag = el.tag
  21. name = tag.split('}', 1)[1] if '}' in tag else tag
  22. if name == 'link':
  23. if 'href' in el.attrib:
  24. d.setdefault('alternate', []).append(el.get('href'))
  25. else:
  26. d[name] = el.text.strip() if el.text else ''
  27. if 'loc' in d:
  28. yield d
  29. def sitemap_urls_from_robots(robots_text, base_url=None):
  30. """Return an iterator over all sitemap urls contained in the given
  31. robots.txt file
  32. """
  33. for line in robots_text.splitlines():
  34. if line.lstrip().lower().startswith('sitemap:'):
  35. url = line.split(':', 1)[1].strip()
  36. yield urljoin(base_url, url)