  1. """
  2. SGMLParser-based Link extractors
  3. """
  4. import six
  5. from six.moves.urllib.parse import urljoin
  6. import warnings
  7. from sgmllib import SGMLParser
  8. from w3lib.url import safe_url_string, canonicalize_url
  9. from w3lib.html import strip_html5_whitespace
  10. from scrapy.link import Link
  11. from scrapy.linkextractors import FilteringLinkExtractor
  12. from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
  13. from scrapy.utils.python import unique as unique_list, to_unicode
  14. from scrapy.utils.response import get_base_url
  15. from scrapy.exceptions import ScrapyDeprecationWarning
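

# Note: ``sgmllib`` exists only on Python 2 (it was removed from the
# standard library in Python 3), hence the deprecation warnings below
# pointing users at scrapy.linkextractors.LinkExtractor instead.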
class BaseSgmlLinkExtractor(SGMLParser):

    def __init__(self, tag="a", attr="href", unique=False, process_value=None,
                 strip=True, canonicalized=False):
        warnings.warn(
            "BaseSgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning, stacklevel=2,
        )
        SGMLParser.__init__(self)
        self.scan_tag = tag if callable(tag) else lambda t: t == tag
        self.scan_attr = attr if callable(attr) else lambda a: a == attr
        self.process_value = (lambda v: v) if process_value is None else process_value
        self.current_link = None
        self.unique = unique
        self.strip = strip
        if canonicalized:
            self.link_key = lambda link: link.url
        else:
            self.link_key = lambda link: canonicalize_url(link.url,
                                                          keep_fragments=True)
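
    # Extraction happens in three phases: feed() drives the SGMLParser
    # callbacks below, which collect Link objects; relative URLs are then
    # resolved against the page URL (or a <base href> if one was seen);
    # finally URLs and anchor text are normalized for the response encoding.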
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, six.text_type):
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
            ret.append(link)

        return ret
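
    # When unique=True, links are deduplicated by self.link_key: by default
    # the key is the canonicalized URL (fragments kept), so URLs that differ
    # only in, e.g., query-argument order collapse into one link.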
    def _process_links(self, links):
        """ Normalize and filter extracted links

        Subclasses should override this if necessary
        """
        return unique_list(links, key=self.link_key) if self.unique else links

    def extract_links(self, response):
        # wrapper needed to allow working with text directly
        links = self._extract_links(response.body, response.url, response.encoding)
        links = self._process_links(links)
        return links
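
    # reset() is invoked both by SGMLParser.__init__ and explicitly at the
    # start of every _extract_links() call, so per-page state never leaks
    # between responses.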
    def reset(self):
        SGMLParser.reset(self)
        self.links = []
        self.base_url = None
        self.current_link = None
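
    # SGMLParser calls unknown_starttag/unknown_endtag/handle_data for tags
    # and text it has no dedicated handler for; here they capture link URLs
    # and accumulate the anchor text between the opening and closing tags.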
    def unknown_starttag(self, tag, attrs):
        if tag == 'base':
            self.base_url = dict(attrs).get('href')
        if self.scan_tag(tag):
            for attr, value in attrs:
                if self.scan_attr(attr):
                    if self.strip and value is not None:
                        value = strip_html5_whitespace(value)
                    url = self.process_value(value)
                    if url is not None:
                        link = Link(url=url, nofollow=rel_has_nofollow(dict(attrs).get('rel')))
                        self.links.append(link)
                        self.current_link = link

    def unknown_endtag(self, tag):
        if self.scan_tag(tag):
            self.current_link = None

    def handle_data(self, data):
        if self.current_link:
            self.current_link.text = self.current_link.text + data

    def matches(self, url):
        """This extractor matches any url, since
        it doesn't contain any patterns"""
        return True
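

# SgmlLinkExtractor layers the generic FilteringLinkExtractor (allow/deny
# patterns, domain and extension filters, restrict_xpaths/restrict_css)
# on top of the raw BaseSgmlLinkExtractor above.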
class SgmlLinkExtractor(FilteringLinkExtractor):

    def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
                 tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
                 process_value=None, deny_extensions=None, restrict_css=(),
                 strip=True, restrict_text=()):
        warnings.warn(
            "SgmlLinkExtractor is deprecated and will be removed in future releases. "
            "Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning, stacklevel=2,
        )
        tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
        tag_func = lambda x: x in tags
        attr_func = lambda x: x in attrs
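
        # Build the underlying SGML extractor without re-emitting its own
        # deprecation warning; the user has already been warned above.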
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', ScrapyDeprecationWarning)
            lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                                       unique=unique, process_value=process_value,
                                       strip=strip, canonicalized=canonicalize)

        super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
            allow_domains=allow_domains, deny_domains=deny_domains,
            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
            canonicalize=canonicalize, deny_extensions=deny_extensions,
            restrict_text=restrict_text)
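
    # When restrict_xpaths is set, only the markup inside the matching
    # regions is fed to the SGML parser; since any <base href> tag is lost
    # in that slice, the base URL is computed from the full response first.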
    def extract_links(self, response):
        base_url = None
        if self.restrict_xpaths:
            base_url = get_base_url(response)
            body = u''.join(f
                            for x in self.restrict_xpaths
                            for f in response.xpath(x).getall()
                            ).encode(response.encoding, errors='xmlcharrefreplace')
        else:
            body = response.body
        links = self._extract_links(body, response.url, response.encoding, base_url)
        links = self._process_links(links)
        return links
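

# A minimal usage sketch (hypothetical URL and markup, shown for
# illustration only; runs only under Python 2, where sgmllib exists):
#
#     from scrapy.http import HtmlResponse
#     response = HtmlResponse(
#         url='http://example.com/',
#         body=b'<a href="/page?id=1" rel="nofollow">Page 1</a>',
#         encoding='utf-8',
#     )
#     lx = SgmlLinkExtractor(allow=(r'/page',))
#     for link in lx.extract_links(response):
#         print(link.url, link.text, link.nofollow)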