- """
- HTMLParser-based link extractor
- """
- import warnings
- import six
- from six.moves.html_parser import HTMLParser
- from six.moves.urllib.parse import urljoin
- from w3lib.url import safe_url_string
- from w3lib.html import strip_html5_whitespace
- from scrapy.link import Link
- from scrapy.utils.python import unique as unique_list
- from scrapy.exceptions import ScrapyDeprecationWarning


class HtmlParserLinkExtractor(HTMLParser):

    def __init__(self, tag="a", attr="href", process=None, unique=False,
                 strip=True):
        HTMLParser.__init__(self)

        warnings.warn(
            "HtmlParserLinkExtractor is deprecated and will be removed in "
            "future releases. Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning, stacklevel=2,
        )

        # tag, attr and process may each be a callable; plain values are
        # wrapped in lambdas so they can be used uniformly as a tag
        # predicate, an attribute predicate and a value transform
        self.scan_tag = tag if callable(tag) else lambda t: t == tag
        self.scan_attr = attr if callable(attr) else lambda a: a == attr
        self.process_attr = process if callable(process) else lambda v: v
        self.unique = unique
        self.strip = strip

    def _extract_links(self, response_text, response_url, response_encoding):
        self.reset()
        self.feed(response_text)
        self.close()

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        ret = []
        # a <base href="..."> tag, if seen, overrides the response URL as the
        # base for resolving relative links
        base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in links:
            if six.PY2 and isinstance(link.url, six.text_type):
                # only Python 2 needs the URL encoded to bytes here;
                # Python 3's urljoin cannot mix text and bytes arguments
                link.url = link.url.encode(response_encoding)
            try:
                link.url = urljoin(base_url, link.url)
            except ValueError:
                continue
            link.url = safe_url_string(link.url, response_encoding)
            if isinstance(link.text, bytes):
                link.text = link.text.decode(response_encoding)
            ret.append(link)

        return ret

    def extract_links(self, response):
        # wrapper needed so that _extract_links() can also be used directly with text
        return self._extract_links(response.body, response.url, response.encoding)

    def reset(self):
        HTMLParser.reset(self)

        self.base_url = None
        self.current_link = None
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'base':
            # remember the document base URL for _extract_links
            self.base_url = dict(attrs).get('href')
        if self.scan_tag(tag):
            for attr, value in attrs:
                if self.scan_attr(attr):
                    if self.strip:
                        value = strip_html5_whitespace(value)
                    url = self.process_attr(value)
                    link = Link(url=url)
                    self.links.append(link)
                    self.current_link = link

    def handle_endtag(self, tag):
        if self.scan_tag(tag):
            self.current_link = None

    def handle_data(self, data):
        if self.current_link:
            # accumulate character data seen inside the tag as the link text
            self.current_link.text = self.current_link.text + data

    def matches(self, url):
        """This extractor matches any url, since
        it doesn't contain any patterns"""
        return True
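

# A minimal usage sketch, not part of the original module: the extractor can
# be fed markup directly through _extract_links(), bypassing the Response
# wrapper in extract_links(). The HTML snippet, base URL and encoding below
# are illustrative assumptions.
if __name__ == "__main__":
    extractor = HtmlParserLinkExtractor()
    html = (
        '<html><body>'
        '<a href=" /page?id=1 ">Next page</a>'
        '<a href="http://other.example/">Elsewhere</a>'
        '</body></html>'
    )
    for link in extractor._extract_links(html, "http://example.com/", "utf-8"):
        # relative hrefs are stripped (strip=True) and resolved against the
        # base URL, e.g. http://example.com/page?id=1 'Next page'
        print(link.url, repr(link.text))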