regex.py 1.3 KB

import re

from six.moves.urllib.parse import urljoin

from w3lib.html import remove_tags, replace_entities, replace_escape_chars, get_base_url

from scrapy.link import Link
from .sgml import SgmlLinkExtractor

# Captures the href value, the rest of the opening tag, and the anchor text.
linkre = re.compile(
    r"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
    re.DOTALL | re.IGNORECASE)


def clean_link(link_text):
    """Remove leading and trailing whitespace and punctuation"""
    return link_text.strip("\t\r\n '\"\x0c")


class RegexLinkExtractor(SgmlLinkExtractor):
    """High-performance link extractor"""

    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        def clean_text(text):
            # Strip tags and escape characters from the anchor text.
            return replace_escape_chars(remove_tags(text.decode(response_encoding))).strip()

        def clean_url(url):
            clean_url = ''
            try:
                # Resolve the entity-decoded, unquoted href against the base URL.
                clean_url = urljoin(base_url, replace_entities(clean_link(url.decode(response_encoding))))
            except ValueError:
                pass
            return clean_url

        if base_url is None:
            base_url = get_base_url(response_text, response_url, response_encoding)

        links_text = linkre.findall(response_text)
        return [Link(clean_url(url).encode(response_encoding),
                     clean_text(text))
                for url, _, text in links_text]
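
For reference, a minimal, self-contained sketch of what the pattern and clean_link() yield for a hypothetical HTML snippet (the sample markup and variable names below are illustrative, not part of the module); _extract_links() additionally strips tags from the anchor text and joins the href against the page's base URL:

import re

# Same pattern as above, compiled locally so the snippet runs on its own.
linkre = re.compile(
    r"<a\s.*?href=(\"[.#]+?\"|\'[.#]+?\'|[^\s]+?)(>|\s.*?>)(.*?)<[/ ]?a>",
    re.DOTALL | re.IGNORECASE)

def clean_link(link_text):
    """Remove leading and trailing whitespace and punctuation"""
    return link_text.strip("\t\r\n '\"\x0c")

# Hypothetical markup; in the module itself the input is the raw response body.
html = '<p>See <a href="/docs/index.html" class="nav"> the docs </a>.</p>'

for url, _, text in linkre.findall(html):
    print(clean_link(url), repr(text))
    # -> /docs/index.html ' the docs '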