  1. """
  2. This module contains general purpose URL functions not found in the standard
  3. library.
  4. Some of the functions that used to be imported from this module have been moved
  5. to the w3lib.url module. Always import those from there instead.
  6. """
  7. import posixpath
  8. import re
  9. from six.moves.urllib.parse import (ParseResult, urldefrag, urlparse, urlunparse)
  10. # scrapy.utils.url was moved to w3lib.url and import * ensures this
  11. # move doesn't break old code
  12. from w3lib.url import *
  13. from w3lib.url import _safe_chars, _unquotepath
  14. from scrapy.utils.python import to_unicode


def url_is_from_any_domain(url, domains):
    """Return True if the url belongs to any of the given domains"""
    host = parse_url(url).netloc.lower()
    if not host:
        return False
    domains = [d.lower() for d in domains]
    return any((host == d) or (host.endswith('.%s' % d)) for d in domains)
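
# A hedged usage sketch (illustrative URLs, not part of the original module).
# Matching is done on whole dot-separated labels, so a domain never matches a
# bare suffix of an unrelated host:
#
# >>> url_is_from_any_domain("http://www.example.com/some/page", ["example.com"])
# True
# >>> url_is_from_any_domain("http://notexample.com/", ["example.com"])
# False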


def url_is_from_spider(url, spider):
    """Return True if the url belongs to the given spider"""
    return url_is_from_any_domain(url,
        [spider.name] + list(getattr(spider, 'allowed_domains', [])))
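
# A hedged sketch with a minimal stand-in spider; MySpider and its domains
# are hypothetical, chosen only to illustrate that both the spider name and
# its allowed_domains are checked:
#
# >>> class MySpider(object):
# ...     name = 'example.com'
# ...     allowed_domains = ['example.org']
# >>> url_is_from_spider("http://www.example.org/page", MySpider)
# True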


def url_has_any_extension(url, extensions):
    """Return True if the url path's file extension (with its leading dot,
    e.g. '.html') is one of the given extensions."""
    return posixpath.splitext(parse_url(url).path)[1].lower() in extensions
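
# Illustrative call (not from the original source); the extension is
# lowercased before the membership test, so the comparison is
# case-insensitive:
#
# >>> url_has_any_extension("http://example.com/page.HTML", {'.html', '.htm'})
# True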


def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        return url
    return urlparse(to_unicode(url, encoding))
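
# A hedged sketch (illustrative values): an already-parsed ParseResult is
# returned unchanged, so it is safe to call parse_url on either form:
#
# >>> parse_url("http://example.com/a?b=c").netloc
# 'example.com'
# >>> parsed = parse_url("http://example.com/a?b=c")
# >>> parse_url(parsed) is parsed
# True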


def escape_ajax(url):
    """
    Return the crawlable url according to:
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    >>> escape_ajax("www.example.com/ajax.html#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?k1=v1&k2=v2#!key=value")
    'www.example.com/ajax.html?k1=v1&k2=v2&_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html?#!key=value")
    'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
    >>> escape_ajax("www.example.com/ajax.html#!")
    'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not "AJAX crawlable" (according to Google) are returned
    as-is:

    >>> escape_ajax("www.example.com/ajax.html#key=value")
    'www.example.com/ajax.html#key=value'
    >>> escape_ajax("www.example.com/ajax.html#")
    'www.example.com/ajax.html#'
    >>> escape_ajax("www.example.com/ajax.html")
    'www.example.com/ajax.html'
    """
    defrag, frag = urldefrag(url)
    if not frag.startswith('!'):
        return url
    return add_or_replace_parameter(defrag, '_escaped_fragment_', frag[1:])


def add_http_if_no_scheme(url):
    """Add http as the default scheme if it is missing from the url."""
    match = re.match(r"^\w+://", url, flags=re.I)
    if not match:
        parts = urlparse(url)
        scheme = "http:" if parts.netloc else "http://"
        url = scheme + url
    return url
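
# Hedged examples (illustrative URLs): protocol-relative input already has
# "//", so only "http:" is prepended; a bare hostname gets the full prefix;
# an existing scheme is left untouched:
#
# >>> add_http_if_no_scheme("www.example.com")
# 'http://www.example.com'
# >>> add_http_if_no_scheme("//www.example.com")
# 'http://www.example.com'
# >>> add_http_if_no_scheme("https://www.example.com")
# 'https://www.example.com'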


def guess_scheme(url):
    """Add a URL scheme if missing: file:// for filepath-like input or
    http:// otherwise."""
    parts = urlparse(url)
    if parts.scheme:
        return url
    # Note: this does not match Windows filepaths
    if re.match(r'''^               # start with...
                    (
                        \.          # ...a single dot,
                        (
                            \. | [^/\.]+  # optionally followed by
                        )?          # either a second dot or some characters
                    )?              # optional match of ".", ".." or ".blabla"
                    /               # at least one "/" for a file path,
                    .               # and something after the "/"
                    ''', parts.path, flags=re.VERBOSE):
        return any_to_uri(url)
    else:
        return add_http_if_no_scheme(url)
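
# A hedged sketch (illustrative inputs). Filepath-like input is handed to
# w3lib's any_to_uri, which resolves relative paths against the current
# working directory, so the exact file:// output is environment-dependent:
#
# >>> guess_scheme("example.com")
# 'http://example.com'
# >>> guess_scheme("/index.html")       # on a POSIX system
# 'file:///index.html'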


def strip_url(url, strip_credentials=True, strip_default_port=True,
              origin_only=False, strip_fragment=True):
    """Strip the given URL string of some of its components:

    - ``strip_credentials`` removes "user:password@"
    - ``strip_default_port`` removes ":80" (resp. ":443", ":21")
      from http:// (resp. https://, ftp://) URLs
    - ``origin_only`` replaces the path component with "/", also dropping
      the query and fragment components; it also strips credentials
    - ``strip_fragment`` drops any #fragment component
    """
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    if (strip_credentials or origin_only) and (parsed_url.username or parsed_url.password):
        netloc = netloc.split('@')[-1]
    if strip_default_port and parsed_url.port:
        if (parsed_url.scheme, parsed_url.port) in (('http', 80),
                                                    ('https', 443),
                                                    ('ftp', 21)):
            netloc = netloc.replace(':{p.port}'.format(p=parsed_url), '')
    return urlunparse((
        parsed_url.scheme,
        netloc,
        '/' if origin_only else parsed_url.path,
        '' if origin_only else parsed_url.params,
        '' if origin_only else parsed_url.query,
        '' if strip_fragment else parsed_url.fragment
    ))
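
# Hedged usage sketches (illustrative URLs only):
#
# >>> strip_url("http://user:pass@www.example.com:80/index.php?a=1#frag")
# 'http://www.example.com/index.php?a=1'
# >>> strip_url("https://www.example.com/index.php?a=1#frag", origin_only=True)
# 'https://www.example.com/'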