_iri.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. # coding: utf-8
  2. """
  3. Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
  4. the following items:
  5. - iri_to_uri()
  6. - uri_to_iri()
  7. """
  8. from __future__ import unicode_literals, division, absolute_import, print_function
  9. from encodings import idna # noqa
  10. import codecs
  11. import re
  12. import sys
  13. from ._errors import unwrap
  14. from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types
  15. if sys.version_info < (3,):
  16. from urlparse import urlsplit, urlunsplit
  17. from urllib import (
  18. quote as urlquote,
  19. unquote as unquote_to_bytes,
  20. )
  21. else:
  22. from urllib.parse import (
  23. quote as urlquote,
  24. unquote_to_bytes,
  25. urlsplit,
  26. urlunsplit,
  27. )
  28. def iri_to_uri(value):
  29. """
  30. Normalizes and encodes a unicode IRI into an ASCII byte string URI
  31. :param value:
  32. A unicode string of an IRI
  33. :return:
  34. A byte string of the ASCII-encoded URI
  35. """
  36. if not isinstance(value, str_cls):
  37. raise TypeError(unwrap(
  38. '''
  39. value must be a unicode string, not %s
  40. ''',
  41. type_name(value)
  42. ))
  43. scheme = None
  44. # Python 2.6 doesn't split properly is the URL doesn't start with http:// or https://
  45. if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
  46. real_prefix = None
  47. prefix_match = re.match('^[^:]*://', value)
  48. if prefix_match:
  49. real_prefix = prefix_match.group(0)
  50. value = 'http://' + value[len(real_prefix):]
  51. parsed = urlsplit(value)
  52. if real_prefix:
  53. value = real_prefix + value[7:]
  54. scheme = _urlquote(real_prefix[:-3])
  55. else:
  56. parsed = urlsplit(value)
  57. if scheme is None:
  58. scheme = _urlquote(parsed.scheme)
  59. hostname = parsed.hostname
  60. if hostname is not None:
  61. hostname = hostname.encode('idna')
  62. # RFC 3986 allows userinfo to contain sub-delims
  63. username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
  64. password = _urlquote(parsed.password, safe='!$&\'()*+,;=')
  65. port = parsed.port
  66. if port is not None:
  67. port = str_cls(port).encode('ascii')
  68. netloc = b''
  69. if username is not None:
  70. netloc += username
  71. if password:
  72. netloc += b':' + password
  73. netloc += b'@'
  74. if hostname is not None:
  75. netloc += hostname
  76. if port is not None:
  77. default_http = scheme == b'http' and port == b'80'
  78. default_https = scheme == b'https' and port == b'443'
  79. if not default_http and not default_https:
  80. netloc += b':' + port
  81. # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
  82. path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
  83. # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
  84. query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
  85. # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
  86. fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')
  87. if query is None and fragment is None and path == b'/':
  88. path = None
  89. # Python 2.7 compat
  90. if path is None:
  91. path = ''
  92. output = urlunsplit((scheme, netloc, path, query, fragment))
  93. if isinstance(output, str_cls):
  94. output = output.encode('latin1')
  95. return output
  96. def uri_to_iri(value):
  97. """
  98. Converts an ASCII URI byte string into a unicode IRI
  99. :param value:
  100. An ASCII-encoded byte string of the URI
  101. :return:
  102. A unicode string of the IRI
  103. """
  104. if not isinstance(value, byte_cls):
  105. raise TypeError(unwrap(
  106. '''
  107. value must be a byte string, not %s
  108. ''',
  109. type_name(value)
  110. ))
  111. parsed = urlsplit(value)
  112. scheme = parsed.scheme
  113. if scheme is not None:
  114. scheme = scheme.decode('ascii')
  115. username = _urlunquote(parsed.username, remap=[':', '@'])
  116. password = _urlunquote(parsed.password, remap=[':', '@'])
  117. hostname = parsed.hostname
  118. if hostname:
  119. hostname = hostname.decode('idna')
  120. port = parsed.port
  121. if port and not isinstance(port, int_types):
  122. port = port.decode('ascii')
  123. netloc = ''
  124. if username is not None:
  125. netloc += username
  126. if password:
  127. netloc += ':' + password
  128. netloc += '@'
  129. if hostname is not None:
  130. netloc += hostname
  131. if port is not None:
  132. netloc += ':' + str_cls(port)
  133. path = _urlunquote(parsed.path, remap=['/'], preserve=True)
  134. query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
  135. fragment = _urlunquote(parsed.fragment)
  136. return urlunsplit((scheme, netloc, path, query, fragment))
  137. def _iri_utf8_errors_handler(exc):
  138. """
  139. Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte
  140. sequences encoded in %XX format, but as part of a unicode string.
  141. :param exc:
  142. The UnicodeDecodeError exception
  143. :return:
  144. A 2-element tuple of (replacement unicode string, integer index to
  145. resume at)
  146. """
  147. bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end])
  148. replacements = ['%%%02x' % num for num in bytes_as_ints]
  149. return (''.join(replacements), exc.end)
  150. codecs.register_error('iriutf8', _iri_utf8_errors_handler)
  151. def _urlquote(string, safe=''):
  152. """
  153. Quotes a unicode string for use in a URL
  154. :param string:
  155. A unicode string
  156. :param safe:
  157. A unicode string of character to not encode
  158. :return:
  159. None (if string is None) or an ASCII byte string of the quoted string
  160. """
  161. if string is None or string == '':
  162. return None
  163. # Anything already hex quoted is pulled out of the URL and unquoted if
  164. # possible
  165. escapes = []
  166. if re.search('%[0-9a-fA-F]{2}', string):
  167. # Try to unquote any percent values, restoring them if they are not
  168. # valid UTF-8. Also, requote any safe chars since encoded versions of
  169. # those are functionally different than the unquoted ones.
  170. def _try_unescape(match):
  171. byte_string = unquote_to_bytes(match.group(0))
  172. unicode_string = byte_string.decode('utf-8', 'iriutf8')
  173. for safe_char in list(safe):
  174. unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
  175. return unicode_string
  176. string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)
  177. # Once we have the minimal set of hex quoted values, removed them from
  178. # the string so that they are not double quoted
  179. def _extract_escape(match):
  180. escapes.append(match.group(0).encode('ascii'))
  181. return '\x00'
  182. string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string)
  183. output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8'))
  184. if not isinstance(output, byte_cls):
  185. output = output.encode('ascii')
  186. # Restore the existing quoted values that we extracted
  187. if len(escapes) > 0:
  188. def _return_escape(_):
  189. return escapes.pop(0)
  190. output = re.sub(b'%00', _return_escape, output)
  191. return output
  192. def _urlunquote(byte_string, remap=None, preserve=None):
  193. """
  194. Unquotes a URI portion from a byte string into unicode using UTF-8
  195. :param byte_string:
  196. A byte string of the data to unquote
  197. :param remap:
  198. A list of characters (as unicode) that should be re-mapped to a
  199. %XX encoding. This is used when characters are not valid in part of a
  200. URL.
  201. :param preserve:
  202. A bool - indicates that the chars to be remapped if they occur in
  203. non-hex form, should be preserved. E.g. / for URL path.
  204. :return:
  205. A unicode string
  206. """
  207. if byte_string is None:
  208. return byte_string
  209. if byte_string == b'':
  210. return ''
  211. if preserve:
  212. replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F']
  213. preserve_unmap = {}
  214. for char in remap:
  215. replacement = replacements.pop(0)
  216. preserve_unmap[replacement] = char
  217. byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))
  218. byte_string = unquote_to_bytes(byte_string)
  219. if remap:
  220. for char in remap:
  221. byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))
  222. output = byte_string.decode('utf-8', 'iriutf8')
  223. if preserve:
  224. for replacement, original in preserve_unmap.items():
  225. output = output.replace(replacement, original)
  226. return output