123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288 |
- # coding: utf-8
- """
- Functions to convert unicode IRIs into ASCII byte string URIs and back. Exports
- the following items:
- - iri_to_uri()
- - uri_to_iri()
- """
- from __future__ import unicode_literals, division, absolute_import, print_function
- from encodings import idna # noqa
- import codecs
- import re
- import sys
- from ._errors import unwrap
- from ._types import byte_cls, str_cls, type_name, bytes_to_list, int_types
- if sys.version_info < (3,):
- from urlparse import urlsplit, urlunsplit
- from urllib import (
- quote as urlquote,
- unquote as unquote_to_bytes,
- )
- else:
- from urllib.parse import (
- quote as urlquote,
- unquote_to_bytes,
- urlsplit,
- urlunsplit,
- )
def iri_to_uri(value):
    """
    Normalizes and encodes a unicode IRI into an ASCII byte string URI

    The host is IDNA-encoded and every other component is percent-encoded
    via _urlquote(). The default port of http (80) and https (443) is
    dropped, and a lone "/" path with no query or fragment is removed.

    :param value:
        A unicode string of an IRI

    :raises:
        TypeError - when value is not a unicode string

    :return:
        A byte string of the ASCII-encoded URI
    """

    if not isinstance(value, str_cls):
        raise TypeError(unwrap(
            '''
            value must be a unicode string, not %s
            ''',
            type_name(value)
        ))

    scheme = None
    # Python 2.6 doesn't split properly if the URL doesn't start with
    # http:// or https://, so the real prefix is swapped for "http://" while
    # parsing and the real scheme is quoted separately
    if sys.version_info < (2, 7) and not value.startswith('http://') and not value.startswith('https://'):
        real_prefix = None
        prefix_match = re.match('^[^:]*://', value)
        if prefix_match:
            real_prefix = prefix_match.group(0)
            value = 'http://' + value[len(real_prefix):]
        parsed = urlsplit(value)
        if real_prefix:
            scheme = _urlquote(real_prefix[:-3])
    else:
        parsed = urlsplit(value)

    if scheme is None:
        scheme = _urlquote(parsed.scheme)

    hostname = parsed.hostname
    if hostname is not None:
        hostname = hostname.encode('idna')
        # The hostname attribute has the brackets of an IP literal stripped.
        # RFC 3986 requires them, otherwise the colons of the address are
        # indistinguishable from the port separator.
        if parsed.netloc.rpartition('@')[2].startswith('['):
            hostname = b'[' + hostname + b']'

    # RFC 3986 allows userinfo to contain sub-delims
    username = _urlquote(parsed.username, safe='!$&\'()*+,;=')
    password = _urlquote(parsed.password, safe='!$&\'()*+,;=')

    port = parsed.port
    if port is not None:
        port = str_cls(port).encode('ascii')

    netloc = b''
    if username is not None:
        netloc += username
        if password:
            netloc += b':' + password
        netloc += b'@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        # The default port of the scheme is left out of the normalized URI
        default_http = scheme == b'http' and port == b'80'
        default_https = scheme == b'https' and port == b'443'
        if not default_http and not default_https:
            netloc += b':' + port

    # RFC 3986 allows a path to contain sub-delims, plus "@" and ":"
    path = _urlquote(parsed.path, safe='/!$&\'()*+,;=@:')
    # RFC 3986 allows the query to contain sub-delims, plus "@", ":" , "/" and "?"
    query = _urlquote(parsed.query, safe='/?!$&\'()*+,;=@:')
    # RFC 3986 allows the fragment to contain sub-delims, plus "@", ":" , "/" and "?"
    fragment = _urlquote(parsed.fragment, safe='/?!$&\'()*+,;=@:')

    # A bare "/" path is equivalent to an empty path, so it is normalized away
    if query is None and fragment is None and path == b'/':
        path = None

    # Python 2.7 compat
    if path is None:
        path = ''

    output = urlunsplit((scheme, netloc, path, query, fragment))
    if isinstance(output, str_cls):
        output = output.encode('latin1')
    return output
def uri_to_iri(value):
    """
    Converts an ASCII URI byte string into a unicode IRI

    The host is decoded from IDNA and the remaining components are unquoted
    via _urlunquote(). Characters that would change the structure of a
    component stay percent-encoded: ":" and "@" in the userinfo, "/" in the
    path, and "&" and "=" in the query.

    :param value:
        An ASCII-encoded byte string of the URI

    :raises:
        TypeError - when value is not a byte string

    :return:
        A unicode string of the IRI
    """

    if not isinstance(value, byte_cls):
        raise TypeError(unwrap(
            '''
            value must be a byte string, not %s
            ''',
            type_name(value)
        ))

    parsed = urlsplit(value)

    scheme = parsed.scheme
    if scheme is not None:
        scheme = scheme.decode('ascii')

    username = _urlunquote(parsed.username, remap=[':', '@'])
    password = _urlunquote(parsed.password, remap=[':', '@'])

    hostname = parsed.hostname
    if hostname:
        hostname = hostname.decode('idna')
        # The hostname attribute has the brackets of an IP literal stripped.
        # RFC 3986 requires them, otherwise the colons of the address are
        # indistinguishable from the port separator.
        if parsed.netloc.rpartition(b'@')[2].startswith(b'['):
            hostname = '[' + hostname + ']'

    port = parsed.port
    # The port is only decoded when the parser did not already return an int
    if port and not isinstance(port, int_types):
        port = port.decode('ascii')

    netloc = ''
    if username is not None:
        netloc += username
        if password:
            netloc += ':' + password
        netloc += '@'
    if hostname is not None:
        netloc += hostname
    if port is not None:
        netloc += ':' + str_cls(port)

    # Literal separators are preserved, while encoded ones stay encoded
    path = _urlunquote(parsed.path, remap=['/'], preserve=True)
    query = _urlunquote(parsed.query, remap=['&', '='], preserve=True)
    fragment = _urlunquote(parsed.fragment)

    return urlunsplit((scheme, netloc, path, query, fragment))
- def _iri_utf8_errors_handler(exc):
- """
- Error handler for decoding UTF-8 parts of a URI into an IRI. Leaves byte
- sequences encoded in %XX format, but as part of a unicode string.
- :param exc:
- The UnicodeDecodeError exception
- :return:
- A 2-element tuple of (replacement unicode string, integer index to
- resume at)
- """
- bytes_as_ints = bytes_to_list(exc.object[exc.start:exc.end])
- replacements = ['%%%02x' % num for num in bytes_as_ints]
- return (''.join(replacements), exc.end)
- codecs.register_error('iriutf8', _iri_utf8_errors_handler)
- def _urlquote(string, safe=''):
- """
- Quotes a unicode string for use in a URL
- :param string:
- A unicode string
- :param safe:
- A unicode string of character to not encode
- :return:
- None (if string is None) or an ASCII byte string of the quoted string
- """
- if string is None or string == '':
- return None
- # Anything already hex quoted is pulled out of the URL and unquoted if
- # possible
- escapes = []
- if re.search('%[0-9a-fA-F]{2}', string):
- # Try to unquote any percent values, restoring them if they are not
- # valid UTF-8. Also, requote any safe chars since encoded versions of
- # those are functionally different than the unquoted ones.
- def _try_unescape(match):
- byte_string = unquote_to_bytes(match.group(0))
- unicode_string = byte_string.decode('utf-8', 'iriutf8')
- for safe_char in list(safe):
- unicode_string = unicode_string.replace(safe_char, '%%%02x' % ord(safe_char))
- return unicode_string
- string = re.sub('(?:%[0-9a-fA-F]{2})+', _try_unescape, string)
- # Once we have the minimal set of hex quoted values, removed them from
- # the string so that they are not double quoted
- def _extract_escape(match):
- escapes.append(match.group(0).encode('ascii'))
- return '\x00'
- string = re.sub('%[0-9a-fA-F]{2}', _extract_escape, string)
- output = urlquote(string.encode('utf-8'), safe=safe.encode('utf-8'))
- if not isinstance(output, byte_cls):
- output = output.encode('ascii')
- # Restore the existing quoted values that we extracted
- if len(escapes) > 0:
- def _return_escape(_):
- return escapes.pop(0)
- output = re.sub(b'%00', _return_escape, output)
- return output
- def _urlunquote(byte_string, remap=None, preserve=None):
- """
- Unquotes a URI portion from a byte string into unicode using UTF-8
- :param byte_string:
- A byte string of the data to unquote
- :param remap:
- A list of characters (as unicode) that should be re-mapped to a
- %XX encoding. This is used when characters are not valid in part of a
- URL.
- :param preserve:
- A bool - indicates that the chars to be remapped if they occur in
- non-hex form, should be preserved. E.g. / for URL path.
- :return:
- A unicode string
- """
- if byte_string is None:
- return byte_string
- if byte_string == b'':
- return ''
- if preserve:
- replacements = ['\x1A', '\x1C', '\x1D', '\x1E', '\x1F']
- preserve_unmap = {}
- for char in remap:
- replacement = replacements.pop(0)
- preserve_unmap[replacement] = char
- byte_string = byte_string.replace(char.encode('ascii'), replacement.encode('ascii'))
- byte_string = unquote_to_bytes(byte_string)
- if remap:
- for char in remap:
- byte_string = byte_string.replace(char.encode('ascii'), ('%%%02x' % ord(char)).encode('ascii'))
- output = byte_string.decode('utf-8', 'iriutf8')
- if preserve:
- for replacement, original in preserve_unmap.items():
- output = output.replace(replacement, original)
- return output
|