123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202 |
import re
import sys
# Generic URI splitter from appendix B of RFC 3986
# (http://www.ietf.org/rfc/rfc3986.txt).  Every component is optional, so
# the pattern matches any string.  Capture groups:
#   2 = scheme, 4 = authority, 5 = path,
#   6 = "?" + query, 7 = query, 8 = "#" + fragment, 9 = fragment
uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
uri_re = re.compile(uri_pattern)

# Matches a single character that is neither reserved, unreserved nor "%":
#
# gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
#
# sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
#             / "*" / "+" / "," / ";" / "="
#
# unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
#
# The pattern is a raw string so that the "\]" reaches the regex engine
# verbatim instead of relying on an unrecognised string escape.
uri_illegal_char_re = re.compile(
    r"[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]", re.UNICODE)

# Splits an authority into: 1 = userinfo including the trailing "@",
# 2 = host, 3 = port including the leading ":".
authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?'
authority_re = re.compile(authority_pattern)

# Matches one percent-escape; group 1 holds the two hex digits.
pct_encoded_pattern = r'%([0-9A-Fa-f]{2})'
pct_encoded_re = re.compile(pct_encoded_pattern)
# Inclusive (first, last) code point ranges of the RFC 3987 "ucschar" and
# "iprivate" productions.  The entries below all lie in the Basic
# Multilingual Plane and are therefore usable on every interpreter build.
UCSCHAR = [
    (0xA0, 0xD7FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
]

IPRIVATE = [
    (0xE000, 0xF8FF),
]

# Supplementary-plane ranges are only added when the interpreter can
# represent those code points as single characters.  A narrow (UCS-2) build
# reports sys.maxunicode == 0xFFFF and keeps just the BMP ranges above.
if sys.maxunicode > 0xFFFF:
    UCSCHAR.extend([
        (0x10000, 0x1FFFD),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
        (0x40000, 0x4FFFD),
        (0x50000, 0x5FFFD),
        (0x60000, 0x6FFFD),
        (0x70000, 0x7FFFD),
        (0x80000, 0x8FFFD),
        (0x90000, 0x9FFFD),
        (0xA0000, 0xAFFFD),
        (0xB0000, 0xBFFFD),
        (0xC0000, 0xCFFFD),
        (0xD0000, 0xDFFFD),
        (0xE1000, 0xEFFFD),
    ])
    IPRIVATE.extend([
        (0xF0000, 0xFFFFD),
        (0x100000, 0x10FFFD),
    ])
# Lookup table indexed by octet value (0-255): True exactly for the
# "unreserved" characters ALPHA / DIGIT / "-" / "." / "_" / "~".
_unreserved = [False] * 256
for _ in map(ord, '-._~'
                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  '0123456789'
                  'abcdefghijklmnopqrstuvwxyz'):
    _unreserved[_] = True
# Character class covering every code point range listed in UCSCHAR and
# IPRIVATE; each (first, last) pair becomes one "first-last" span inside
# the brackets.
_escapeme_re = re.compile('[%s]' % (''.join(
    u'%s-%s' % (unichr(first), unichr(last))
    for (first, last) in UCSCHAR + IPRIVATE),))
def _pct_escape_unicode(char_match):
    """Percent-encode the UTF-8 octets of the text matched by *char_match*.

    Written as a replacement callback for ``re.sub``.

    :param char_match: match object whose whole match is the text to escape.
    :returns: the concatenation of one ``%XX`` triplet (upper-case hex) per
        UTF-8 octet of the matched text.
    """
    # Iterating a bytearray yields integer octets on every Python version,
    # so no ord() call on the individual items is needed.
    octets = bytearray(char_match.group().encode('utf-8'))
    # %02X keeps every escape exactly two hex digits, even for octets < 0x10.
    return ''.join(['%%%02X' % (octet,) for octet in octets])
def _pct_encoded_replace_unreserved(mo):
    """Normalize one percent-escape; replacement callback for ``re.sub``.

    :param mo: match object whose group 1 holds the two hex digits.
    :returns: the decoded character when the escape encodes an entry that is
        True in ``_unreserved``; otherwise the escape with its hex digits
        upper-cased.  If a ``ValueError`` is raised along the way the
        matched text is returned unchanged.
    """
    escape = mo.group()
    try:
        code = int(mo.group(1), 16)
        return chr(code) if _unreserved[code] else escape.upper()
    except ValueError:
        return escape
def _pct_encoded_replace(mo):
    """Decode one percent-escape; replacement callback for ``re.sub``.

    :param mo: match object whose group 1 holds the two hex digits.
    :returns: the character with that code, or the matched text unchanged
        when the conversion raises ``ValueError``.
    """
    hex_digits = mo.group(1)
    try:
        decoded = chr(int(hex_digits, 16))
    except ValueError:
        return mo.group()
    return decoded
def remove_dot_segments(path):
    """Remove "." and ".." segments from a URI path.

    The input is consumed from the left.  Each step either discards a dot
    segment (a ".." that follows a "/" also drops the most recently kept
    segment, if there is one) or moves the next segment, together with its
    leading "/", to the output.

    :param path: the path component of a URI.
    :returns: the path with all dot segments resolved.
    """
    kept = []
    rest = path
    while rest:
        # Leading "../" or "./": drop the prefix.
        if rest.startswith('../'):
            rest = rest[3:]
            continue
        if rest.startswith('./'):
            rest = rest[2:]
            continue

        # "/./" prefix or a final "/.": collapse to a single "/".
        if rest.startswith('/./') or rest == '/.':
            rest = '/' + rest[3:]
            continue

        # "/../" prefix or a final "/..": collapse to "/" and step back one
        # output segment.
        if rest.startswith('/../') or rest == '/..':
            rest = '/' + rest[4:]
            if kept:
                kept.pop()
            continue

        # Nothing but a dot segment is left.
        if rest in ('..', '.'):
            rest = ''
            continue

        # Move everything up to (not including) the next "/" to the output.
        # Searching from index 1 skips a leading "/" so it stays attached to
        # its segment.
        cut = rest.find('/', 1)
        if cut == -1:
            cut = len(rest)
        kept.append(rest[:cut])
        rest = rest[cut:]

    return ''.join(kept)
def urinorm(uri):
    """Normalize an absolute http or https URI.

    Processing, in order:

    * ``unicode`` input has every character matched by ``_escapeme_re``
      percent-escaped and is then encoded to ASCII;
    * the scheme is lower-cased and must be ``http`` or ``https``;
    * the host is lower-cased; a host containing "%" additionally has its
      percent-escapes decoded and is then IDNA-encoded;
    * an empty port (":") and the scheme's default port (":80" for http,
      ":443" for https) are dropped;
    * in the path, escapes of unreserved characters are decoded, all other
      escapes are upper-cased, dot segments are removed and an empty path
      becomes "/";
    * query and fragment are copied through unchanged.

    :param uri: the URI to normalize (``str`` or ``unicode``).
    :returns: the normalized URI.
    :raises ValueError: if the URI contains an illegal character, has no
        scheme, uses a scheme other than http/https, or has no authority.
    """
    if isinstance(uri, unicode):
        escaped = _escapeme_re.sub(_pct_escape_unicode, uri)
        uri = escaped.encode('ascii')

    bad_char = uri_illegal_char_re.search(uri)
    if bad_char:
        raise ValueError('Illegal characters in URI: %r at position %s' %
                         (bad_char.group(), bad_char.start()))

    parts = uri_re.match(uri)

    # Scheme
    scheme = parts.group(2)
    if scheme is None:
        raise ValueError('No scheme specified')
    scheme = scheme.lower()
    if scheme != 'http' and scheme != 'https':
        raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,))

    # Authority: userinfo (with "@"), host, port (with ":")
    authority = parts.group(4)
    if authority is None:
        raise ValueError('Not an absolute URI: %r' % (uri,))

    authority_parts = authority_re.match(authority)
    if authority_parts is None:
        raise ValueError('URI does not have a valid authority: %r' % (uri,))
    userinfo, host, port = authority_parts.groups()
    userinfo = userinfo or ''

    host = host.lower()
    if '%' in host:
        host = pct_encoded_re.sub(_pct_encoded_replace, host)
        host = unicode(host, 'utf-8').encode('idna')

    # The scheme is known to be http or https at this point.
    default_port = {'http': ':80', 'https': ':443'}[scheme]
    if not port or port in (':', default_port):
        port = ''

    # Path
    path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, parts.group(5))
    path = remove_dot_segments(path) or '/'

    # Query and fragment keep their leading "?" / "#".
    query = parts.group(6) or ''
    fragment = parts.group(8) or ''

    return ''.join(
        [scheme, '://', userinfo, host, port, path, query, fragment])
|