urinorm.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
import re
import sys
  2. # from appendix B of rfc 3986 (http://www.ietf.org/rfc/rfc3986.txt)
  3. uri_pattern = r'^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?'
  4. uri_re = re.compile(uri_pattern)
  5. # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  6. #
  7. # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  8. # / "*" / "+" / "," / ";" / "="
  9. #
  10. # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
  11. uri_illegal_char_re = re.compile(
  12. "[^-A-Za-z0-9:/?#[\]@!$&'()*+,;=._~%]", re.UNICODE)
  13. authority_pattern = r'^([^@]*@)?([^:]*)(:.*)?'
  14. authority_re = re.compile(authority_pattern)
  15. pct_encoded_pattern = r'%([0-9A-Fa-f]{2})'
  16. pct_encoded_re = re.compile(pct_encoded_pattern)
  17. try:
  18. unichr(0x10000)
  19. except ValueError:
  20. # narrow python build
  21. UCSCHAR = [
  22. (0xA0, 0xD7FF),
  23. (0xF900, 0xFDCF),
  24. (0xFDF0, 0xFFEF),
  25. ]
  26. IPRIVATE = [
  27. (0xE000, 0xF8FF),
  28. ]
  29. else:
  30. UCSCHAR = [
  31. (0xA0, 0xD7FF),
  32. (0xF900, 0xFDCF),
  33. (0xFDF0, 0xFFEF),
  34. (0x10000, 0x1FFFD),
  35. (0x20000, 0x2FFFD),
  36. (0x30000, 0x3FFFD),
  37. (0x40000, 0x4FFFD),
  38. (0x50000, 0x5FFFD),
  39. (0x60000, 0x6FFFD),
  40. (0x70000, 0x7FFFD),
  41. (0x80000, 0x8FFFD),
  42. (0x90000, 0x9FFFD),
  43. (0xA0000, 0xAFFFD),
  44. (0xB0000, 0xBFFFD),
  45. (0xC0000, 0xCFFFD),
  46. (0xD0000, 0xDFFFD),
  47. (0xE1000, 0xEFFFD),
  48. ]
  49. IPRIVATE = [
  50. (0xE000, 0xF8FF),
  51. (0xF0000, 0xFFFFD),
  52. (0x100000, 0x10FFFD),
  53. ]
  54. _unreserved = [False] * 256
  55. for _ in range(ord('A'), ord('Z') + 1): _unreserved[_] = True
  56. for _ in range(ord('0'), ord('9') + 1): _unreserved[_] = True
  57. for _ in range(ord('a'), ord('z') + 1): _unreserved[_] = True
  58. _unreserved[ord('-')] = True
  59. _unreserved[ord('.')] = True
  60. _unreserved[ord('_')] = True
  61. _unreserved[ord('~')] = True
  62. _escapeme_re = re.compile('[%s]' % (''.join(
  63. map(lambda (m, n): u'%s-%s' % (unichr(m), unichr(n)),
  64. UCSCHAR + IPRIVATE)),))
  65. def _pct_escape_unicode(char_match):
  66. c = char_match.group()
  67. return ''.join(['%%%X' % (ord(octet),) for octet in c.encode('utf-8')])
  68. def _pct_encoded_replace_unreserved(mo):
  69. try:
  70. i = int(mo.group(1), 16)
  71. if _unreserved[i]:
  72. return chr(i)
  73. else:
  74. return mo.group().upper()
  75. except ValueError:
  76. return mo.group()
  77. def _pct_encoded_replace(mo):
  78. try:
  79. return chr(int(mo.group(1), 16))
  80. except ValueError:
  81. return mo.group()
  82. def remove_dot_segments(path):
  83. result_segments = []
  84. while path:
  85. if path.startswith('../'):
  86. path = path[3:]
  87. elif path.startswith('./'):
  88. path = path[2:]
  89. elif path.startswith('/./'):
  90. path = path[2:]
  91. elif path == '/.':
  92. path = '/'
  93. elif path.startswith('/../'):
  94. path = path[3:]
  95. if result_segments:
  96. result_segments.pop()
  97. elif path == '/..':
  98. path = '/'
  99. if result_segments:
  100. result_segments.pop()
  101. elif path == '..' or path == '.':
  102. path = ''
  103. else:
  104. i = 0
  105. if path[0] == '/':
  106. i = 1
  107. i = path.find('/', i)
  108. if i == -1:
  109. i = len(path)
  110. result_segments.append(path[:i])
  111. path = path[i:]
  112. return ''.join(result_segments)
  113. def urinorm(uri):
  114. if isinstance(uri, unicode):
  115. uri = _escapeme_re.sub(_pct_escape_unicode, uri).encode('ascii')
  116. illegal_mo = uri_illegal_char_re.search(uri)
  117. if illegal_mo:
  118. raise ValueError('Illegal characters in URI: %r at position %s' %
  119. (illegal_mo.group(), illegal_mo.start()))
  120. uri_mo = uri_re.match(uri)
  121. scheme = uri_mo.group(2)
  122. if scheme is None:
  123. raise ValueError('No scheme specified')
  124. scheme = scheme.lower()
  125. if scheme not in ('http', 'https'):
  126. raise ValueError('Not an absolute HTTP or HTTPS URI: %r' % (uri,))
  127. authority = uri_mo.group(4)
  128. if authority is None:
  129. raise ValueError('Not an absolute URI: %r' % (uri,))
  130. authority_mo = authority_re.match(authority)
  131. if authority_mo is None:
  132. raise ValueError('URI does not have a valid authority: %r' % (uri,))
  133. userinfo, host, port = authority_mo.groups()
  134. if userinfo is None:
  135. userinfo = ''
  136. if '%' in host:
  137. host = host.lower()
  138. host = pct_encoded_re.sub(_pct_encoded_replace, host)
  139. host = unicode(host, 'utf-8').encode('idna')
  140. else:
  141. host = host.lower()
  142. if port:
  143. if (port == ':' or
  144. (scheme == 'http' and port == ':80') or
  145. (scheme == 'https' and port == ':443')):
  146. port = ''
  147. else:
  148. port = ''
  149. authority = userinfo + host + port
  150. path = uri_mo.group(5)
  151. path = pct_encoded_re.sub(_pct_encoded_replace_unreserved, path)
  152. path = remove_dot_segments(path)
  153. if not path:
  154. path = '/'
  155. query = uri_mo.group(6)
  156. if query is None:
  157. query = ''
  158. fragment = uri_mo.group(8)
  159. if fragment is None:
  160. fragment = ''
  161. return scheme + '://' + authority + path + query + fragment