# -*- coding: utf-8 -*-
# Copyright (c) 2014 Rackspace
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
rfc3986.misc
~~~~~~~~~~~~
This module contains important constants, patterns, and compiled regular
expressions for parsing and validating URIs and their components.
"""
import re
  22. # These are enumerated for the named tuple used as a superclass of
  23. # URIReference
  24. URI_COMPONENTS = ['scheme', 'authority', 'path', 'query', 'fragment']
  25. important_characters = {
  26. 'generic_delimiters': ":/?#[]@",
  27. 'sub_delimiters': "!$&'()*+,;=",
  28. # We need to escape the '*' in this case
  29. 're_sub_delimiters': "!$&'()\*+,;=",
  30. 'unreserved_chars': ('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
  31. '0123456789._~-'),
  32. # We need to escape the '-' in this case:
  33. 're_unreserved': 'A-Za-z0-9._~\-',
  34. }
  35. # For details about delimiters and reserved characters, see:
  36. # http://tools.ietf.org/html/rfc3986#section-2.2
  37. GENERIC_DELIMITERS = set(important_characters['generic_delimiters'])
  38. SUB_DELIMITERS = set(important_characters['sub_delimiters'])
  39. RESERVED_CHARS = GENERIC_DELIMITERS.union(SUB_DELIMITERS)
  40. # For details about unreserved characters, see:
  41. # http://tools.ietf.org/html/rfc3986#section-2.3
  42. UNRESERVED_CHARS = set(important_characters['unreserved_chars'])
  43. NON_PCT_ENCODED = RESERVED_CHARS.union(UNRESERVED_CHARS).union('%')
  44. # Extracted from http://tools.ietf.org/html/rfc3986#appendix-B
  45. component_pattern_dict = {
  46. 'scheme': '[^:/?#]+',
  47. 'authority': '[^/?#]*',
  48. 'path': '[^?#]*',
  49. 'query': '[^#]*',
  50. 'fragment': '.*',
  51. }
  52. # See http://tools.ietf.org/html/rfc3986#appendix-B
  53. # In this case, we name each of the important matches so we can use
  54. # SRE_Match#groupdict to parse the values out if we so choose. This is also
  55. # modified to ignore other matches that are not important to the parsing of
  56. # the reference so we can also simply use SRE_Match#groups.
  57. expression = ('(?:(?P<scheme>{scheme}):)?(?://(?P<authority>{authority}))?'
  58. '(?P<path>{path})(?:\?(?P<query>{query}))?'
  59. '(?:#(?P<fragment>{fragment}))?'
  60. ).format(**component_pattern_dict)
  61. URI_MATCHER = re.compile(expression)
  62. # #########################
  63. # Authority Matcher Section
  64. # #########################
  65. # Host patterns, see: http://tools.ietf.org/html/rfc3986#section-3.2.2
  66. # The pattern for a regular name, e.g., www.google.com, api.github.com
  67. reg_name = '(({0})*|[{1}]*)'.format(
  68. '%[0-9A-Fa-f]{2}',
  69. important_characters['re_sub_delimiters'] +
  70. important_characters['re_unreserved']
  71. )
  72. # The pattern for an IPv4 address, e.g., 192.168.255.255, 127.0.0.1,
  73. ipv4 = '(\d{1,3}.){3}\d{1,3}'
  74. # Hexadecimal characters used in each piece of an IPv6 address
  75. hexdig = '[0-9A-Fa-f]{1,4}'
  76. # Least-significant 32 bits of an IPv6 address
  77. ls32 = '({hex}:{hex}|{ipv4})'.format(hex=hexdig, ipv4=ipv4)
  78. # Substitutions into the following patterns for IPv6 patterns defined
  79. # http://tools.ietf.org/html/rfc3986#page-20
  80. subs = {'hex': hexdig, 'ls32': ls32}
  81. # Below: h16 = hexdig, see: https://tools.ietf.org/html/rfc5234 for details
  82. # about ABNF (Augmented Backus-Naur Form) use in the comments
  83. variations = [
  84. # 6( h16 ":" ) ls32
  85. '(%(hex)s:){6}%(ls32)s' % subs,
  86. # "::" 5( h16 ":" ) ls32
  87. '::(%(hex)s:){5}%(ls32)s' % subs,
  88. # [ h16 ] "::" 4( h16 ":" ) ls32
  89. '(%(hex)s)?::(%(hex)s:){4}%(ls32)s' % subs,
  90. # [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
  91. '((%(hex)s:)?%(hex)s)?::(%(hex)s:){3}%(ls32)s' % subs,
  92. # [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
  93. '((%(hex)s:){0,2}%(hex)s)?::(%(hex)s:){2}%(ls32)s' % subs,
  94. # [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
  95. '((%(hex)s:){0,3}%(hex)s)?::%(hex)s:%(ls32)s' % subs,
  96. # [ *4( h16 ":" ) h16 ] "::" ls32
  97. '((%(hex)s:){0,4}%(hex)s)?::%(ls32)s' % subs,
  98. # [ *5( h16 ":" ) h16 ] "::" h16
  99. '((%(hex)s:){0,5}%(hex)s)?::%(hex)s' % subs,
  100. # [ *6( h16 ":" ) h16 ] "::"
  101. '((%(hex)s:){0,6}%(hex)s)?::' % subs,
  102. ]
  103. ipv6 = '(({0})|({1})|({2})|({3})|({4})|({5})|({6})|({7}))'.format(*variations)
  104. ipv_future = 'v[0-9A-Fa-f]+.[%s]+' % (
  105. important_characters['re_unreserved'] +
  106. important_characters['re_sub_delimiters'] +
  107. ':')
  108. ip_literal = '\[({0}|{1})\]'.format(ipv6, ipv_future)
  109. # Pattern for matching the host piece of the authority
  110. HOST_PATTERN = '({0}|{1}|{2})'.format(reg_name, ipv4, ip_literal)
  111. SUBAUTHORITY_MATCHER = re.compile((
  112. '^(?:(?P<userinfo>[A-Za-z0-9_.~\-%:]+)@)?' # userinfo
  113. '(?P<host>{0}?)' # host
  114. ':?(?P<port>\d+)?$' # port
  115. ).format(HOST_PATTERN))
  116. IPv4_MATCHER = re.compile('^' + ipv4 + '$')
  117. # ####################
  118. # Path Matcher Section
  119. # ####################
  120. # See http://tools.ietf.org/html/rfc3986#section-3.3 for more information
  121. # about the path patterns defined below.
  122. # Percent encoded character values
  123. pct_encoded = '%[A-Fa-f0-9]{2}'
  124. pchar = ('([' + important_characters['re_unreserved']
  125. + important_characters['re_sub_delimiters']
  126. + ':@]|%s)' % pct_encoded)
  127. segments = {
  128. 'segment': pchar + '*',
  129. # Non-zero length segment
  130. 'segment-nz': pchar + '+',
  131. # Non-zero length segment without ":"
  132. 'segment-nz-nc': pchar.replace(':', '') + '+'
  133. }
  134. # Path types taken from Section 3.3 (linked above)
  135. path_empty = '^$'
  136. path_rootless = '%(segment-nz)s(/%(segment)s)*' % segments
  137. path_noscheme = '%(segment-nz-nc)s(/%(segment)s)*' % segments
  138. path_absolute = '/(%s)?' % path_rootless
  139. path_abempty = '(/%(segment)s)*' % segments
  140. # Matcher used to validate path components
  141. PATH_MATCHER = re.compile('^(%s|%s|%s|%s|%s)$' % (
  142. path_abempty, path_absolute, path_noscheme, path_rootless, path_empty
  143. ))
  144. # ##################################
  145. # Query and Fragment Matcher Section
  146. # ##################################
  147. QUERY_MATCHER = re.compile(
  148. '^([/?:@' + important_characters['re_unreserved']
  149. + important_characters['re_sub_delimiters']
  150. + ']|%s)*$' % pct_encoded)
  151. FRAGMENT_MATCHER = QUERY_MATCHER
  152. # Scheme validation, see: http://tools.ietf.org/html/rfc3986#section-3.1
  153. SCHEME_MATCHER = re.compile('^[A-Za-z][A-Za-z0-9+.\-]*$')
  154. # Relative reference matcher
  155. # See http://tools.ietf.org/html/rfc3986#section-4.2 for details
  156. relative_part = '(//%s%s|%s|%s|%s)' % (
  157. component_pattern_dict['authority'], path_abempty, path_absolute,
  158. path_noscheme, path_empty
  159. )
  160. RELATIVE_REF_MATCHER = re.compile('^%s(\?%s)?(#%s)?$' % (
  161. relative_part, QUERY_MATCHER.pattern, FRAGMENT_MATCHER.pattern
  162. ))
  163. # See http://tools.ietf.org/html/rfc3986#section-3 for definition
  164. hier_part = '(//%s%s|%s|%s|%s)' % (
  165. component_pattern_dict['authority'], path_abempty, path_absolute,
  166. path_rootless, path_empty
  167. )
  168. # See http://tools.ietf.org/html/rfc3986#section-4.3
  169. ABSOLUTE_URI_MATCHER = re.compile('^%s:%s(\?%s)?$' % (
  170. component_pattern_dict['scheme'], hier_part, QUERY_MATCHER.pattern[1:-1]
  171. ))
  172. # Path merger as defined in http://tools.ietf.org/html/rfc3986#section-5.2.3
  173. def merge_paths(base_uri, relative_path):
  174. """Merge a base URI's path with a relative URI's path."""
  175. if base_uri.path is None and base_uri.authority is not None:
  176. return '/' + relative_path
  177. else:
  178. path = base_uri.path or ''
  179. index = path.rfind('/')
  180. return path[:index] + '/' + relative_path