uri_validate.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. """
  2. Regex for URIs
  3. These regex are directly derived from the collected ABNF in RFC3986
  4. (except for DIGIT, ALPHA and HEXDIG, defined by RFC2234).
  5. They should be processed with re.VERBOSE.
  6. Thanks Mark Nottingham for this code - https://gist.github.com/138549
  7. """
  8. from __future__ import unicode_literals
  9. import re
  10. # basics
  11. DIGIT = r"[\x30-\x39]"
  12. ALPHA = r"[\x41-\x5A\x61-\x7A]"
  13. HEXDIG = r"[\x30-\x39A-Fa-f]"
  14. # pct-encoded = "%" HEXDIG HEXDIG
  15. pct_encoded = r" %% %(HEXDIG)s %(HEXDIG)s" % locals()
  16. # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
  17. unreserved = r"(?: %(ALPHA)s | %(DIGIT)s | \- | \. | _ | ~ )" % locals()
  18. # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  19. gen_delims = r"(?: : | / | \? | \# | \[ | \] | @ )"
  20. # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  21. # / "*" / "+" / "," / ";" / "="
  22. sub_delims = r"""(?: ! | \$ | & | ' | \( | \) |
  23. \* | \+ | , | ; | = )"""
  24. # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
  25. pchar = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s | : | @ )" % locals(
  26. )
  27. # reserved = gen-delims / sub-delims
  28. reserved = r"(?: %(gen_delims)s | %(sub_delims)s )" % locals()
  29. # scheme
  30. # scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
  31. scheme = r"%(ALPHA)s (?: %(ALPHA)s | %(DIGIT)s | \+ | \- | \. )*" % locals()
  32. # authority
  33. # dec-octet = DIGIT ; 0-9
  34. # / %x31-39 DIGIT ; 10-99
  35. # / "1" 2DIGIT ; 100-199
  36. # / "2" %x30-34 DIGIT ; 200-249
  37. # / "25" %x30-35 ; 250-255
  38. dec_octet = r"""(?: %(DIGIT)s |
  39. [\x31-\x39] %(DIGIT)s |
  40. 1 %(DIGIT)s{2} |
  41. 2 [\x30-\x34] %(DIGIT)s |
  42. 25 [\x30-\x35]
  43. )
  44. """ % locals()
  45. # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
  46. IPv4address = r"%(dec_octet)s \. %(dec_octet)s \. %(dec_octet)s \. %(dec_octet)s" % locals(
  47. )
  48. # h16 = 1*4HEXDIG
  49. h16 = r"(?: %(HEXDIG)s ){1,4}" % locals()
  50. # ls32 = ( h16 ":" h16 ) / IPv4address
  51. ls32 = r"(?: (?: %(h16)s : %(h16)s ) | %(IPv4address)s )" % locals()
  52. # IPv6address = 6( h16 ":" ) ls32
  53. # / "::" 5( h16 ":" ) ls32
  54. # / [ h16 ] "::" 4( h16 ":" ) ls32
  55. # / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
  56. # / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
  57. # / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
  58. # / [ *4( h16 ":" ) h16 ] "::" ls32
  59. # / [ *5( h16 ":" ) h16 ] "::" h16
  60. # / [ *6( h16 ":" ) h16 ] "::"
  61. IPv6address = r"""(?: (?: %(h16)s : ){6} %(ls32)s |
  62. :: (?: %(h16)s : ){5} %(ls32)s |
  63. %(h16)s :: (?: %(h16)s : ){4} %(ls32)s |
  64. (?: %(h16)s : ) %(h16)s :: (?: %(h16)s : ){3} %(ls32)s |
  65. (?: %(h16)s : ){2} %(h16)s :: (?: %(h16)s : ){2} %(ls32)s |
  66. (?: %(h16)s : ){3} %(h16)s :: %(h16)s : %(ls32)s |
  67. (?: %(h16)s : ){4} %(h16)s :: %(ls32)s |
  68. (?: %(h16)s : ){5} %(h16)s :: %(h16)s |
  69. (?: %(h16)s : ){6} %(h16)s ::
  70. )
  71. """ % locals()
  72. # IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
  73. IPvFuture = r"v %(HEXDIG)s+ \. (?: %(unreserved)s | %(sub_delims)s | : )+" % locals()
  74. # IP-literal = "[" ( IPv6address / IPvFuture ) "]"
  75. IP_literal = r"\[ (?: %(IPv6address)s | %(IPvFuture)s ) \]" % locals()
  76. # reg-name = *( unreserved / pct-encoded / sub-delims )
  77. reg_name = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s )*" % locals()
  78. # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
  79. userinfo = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s | : )" % locals(
  80. )
  81. # host = IP-literal / IPv4address / reg-name
  82. host = r"(?: %(IP_literal)s | %(IPv4address)s | %(reg_name)s )" % locals()
  83. # port = *DIGIT
  84. port = r"(?: %(DIGIT)s )*" % locals()
  85. # authority = [ userinfo "@" ] host [ ":" port ]
  86. authority = r"(?: %(userinfo)s @)? %(host)s (?: : %(port)s)?" % locals()
  87. # Path
  88. # segment = *pchar
  89. segment = r"%(pchar)s*" % locals()
  90. # segment-nz = 1*pchar
  91. segment_nz = r"%(pchar)s+" % locals()
  92. # segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
  93. # ; non-zero-length segment without any colon ":"
  94. segment_nz_nc = r"(?: %(unreserved)s | %(pct_encoded)s | %(sub_delims)s | @ )+" % locals()
  95. # path-abempty = *( "/" segment )
  96. path_abempty = r"(?: / %(segment)s )*" % locals()
  97. # path-absolute = "/" [ segment-nz *( "/" segment ) ]
  98. path_absolute = r"/ (?: %(segment_nz)s (?: / %(segment)s )* )?" % locals()
  99. # path-noscheme = segment-nz-nc *( "/" segment )
  100. path_noscheme = r"%(segment_nz_nc)s (?: / %(segment)s )*" % locals()
  101. # path-rootless = segment-nz *( "/" segment )
  102. path_rootless = r"%(segment_nz)s (?: / %(segment)s )*" % locals()
  103. # path-empty = 0<pchar>
  104. path_empty = r"" # FIXME
  105. # path = path-abempty ; begins with "/" or is empty
  106. # / path-absolute ; begins with "/" but not "//"
  107. # / path-noscheme ; begins with a non-colon segment
  108. # / path-rootless ; begins with a segment
  109. # / path-empty ; zero characters
  110. path = r"""(?: %(path_abempty)s |
  111. %(path_absolute)s |
  112. %(path_noscheme)s |
  113. %(path_rootless)s |
  114. %(path_empty)s
  115. )
  116. """ % locals()
  117. ### Query and Fragment
  118. # query = *( pchar / "/" / "?" )
  119. query = r"(?: %(pchar)s | / | \? )*" % locals()
  120. # fragment = *( pchar / "/" / "?" )
  121. fragment = r"(?: %(pchar)s | / | \? )*" % locals()
  122. # URIs
  123. # hier-part = "//" authority path-abempty
  124. # / path-absolute
  125. # / path-rootless
  126. # / path-empty
  127. hier_part = r"""(?: (?: // %(authority)s %(path_abempty)s ) |
  128. %(path_absolute)s |
  129. %(path_rootless)s |
  130. %(path_empty)s
  131. )
  132. """ % locals()
  133. # relative-part = "//" authority path-abempty
  134. # / path-absolute
  135. # / path-noscheme
  136. # / path-empty
  137. relative_part = r"""(?: (?: // %(authority)s %(path_abempty)s ) |
  138. %(path_absolute)s |
  139. %(path_noscheme)s |
  140. %(path_empty)s
  141. )
  142. """ % locals()
  143. # relative-ref = relative-part [ "?" query ] [ "#" fragment ]
  144. relative_ref = r"%(relative_part)s (?: \? %(query)s)? (?: \# %(fragment)s)?" % locals(
  145. )
  146. # URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  147. URI = r"^(?: %(scheme)s : %(hier_part)s (?: \? %(query)s )? (?: \# %(fragment)s )? )$" % locals(
  148. )
  149. # URI-reference = URI / relative-ref
  150. URI_reference = r"^(?: %(URI)s | %(relative_ref)s )$" % locals()
  151. # absolute-URI = scheme ":" hier-part [ "?" query ]
  152. absolute_URI = r"^(?: %(scheme)s : %(hier_part)s (?: \? %(query)s )? )$" % locals(
  153. )
  154. def is_uri(uri):
  155. return re.match(URI, uri, re.VERBOSE)
  156. def is_uri_reference(uri):
  157. return re.match(URI_reference, uri, re.VERBOSE)
  158. def is_absolute_uri(uri):
  159. return re.match(absolute_URI, uri, re.VERBOSE)