escape.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. #
  2. # Copyright 2009 Facebook
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License"); you may
  5. # not use this file except in compliance with the License. You may obtain
  6. # a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  12. # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  13. # License for the specific language governing permissions and limitations
  14. # under the License.
  15. """Escaping/unescaping methods for HTML, JSON, URLs, and others.
  16. Also includes a few other miscellaneous string manipulation functions that
  17. have crept in over time.
  18. """
  19. from __future__ import absolute_import, division, print_function
  20. import json
  21. import re
  22. from tornado.util import PY3, unicode_type, basestring_type
  23. if PY3:
  24. from urllib.parse import parse_qs as _parse_qs
  25. import html.entities as htmlentitydefs
  26. import urllib.parse as urllib_parse
  27. unichr = chr
  28. else:
  29. from urlparse import parse_qs as _parse_qs
  30. import htmlentitydefs
  31. import urllib as urllib_parse
  32. try:
  33. import typing # noqa
  34. except ImportError:
  35. pass
  36. _XHTML_ESCAPE_RE = re.compile('[&<>"\']')
  37. _XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;',
  38. '\'': '&#39;'}
  39. def xhtml_escape(value):
  40. """Escapes a string so it is valid within HTML or XML.
  41. Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``.
  42. When used in attribute values the escaped strings must be enclosed
  43. in quotes.
  44. .. versionchanged:: 3.2
  45. Added the single quote to the list of escaped characters.
  46. """
  47. return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)],
  48. to_basestring(value))
  49. def xhtml_unescape(value):
  50. """Un-escapes an XML-escaped string."""
  51. return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value))
  52. # The fact that json_encode wraps json.dumps is an implementation detail.
  53. # Please see https://github.com/tornadoweb/tornado/pull/706
  54. # before sending a pull request that adds **kwargs to this function.
  55. def json_encode(value):
  56. """JSON-encodes the given Python object."""
  57. # JSON permits but does not require forward slashes to be escaped.
  58. # This is useful when json data is emitted in a <script> tag
  59. # in HTML, as it prevents </script> tags from prematurely terminating
  60. # the javascript. Some json libraries do this escaping by default,
  61. # although python's standard library does not, so we do it here.
  62. # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped
  63. return json.dumps(value).replace("</", "<\\/")
  64. def json_decode(value):
  65. """Returns Python objects for the given JSON string."""
  66. return json.loads(to_basestring(value))
  67. def squeeze(value):
  68. """Replace all sequences of whitespace chars with a single space."""
  69. return re.sub(r"[\x00-\x20]+", " ", value).strip()
  70. def url_escape(value, plus=True):
  71. """Returns a URL-encoded version of the given value.
  72. If ``plus`` is true (the default), spaces will be represented
  73. as "+" instead of "%20". This is appropriate for query strings
  74. but not for the path component of a URL. Note that this default
  75. is the reverse of Python's urllib module.
  76. .. versionadded:: 3.1
  77. The ``plus`` argument
  78. """
  79. quote = urllib_parse.quote_plus if plus else urllib_parse.quote
  80. return quote(utf8(value))
  81. # python 3 changed things around enough that we need two separate
  82. # implementations of url_unescape. We also need our own implementation
  83. # of parse_qs since python 3's version insists on decoding everything.
  84. if not PY3:
  85. def url_unescape(value, encoding='utf-8', plus=True):
  86. """Decodes the given value from a URL.
  87. The argument may be either a byte or unicode string.
  88. If encoding is None, the result will be a byte string. Otherwise,
  89. the result is a unicode string in the specified encoding.
  90. If ``plus`` is true (the default), plus signs will be interpreted
  91. as spaces (literal plus signs must be represented as "%2B"). This
  92. is appropriate for query strings and form-encoded values but not
  93. for the path component of a URL. Note that this default is the
  94. reverse of Python's urllib module.
  95. .. versionadded:: 3.1
  96. The ``plus`` argument
  97. """
  98. unquote = (urllib_parse.unquote_plus if plus else urllib_parse.unquote)
  99. if encoding is None:
  100. return unquote(utf8(value))
  101. else:
  102. return unicode_type(unquote(utf8(value)), encoding)
  103. parse_qs_bytes = _parse_qs
  104. else:
  105. def url_unescape(value, encoding='utf-8', plus=True):
  106. """Decodes the given value from a URL.
  107. The argument may be either a byte or unicode string.
  108. If encoding is None, the result will be a byte string. Otherwise,
  109. the result is a unicode string in the specified encoding.
  110. If ``plus`` is true (the default), plus signs will be interpreted
  111. as spaces (literal plus signs must be represented as "%2B"). This
  112. is appropriate for query strings and form-encoded values but not
  113. for the path component of a URL. Note that this default is the
  114. reverse of Python's urllib module.
  115. .. versionadded:: 3.1
  116. The ``plus`` argument
  117. """
  118. if encoding is None:
  119. if plus:
  120. # unquote_to_bytes doesn't have a _plus variant
  121. value = to_basestring(value).replace('+', ' ')
  122. return urllib_parse.unquote_to_bytes(value)
  123. else:
  124. unquote = (urllib_parse.unquote_plus if plus
  125. else urllib_parse.unquote)
  126. return unquote(to_basestring(value), encoding=encoding)
  127. def parse_qs_bytes(qs, keep_blank_values=False, strict_parsing=False):
  128. """Parses a query string like urlparse.parse_qs, but returns the
  129. values as byte strings.
  130. Keys still become type str (interpreted as latin1 in python3!)
  131. because it's too painful to keep them as byte strings in
  132. python3 and in practice they're nearly always ascii anyway.
  133. """
  134. # This is gross, but python3 doesn't give us another way.
  135. # Latin1 is the universal donor of character encodings.
  136. result = _parse_qs(qs, keep_blank_values, strict_parsing,
  137. encoding='latin1', errors='strict')
  138. encoded = {}
  139. for k, v in result.items():
  140. encoded[k] = [i.encode('latin1') for i in v]
  141. return encoded
  142. _UTF8_TYPES = (bytes, type(None))
  143. def utf8(value):
  144. # type: (typing.Union[bytes,unicode_type,None])->typing.Union[bytes,None]
  145. """Converts a string argument to a byte string.
  146. If the argument is already a byte string or None, it is returned unchanged.
  147. Otherwise it must be a unicode string and is encoded as utf8.
  148. """
  149. if isinstance(value, _UTF8_TYPES):
  150. return value
  151. if not isinstance(value, unicode_type):
  152. raise TypeError(
  153. "Expected bytes, unicode, or None; got %r" % type(value)
  154. )
  155. return value.encode("utf-8")
  156. _TO_UNICODE_TYPES = (unicode_type, type(None))
  157. def to_unicode(value):
  158. """Converts a string argument to a unicode string.
  159. If the argument is already a unicode string or None, it is returned
  160. unchanged. Otherwise it must be a byte string and is decoded as utf8.
  161. """
  162. if isinstance(value, _TO_UNICODE_TYPES):
  163. return value
  164. if not isinstance(value, bytes):
  165. raise TypeError(
  166. "Expected bytes, unicode, or None; got %r" % type(value)
  167. )
  168. return value.decode("utf-8")
  169. # to_unicode was previously named _unicode not because it was private,
  170. # but to avoid conflicts with the built-in unicode() function/type
  171. _unicode = to_unicode
  172. # When dealing with the standard library across python 2 and 3 it is
  173. # sometimes useful to have a direct conversion to the native string type
  174. if str is unicode_type:
  175. native_str = to_unicode
  176. else:
  177. native_str = utf8
  178. _BASESTRING_TYPES = (basestring_type, type(None))
  179. def to_basestring(value):
  180. """Converts a string argument to a subclass of basestring.
  181. In python2, byte and unicode strings are mostly interchangeable,
  182. so functions that deal with a user-supplied argument in combination
  183. with ascii string constants can use either and should return the type
  184. the user supplied. In python3, the two types are not interchangeable,
  185. so this method is needed to convert byte strings to unicode.
  186. """
  187. if isinstance(value, _BASESTRING_TYPES):
  188. return value
  189. if not isinstance(value, bytes):
  190. raise TypeError(
  191. "Expected bytes, unicode, or None; got %r" % type(value)
  192. )
  193. return value.decode("utf-8")
  194. def recursive_unicode(obj):
  195. """Walks a simple data structure, converting byte strings to unicode.
  196. Supports lists, tuples, and dictionaries.
  197. """
  198. if isinstance(obj, dict):
  199. return dict((recursive_unicode(k), recursive_unicode(v)) for (k, v) in obj.items())
  200. elif isinstance(obj, list):
  201. return list(recursive_unicode(i) for i in obj)
  202. elif isinstance(obj, tuple):
  203. return tuple(recursive_unicode(i) for i in obj)
  204. elif isinstance(obj, bytes):
  205. return to_unicode(obj)
  206. else:
  207. return obj
  208. # I originally used the regex from
  209. # http://daringfireball.net/2010/07/improved_regex_for_matching_urls
  210. # but it gets all exponential on certain patterns (such as too many trailing
  211. # dots), causing the regex matcher to never return.
  212. # This regex should avoid those problems.
  213. # Use to_unicode instead of tornado.util.u - we don't want backslashes getting
  214. # processed as escapes.
  215. _URL_RE = re.compile(to_unicode(
  216. r"""\b((?:([\w-]+):(/{1,3})|www[.])(?:(?:(?:[^\s&()]|&amp;|&quot;)*(?:[^!"#$%&'()*+,.:;<=>?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&amp;|&quot;)*\)))+)""" # noqa: E501
  217. ))
  218. def linkify(text, shorten=False, extra_params="",
  219. require_protocol=False, permitted_protocols=["http", "https"]):
  220. """Converts plain text into HTML with links.
  221. For example: ``linkify("Hello http://tornadoweb.org!")`` would return
  222. ``Hello <a href="http://tornadoweb.org">http://tornadoweb.org</a>!``
  223. Parameters:
  224. * ``shorten``: Long urls will be shortened for display.
  225. * ``extra_params``: Extra text to include in the link tag, or a callable
  226. taking the link as an argument and returning the extra text
  227. e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``,
  228. or::
  229. def extra_params_cb(url):
  230. if url.startswith("http://example.com"):
  231. return 'class="internal"'
  232. else:
  233. return 'class="external" rel="nofollow"'
  234. linkify(text, extra_params=extra_params_cb)
  235. * ``require_protocol``: Only linkify urls which include a protocol. If
  236. this is False, urls such as www.facebook.com will also be linkified.
  237. * ``permitted_protocols``: List (or set) of protocols which should be
  238. linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp",
  239. "mailto"])``. It is very unsafe to include protocols such as
  240. ``javascript``.
  241. """
  242. if extra_params and not callable(extra_params):
  243. extra_params = " " + extra_params.strip()
  244. def make_link(m):
  245. url = m.group(1)
  246. proto = m.group(2)
  247. if require_protocol and not proto:
  248. return url # not protocol, no linkify
  249. if proto and proto not in permitted_protocols:
  250. return url # bad protocol, no linkify
  251. href = m.group(1)
  252. if not proto:
  253. href = "http://" + href # no proto specified, use http
  254. if callable(extra_params):
  255. params = " " + extra_params(href).strip()
  256. else:
  257. params = extra_params
  258. # clip long urls. max_len is just an approximation
  259. max_len = 30
  260. if shorten and len(url) > max_len:
  261. before_clip = url
  262. if proto:
  263. proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for :
  264. else:
  265. proto_len = 0
  266. parts = url[proto_len:].split("/")
  267. if len(parts) > 1:
  268. # Grab the whole host part plus the first bit of the path
  269. # The path is usually not that interesting once shortened
  270. # (no more slug, etc), so it really just provides a little
  271. # extra indication of shortening.
  272. url = url[:proto_len] + parts[0] + "/" + \
  273. parts[1][:8].split('?')[0].split('.')[0]
  274. if len(url) > max_len * 1.5: # still too long
  275. url = url[:max_len]
  276. if url != before_clip:
  277. amp = url.rfind('&')
  278. # avoid splitting html char entities
  279. if amp > max_len - 5:
  280. url = url[:amp]
  281. url += "..."
  282. if len(url) >= len(before_clip):
  283. url = before_clip
  284. else:
  285. # full url is visible on mouse-over (for those who don't
  286. # have a status bar, such as Safari by default)
  287. params += ' title="%s"' % href
  288. return u'<a href="%s"%s>%s</a>' % (href, params, url)
  289. # First HTML-escape so that our strings are all safe.
  290. # The regex is modified to avoid character entites other than &amp; so
  291. # that we won't pick up &quot;, etc.
  292. text = _unicode(xhtml_escape(text))
  293. return _URL_RE.sub(make_link, text)
  294. def _convert_entity(m):
  295. if m.group(1) == "#":
  296. try:
  297. if m.group(2)[:1].lower() == 'x':
  298. return unichr(int(m.group(2)[1:], 16))
  299. else:
  300. return unichr(int(m.group(2)))
  301. except ValueError:
  302. return "&#%s;" % m.group(2)
  303. try:
  304. return _HTML_UNICODE_MAP[m.group(2)]
  305. except KeyError:
  306. return "&%s;" % m.group(2)
  307. def _build_unicode_map():
  308. unicode_map = {}
  309. for name, value in htmlentitydefs.name2codepoint.items():
  310. unicode_map[name] = unichr(value)
  311. return unicode_map
  312. _HTML_UNICODE_MAP = _build_unicode_map()