encoding.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. from __future__ import unicode_literals
  2. import codecs
  3. import datetime
  4. from decimal import Decimal
  5. import locale
  6. from django.utils.functional import Promise
  7. from django.utils import six
  8. from django.utils.six.moves.urllib.parse import quote
  9. class DjangoUnicodeDecodeError(UnicodeDecodeError):
  10. def __init__(self, obj, *args):
  11. self.obj = obj
  12. UnicodeDecodeError.__init__(self, *args)
  13. def __str__(self):
  14. original = UnicodeDecodeError.__str__(self)
  15. return '%s. You passed in %r (%s)' % (original, self.obj,
  16. type(self.obj))
  17. def python_2_unicode_compatible(klass):
  18. """
  19. A decorator that defines __unicode__ and __str__ methods under Python 2.
  20. Under Python 3 it does nothing.
  21. To support Python 2 and 3 with a single code base, define a __str__ method
  22. returning text and apply this decorator to the class.
  23. """
  24. if six.PY2:
  25. if '__str__' not in klass.__dict__:
  26. raise ValueError("@python_2_unicode_compatible cannot be applied "
  27. "to %s because it doesn't define __str__()." %
  28. klass.__name__)
  29. klass.__unicode__ = klass.__str__
  30. klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
  31. return klass
  32. def smart_text(s, encoding='utf-8', strings_only=False, errors='strict'):
  33. """
  34. Returns a text object representing 's' -- unicode on Python 2 and str on
  35. Python 3. Treats bytestrings using the 'encoding' codec.
  36. If strings_only is True, don't convert (some) non-string-like objects.
  37. """
  38. if isinstance(s, Promise):
  39. # The input is the result of a gettext_lazy() call.
  40. return s
  41. return force_text(s, encoding, strings_only, errors)
  42. def is_protected_type(obj):
  43. """Determine if the object instance is of a protected type.
  44. Objects of protected types are preserved as-is when passed to
  45. force_text(strings_only=True).
  46. """
  47. return isinstance(obj, six.integer_types + (type(None), float, Decimal,
  48. datetime.datetime, datetime.date, datetime.time))
  49. def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
  50. """
  51. Similar to smart_text, except that lazy instances are resolved to
  52. strings, rather than kept as lazy objects.
  53. If strings_only is True, don't convert (some) non-string-like objects.
  54. """
  55. # Handle the common case first for performance reasons.
  56. if isinstance(s, six.text_type):
  57. return s
  58. if strings_only and is_protected_type(s):
  59. return s
  60. try:
  61. if not isinstance(s, six.string_types):
  62. if six.PY3:
  63. if isinstance(s, bytes):
  64. s = six.text_type(s, encoding, errors)
  65. else:
  66. s = six.text_type(s)
  67. elif hasattr(s, '__unicode__'):
  68. s = six.text_type(s)
  69. else:
  70. s = six.text_type(bytes(s), encoding, errors)
  71. else:
  72. # Note: We use .decode() here, instead of six.text_type(s, encoding,
  73. # errors), so that if s is a SafeBytes, it ends up being a
  74. # SafeText at the end.
  75. s = s.decode(encoding, errors)
  76. except UnicodeDecodeError as e:
  77. if not isinstance(s, Exception):
  78. raise DjangoUnicodeDecodeError(s, *e.args)
  79. else:
  80. # If we get to here, the caller has passed in an Exception
  81. # subclass populated with non-ASCII bytestring data without a
  82. # working unicode method. Try to handle this without raising a
  83. # further exception by individually forcing the exception args
  84. # to unicode.
  85. s = ' '.join([force_text(arg, encoding, strings_only,
  86. errors) for arg in s])
  87. return s
  88. def smart_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
  89. """
  90. Returns a bytestring version of 's', encoded as specified in 'encoding'.
  91. If strings_only is True, don't convert (some) non-string-like objects.
  92. """
  93. if isinstance(s, Promise):
  94. # The input is the result of a gettext_lazy() call.
  95. return s
  96. return force_bytes(s, encoding, strings_only, errors)
  97. def force_bytes(s, encoding='utf-8', strings_only=False, errors='strict'):
  98. """
  99. Similar to smart_bytes, except that lazy instances are resolved to
  100. strings, rather than kept as lazy objects.
  101. If strings_only is True, don't convert (some) non-string-like objects.
  102. """
  103. # Handle the common case first for performance reasons.
  104. if isinstance(s, bytes):
  105. if encoding == 'utf-8':
  106. return s
  107. else:
  108. return s.decode('utf-8', errors).encode(encoding, errors)
  109. if strings_only and is_protected_type(s):
  110. return s
  111. if isinstance(s, six.memoryview):
  112. return bytes(s)
  113. if isinstance(s, Promise):
  114. return six.text_type(s).encode(encoding, errors)
  115. if not isinstance(s, six.string_types):
  116. try:
  117. if six.PY3:
  118. return six.text_type(s).encode(encoding)
  119. else:
  120. return bytes(s)
  121. except UnicodeEncodeError:
  122. if isinstance(s, Exception):
  123. # An Exception subclass containing non-ASCII data that doesn't
  124. # know how to print itself properly. We shouldn't raise a
  125. # further exception.
  126. return b' '.join([force_bytes(arg, encoding, strings_only,
  127. errors) for arg in s])
  128. return six.text_type(s).encode(encoding, errors)
  129. else:
  130. return s.encode(encoding, errors)
  131. if six.PY3:
  132. smart_str = smart_text
  133. force_str = force_text
  134. else:
  135. smart_str = smart_bytes
  136. force_str = force_bytes
  137. # backwards compatibility for Python 2
  138. smart_unicode = smart_text
  139. force_unicode = force_text
  140. smart_str.__doc__ = """
  141. Apply smart_text in Python 3 and smart_bytes in Python 2.
  142. This is suitable for writing to sys.stdout (for instance).
  143. """
  144. force_str.__doc__ = """
  145. Apply force_text in Python 3 and force_bytes in Python 2.
  146. """
  147. def iri_to_uri(iri):
  148. """
  149. Convert an Internationalized Resource Identifier (IRI) portion to a URI
  150. portion that is suitable for inclusion in a URL.
  151. This is the algorithm from section 3.1 of RFC 3987. However, since we are
  152. assuming input is either UTF-8 or unicode already, we can simplify things a
  153. little from the full method.
  154. Returns an ASCII string containing the encoded result.
  155. """
  156. # The list of safe characters here is constructed from the "reserved" and
  157. # "unreserved" characters specified in sections 2.2 and 2.3 of RFC 3986:
  158. # reserved = gen-delims / sub-delims
  159. # gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
  160. # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
  161. # / "*" / "+" / "," / ";" / "="
  162. # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
  163. # Of the unreserved characters, urllib.quote already considers all but
  164. # the ~ safe.
  165. # The % character is also added to the list of safe characters here, as the
  166. # end of section 3.1 of RFC 3987 specifically mentions that % must not be
  167. # converted.
  168. if iri is None:
  169. return iri
  170. return quote(force_bytes(iri), safe=b"/#%[]=:;$&()+,!?*@'~")
  171. def filepath_to_uri(path):
  172. """Convert a file system path to a URI portion that is suitable for
  173. inclusion in a URL.
  174. We are assuming input is either UTF-8 or unicode already.
  175. This method will encode certain chars that would normally be recognized as
  176. special chars for URIs. Note that this method does not encode the '
  177. character, as it is a valid character within URIs. See
  178. encodeURIComponent() JavaScript function for more details.
  179. Returns an ASCII string containing the encoded result.
  180. """
  181. if path is None:
  182. return path
  183. # I know about `os.sep` and `os.altsep` but I want to leave
  184. # some flexibility for hardcoding separators.
  185. return quote(force_bytes(path).replace(b"\\", b"/"), safe=b"/~!*()'")
  186. def get_system_encoding():
  187. """
  188. The encoding of the default system locale but falls back to the given
  189. fallback encoding if the encoding is unsupported by python or could
  190. not be determined. See tickets #10335 and #5846
  191. """
  192. try:
  193. encoding = locale.getdefaultlocale()[1] or 'ascii'
  194. codecs.lookup(encoding)
  195. except Exception:
  196. encoding = 'ascii'
  197. return encoding
  198. DEFAULT_LOCALE_ENCODING = get_system_encoding()