text.py 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. # coding=utf-8
  2. import datetime
  3. from decimal import Decimal
  4. import re
  5. import six
  6. import unicodedata
  7. _re_pattern = re.compile(r'[^\w\s-]', flags=re.U)
  8. _re_pattern_allow_dots = re.compile(r'[^\.\w\s-]', flags=re.U)
  9. _re_spaces = re.compile(r'[-\s]+', flags=re.U)
  10. _PROTECTED_TYPES = six.integer_types + (
  11. type(None), float, Decimal,
  12. datetime.datetime, datetime.date, datetime.time,
  13. )
  14. def is_protected_type(obj):
  15. """Determine if the object instance is of a protected type.
  16. Objects of protected types are preserved as-is when passed to
  17. force_text(strings_only=True).
  18. """
  19. return isinstance(obj, _PROTECTED_TYPES)
  20. def force_text(s, encoding='utf-8', strings_only=False, errors='strict'):
  21. """
  22. Similar to smart_text, except that lazy instances are resolved to
  23. strings, rather than kept as lazy objects.
  24. If strings_only is True, don't convert (some) non-string-like objects.
  25. """
  26. # Handle the common case first for performance reasons.
  27. if issubclass(type(s), six.text_type):
  28. return s
  29. if strings_only and is_protected_type(s):
  30. return s
  31. try:
  32. if not issubclass(type(s), six.string_types):
  33. if six.PY3:
  34. if isinstance(s, bytes):
  35. s = six.text_type(s, encoding, errors)
  36. else:
  37. s = six.text_type(s)
  38. elif hasattr(s, '__unicode__'):
  39. s = six.text_type(s)
  40. else:
  41. s = six.text_type(bytes(s), encoding, errors)
  42. else:
  43. # Note: We use .decode() here, instead of six.text_type(s, encoding,
  44. # errors), so that if s is a SafeBytes, it ends up being a
  45. # SafeText at the end.
  46. s = s.decode(encoding, errors)
  47. except UnicodeDecodeError as e:
  48. if not isinstance(s, Exception):
  49. raise ValueError(s, *e.args)
  50. else:
  51. # If we get to here, the caller has passed in an Exception
  52. # subclass populated with non-ASCII bytestring data without a
  53. # working unicode method. Try to handle this without raising a
  54. # further exception by individually forcing the exception args
  55. # to unicode.
  56. s = ' '.join(force_text(arg, encoding, strings_only, errors)
  57. for arg in s)
  58. return s
  59. def slugify(value, allow_dots=False, allow_unicode=False):
  60. """
  61. Converts to lowercase, removes non-word characters (alphanumerics and
  62. underscores) and converts spaces to hyphens. Also strips leading and
  63. trailing whitespace. Modified to optionally allow dots.
  64. Adapted from Django 1.9
  65. """
  66. if allow_dots:
  67. pattern = _re_pattern_allow_dots
  68. else:
  69. pattern = _re_pattern
  70. value = force_text(value)
  71. if allow_unicode:
  72. value = unicodedata.normalize('NFKC', value)
  73. value = pattern.sub('', value).strip().lower()
  74. return _re_spaces.sub('-', value)
  75. value = unicodedata.normalize('NFKD', value).encode(
  76. 'ascii', 'ignore').decode('ascii')
  77. value = pattern.sub('', value).strip().lower()
  78. return _re_spaces.sub('-', value)