decoder.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. # -*- coding: utf-8 -*-
  2. """
  3. requests_toolbelt.multipart.decoder
  4. ===================================
  5. This holds all the implementation details of the MultipartDecoder
  6. """
  7. import sys
  8. import email.parser
  9. from .encoder import encode_with
  10. from requests.structures import CaseInsensitiveDict
  11. def _split_on_find(content, bound):
  12. point = content.find(bound)
  13. return content[:point], content[point + len(bound):]
  14. class ImproperBodyPartContentException(Exception):
  15. pass
  16. class NonMultipartContentTypeException(Exception):
  17. pass
  18. def _header_parser(string, encoding):
  19. major = sys.version_info[0]
  20. if major == 3:
  21. string = string.decode(encoding)
  22. headers = email.parser.HeaderParser().parsestr(string).items()
  23. return (
  24. (encode_with(k, encoding), encode_with(v, encoding))
  25. for k, v in headers
  26. )
  27. class BodyPart(object):
  28. """
  29. The ``BodyPart`` object is a ``Response``-like interface to an individual
  30. subpart of a multipart response. It is expected that these will
  31. generally be created by objects of the ``MultipartDecoder`` class.
  32. Like ``Response``, there is a ``CaseInsensitiveDict`` object named headers,
  33. ``content`` to access bytes, ``text`` to access unicode, and ``encoding``
  34. to access the unicode codec.
  35. """
  36. def __init__(self, content, encoding):
  37. self.encoding = encoding
  38. headers = {}
  39. # Split into header section (if any) and the content
  40. if b'\r\n\r\n' in content:
  41. first, self.content = _split_on_find(content, b'\r\n\r\n')
  42. if first != b'':
  43. headers = _header_parser(first.lstrip(), encoding)
  44. else:
  45. raise ImproperBodyPartContentException(
  46. 'content does not contain CR-LF-CR-LF'
  47. )
  48. self.headers = CaseInsensitiveDict(headers)
  49. @property
  50. def text(self):
  51. """Content of the ``BodyPart`` in unicode."""
  52. return self.content.decode(self.encoding)
  53. class MultipartDecoder(object):
  54. """
  55. The ``MultipartDecoder`` object parses the multipart payload of
  56. a bytestring into a tuple of ``Response``-like ``BodyPart`` objects.
  57. The basic usage is::
  58. import requests
  59. from requests_toolbelt import MultipartDecoder
  60. response = request.get(url)
  61. decoder = MultipartDecoder.from_response(response)
  62. for part in decoder.parts:
  63. print(part.headers['content-type'])
  64. If the multipart content is not from a response, basic usage is::
  65. from requests_toolbelt import MultipartDecoder
  66. decoder = MultipartDecoder(content, content_type)
  67. for part in decoder.parts:
  68. print(part.headers['content-type'])
  69. For both these usages, there is an optional ``encoding`` parameter. This is
  70. a string, which is the name of the unicode codec to use (default is
  71. ``'utf-8'``).
  72. """
  73. def __init__(self, content, content_type, encoding='utf-8'):
  74. #: Original Content-Type header
  75. self.content_type = content_type
  76. #: Response body encoding
  77. self.encoding = encoding
  78. #: Parsed parts of the multipart response body
  79. self.parts = tuple()
  80. self._find_boundary()
  81. self._parse_body(content)
  82. def _find_boundary(self):
  83. ct_info = tuple(x.strip() for x in self.content_type.split(';'))
  84. mimetype = ct_info[0]
  85. if mimetype.split('/')[0].lower() != 'multipart':
  86. raise NonMultipartContentTypeException(
  87. "Unexpected mimetype in content-type: '{0}'".format(mimetype)
  88. )
  89. for item in ct_info[1:]:
  90. attr, value = _split_on_find(
  91. item,
  92. '='
  93. )
  94. if attr.lower() == 'boundary':
  95. self.boundary = encode_with(value.strip('"'), self.encoding)
  96. @staticmethod
  97. def _fix_first_part(part, boundary_marker):
  98. bm_len = len(boundary_marker)
  99. if boundary_marker == part[:bm_len]:
  100. return part[bm_len:]
  101. else:
  102. return part
  103. def _parse_body(self, content):
  104. boundary = b''.join((b'--', self.boundary))
  105. def body_part(part):
  106. fixed = MultipartDecoder._fix_first_part(part, boundary)
  107. return BodyPart(fixed, self.encoding)
  108. def test_part(part):
  109. return (part != b'' and
  110. part != b'\r\n' and
  111. part[:4] != b'--\r\n' and
  112. part != b'--')
  113. parts = content.split(b''.join((b'\r\n', boundary)))
  114. self.parts = tuple(body_part(x) for x in parts if test_part(x))
  115. @classmethod
  116. def from_response(cls, response, encoding='utf-8'):
  117. content = response.content
  118. content_type = response.headers.get('content-type', None)
  119. return cls(content, content_type, encoding)