# httpcompression.py (2.5 KB)
  1. import zlib
  2. from scrapy.utils.gz import gunzip
  3. from scrapy.http import Response, TextResponse
  4. from scrapy.responsetypes import responsetypes
  5. from scrapy.exceptions import NotConfigured
  6. ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
  7. try:
  8. import brotli
  9. ACCEPTED_ENCODINGS.append(b'br')
  10. except ImportError:
  11. pass
  12. class HttpCompressionMiddleware(object):
  13. """This middleware allows compressed (gzip, deflate) traffic to be
  14. sent/received from web sites"""
  15. @classmethod
  16. def from_crawler(cls, crawler):
  17. if not crawler.settings.getbool('COMPRESSION_ENABLED'):
  18. raise NotConfigured
  19. return cls()
  20. def process_request(self, request, spider):
  21. request.headers.setdefault('Accept-Encoding',
  22. b",".join(ACCEPTED_ENCODINGS))
  23. def process_response(self, request, response, spider):
  24. if request.method == 'HEAD':
  25. return response
  26. if isinstance(response, Response):
  27. content_encoding = response.headers.getlist('Content-Encoding')
  28. if content_encoding:
  29. encoding = content_encoding.pop()
  30. decoded_body = self._decode(response.body, encoding.lower())
  31. respcls = responsetypes.from_args(headers=response.headers, \
  32. url=response.url, body=decoded_body)
  33. kwargs = dict(cls=respcls, body=decoded_body)
  34. if issubclass(respcls, TextResponse):
  35. # force recalculating the encoding until we make sure the
  36. # responsetypes guessing is reliable
  37. kwargs['encoding'] = None
  38. response = response.replace(**kwargs)
  39. if not content_encoding:
  40. del response.headers['Content-Encoding']
  41. return response
  42. def _decode(self, body, encoding):
  43. if encoding == b'gzip' or encoding == b'x-gzip':
  44. body = gunzip(body)
  45. if encoding == b'deflate':
  46. try:
  47. body = zlib.decompress(body)
  48. except zlib.error:
  49. # ugly hack to work with raw deflate content that may
  50. # be sent by microsoft servers. For more information, see:
  51. # http://carsten.codimi.de/gzip.yaws/
  52. # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
  53. # http://www.gzip.org/zlib/zlib_faq.html#faq38
  54. body = zlib.decompress(body, -15)
  55. if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
  56. body = brotli.decompress(body)
  57. return body