decompression.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. """ This module implements the DecompressionMiddleware which tries to recognise
  2. and extract the potentially compressed responses that may arrive.
  3. """
  4. import bz2
  5. import gzip
  6. import zipfile
  7. import tarfile
  8. import logging
  9. from tempfile import mktemp
  10. import six
  11. try:
  12. from cStringIO import StringIO as BytesIO
  13. except ImportError:
  14. from io import BytesIO
  15. from scrapy.responsetypes import responsetypes
  16. logger = logging.getLogger(__name__)
  17. class DecompressionMiddleware(object):
  18. """ This middleware tries to recognise and extract the possibly compressed
  19. responses that may arrive. """
  20. def __init__(self):
  21. self._formats = {
  22. 'tar': self._is_tar,
  23. 'zip': self._is_zip,
  24. 'gz': self._is_gzip,
  25. 'bz2': self._is_bzip2
  26. }
  27. def _is_tar(self, response):
  28. archive = BytesIO(response.body)
  29. try:
  30. tar_file = tarfile.open(name=mktemp(), fileobj=archive)
  31. except tarfile.ReadError:
  32. return
  33. body = tar_file.extractfile(tar_file.members[0]).read()
  34. respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
  35. return response.replace(body=body, cls=respcls)
  36. def _is_zip(self, response):
  37. archive = BytesIO(response.body)
  38. try:
  39. zip_file = zipfile.ZipFile(archive)
  40. except zipfile.BadZipfile:
  41. return
  42. namelist = zip_file.namelist()
  43. body = zip_file.read(namelist[0])
  44. respcls = responsetypes.from_args(filename=namelist[0], body=body)
  45. return response.replace(body=body, cls=respcls)
  46. def _is_gzip(self, response):
  47. archive = BytesIO(response.body)
  48. try:
  49. body = gzip.GzipFile(fileobj=archive).read()
  50. except IOError:
  51. return
  52. respcls = responsetypes.from_args(body=body)
  53. return response.replace(body=body, cls=respcls)
  54. def _is_bzip2(self, response):
  55. try:
  56. body = bz2.decompress(response.body)
  57. except IOError:
  58. return
  59. respcls = responsetypes.from_args(body=body)
  60. return response.replace(body=body, cls=respcls)
  61. def process_response(self, request, response, spider):
  62. if not response.body:
  63. return response
  64. for fmt, func in six.iteritems(self._formats):
  65. new_response = func(response)
  66. if new_response:
  67. logger.debug('Decompressed response with format: %(responsefmt)s',
  68. {'responsefmt': fmt}, extra={'spider': spider})
  69. return new_response
  70. return response