12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- """ This module implements the DecompressionMiddleware which tries to recognise
- and extract the potentially compressed responses that may arrive.
- """
- import bz2
- import gzip
- import zipfile
- import tarfile
- import logging
- from tempfile import mktemp
- import six
- try:
- from cStringIO import StringIO as BytesIO
- except ImportError:
- from io import BytesIO
- from scrapy.responsetypes import responsetypes
- logger = logging.getLogger(__name__)
- class DecompressionMiddleware(object):
- """ This middleware tries to recognise and extract the possibly compressed
- responses that may arrive. """
- def __init__(self):
- self._formats = {
- 'tar': self._is_tar,
- 'zip': self._is_zip,
- 'gz': self._is_gzip,
- 'bz2': self._is_bzip2
- }
- def _is_tar(self, response):
- archive = BytesIO(response.body)
- try:
- tar_file = tarfile.open(name=mktemp(), fileobj=archive)
- except tarfile.ReadError:
- return
- body = tar_file.extractfile(tar_file.members[0]).read()
- respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
- return response.replace(body=body, cls=respcls)
- def _is_zip(self, response):
- archive = BytesIO(response.body)
- try:
- zip_file = zipfile.ZipFile(archive)
- except zipfile.BadZipfile:
- return
- namelist = zip_file.namelist()
- body = zip_file.read(namelist[0])
- respcls = responsetypes.from_args(filename=namelist[0], body=body)
- return response.replace(body=body, cls=respcls)
- def _is_gzip(self, response):
- archive = BytesIO(response.body)
- try:
- body = gzip.GzipFile(fileobj=archive).read()
- except IOError:
- return
- respcls = responsetypes.from_args(body=body)
- return response.replace(body=body, cls=respcls)
- def _is_bzip2(self, response):
- try:
- body = bz2.decompress(response.body)
- except IOError:
- return
- respcls = responsetypes.from_args(body=body)
- return response.replace(body=body, cls=respcls)
- def process_response(self, request, response, spider):
- if not response.body:
- return response
- for fmt, func in six.iteritems(self._formats):
- new_response = func(response)
- if new_response:
- logger.debug('Decompressed response with format: %(responsefmt)s',
- {'responsefmt': fmt}, extra={'spider': spider})
- return new_response
- return response
|