images.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. """
  2. Images Pipeline
  3. See documentation in topics/media-pipeline.rst
  4. """
  5. import functools
  6. import hashlib
  7. import six
  8. try:
  9. from cStringIO import StringIO as BytesIO
  10. except ImportError:
  11. from io import BytesIO
  12. from PIL import Image
  13. from scrapy.utils.misc import md5sum
  14. from scrapy.utils.python import to_bytes
  15. from scrapy.http import Request
  16. from scrapy.settings import Settings
  17. from scrapy.exceptions import DropItem
  18. #TODO: from scrapy.pipelines.media import MediaPipeline
  19. from scrapy.pipelines.files import FileException, FilesPipeline
  class NoimagesDrop(DropItem):
      """Raised to drop a product item that has no images."""
  class ImageException(FileException):
      """Generic error raised while processing an image."""
  class ImagesPipeline(FilesPipeline):
      """Abstract pipeline that implements the image thumbnail generation logic.

      Every downloaded image is re-encoded as JPEG and stored under
      ``full/``; for each entry of ``self.thumbs`` an additional thumbnail is
      generated and stored under ``thumbs/<thumb_id>/``.
      """

      MEDIA_NAME = 'image'

      # Uppercase attributes kept for backward compatibility with code that
      # subclasses ImagesPipeline. They may be overridden by settings.
      MIN_WIDTH = 0
      MIN_HEIGHT = 0
      EXPIRES = 90
      THUMBS = {}
      DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
      DEFAULT_IMAGES_RESULT_FIELD = 'images'

      def __init__(self, store_uri, download_func=None, settings=None):
          """Initialise the pipeline and resolve its options from settings.

          :param store_uri: storage location, forwarded to ``FilesPipeline``.
          :param download_func: optional download callable, forwarded to
              ``FilesPipeline``.
          :param settings: a ``Settings`` object, a plain ``dict`` or
              ``None``; dicts and ``None`` are wrapped in ``Settings``.
          """
          super(ImagesPipeline, self).__init__(store_uri, settings=settings,
                                               download_func=download_func)

          if isinstance(settings, dict) or settings is None:
              settings = Settings(settings)

          # ``_key_for_pipe`` is inherited (not defined in this class); it is
          # used to turn a generic setting name into the key actually looked
          # up for this pipeline class.
          resolve = functools.partial(self._key_for_pipe,
                                      base_class_name="ImagesPipeline",
                                      settings=settings)
          self.expires = settings.getint(
              resolve("IMAGES_EXPIRES"), self.EXPIRES
          )

          # Subclasses may predefine these attributes; otherwise fall back to
          # the class-level defaults.
          if not hasattr(self, "IMAGES_RESULT_FIELD"):
              self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
          if not hasattr(self, "IMAGES_URLS_FIELD"):
              self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD

          self.images_urls_field = settings.get(
              resolve('IMAGES_URLS_FIELD'),
              self.IMAGES_URLS_FIELD
          )
          self.images_result_field = settings.get(
              resolve('IMAGES_RESULT_FIELD'),
              self.IMAGES_RESULT_FIELD
          )
          self.min_width = settings.getint(
              resolve('IMAGES_MIN_WIDTH'), self.MIN_WIDTH
          )
          self.min_height = settings.getint(
              resolve('IMAGES_MIN_HEIGHT'), self.MIN_HEIGHT
          )
          self.thumbs = settings.get(
              resolve('IMAGES_THUMBS'), self.THUMBS
          )

      @classmethod
      def from_settings(cls, settings):
          """Build a pipeline from ``settings``.

          Copies the AWS and GCS related settings onto the store classes
          registered under ``'s3'`` and ``'gs'`` in ``cls.STORE_SCHEMES``
          (these are attributes set on the store classes themselves), then
          instantiates the pipeline with ``settings['IMAGES_STORE']`` as the
          store URI.
          """
          s3store = cls.STORE_SCHEMES['s3']
          s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
          s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']
          s3store.AWS_ENDPOINT_URL = settings['AWS_ENDPOINT_URL']
          s3store.AWS_REGION_NAME = settings['AWS_REGION_NAME']
          s3store.AWS_USE_SSL = settings['AWS_USE_SSL']
          s3store.AWS_VERIFY = settings['AWS_VERIFY']
          s3store.POLICY = settings['IMAGES_STORE_S3_ACL']

          gcs_store = cls.STORE_SCHEMES['gs']
          gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID']
          # A falsy ACL setting (e.g. an empty string) is normalised to None.
          gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None

          store_uri = settings['IMAGES_STORE']
          return cls(store_uri, settings=settings)

      def file_downloaded(self, response, request, info):
          """Delegate to :meth:`image_downloaded`."""
          return self.image_downloaded(response, request, info)

      def image_downloaded(self, response, request, info):
          """Persist the image and its thumbnails; return a checksum.

          Every ``(path, image, buf)`` triple produced by :meth:`get_images`
          is written through ``self.store.persist_file``. The returned
          checksum is computed with ``md5sum`` over the first buffer only
          (the full-size image); it stays ``None`` if nothing was produced.
          """
          checksum = None
          for path, image, buf in self.get_images(response, request, info):
              if checksum is None:
                  # Rewind so the checksum covers the whole buffer.
                  buf.seek(0)
                  checksum = md5sum(buf)
              width, height = image.size
              self.store.persist_file(
                  path, buf, info,
                  meta={'width': width, 'height': height},
                  headers={'Content-Type': 'image/jpeg'})
          return checksum

      def get_images(self, response, request, info):
          """Yield ``(path, image, buf)`` for the image and each thumbnail.

          The first triple is the converted full-size image; one further
          triple follows per entry of ``self.thumbs``.

          :raises ImageException: if the image is narrower than
              ``self.min_width`` or shorter than ``self.min_height``.
          """
          path = self.file_path(request, response=response, info=info)
          orig_image = Image.open(BytesIO(response.body))

          width, height = orig_image.size
          if width < self.min_width or height < self.min_height:
              raise ImageException("Image too small (%dx%d < %dx%d)" %
                                   (width, height, self.min_width, self.min_height))

          image, buf = self.convert_image(orig_image)
          yield path, image, buf

          # Thumbnails are derived from the already converted image.
          for thumb_id, size in six.iteritems(self.thumbs):
              thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
              thumb_image, thumb_buf = self.convert_image(image, size)
              yield thumb_path, thumb_image, thumb_buf

      def convert_image(self, image, size=None):
          """Convert ``image`` to RGB, optionally shrink it, and encode as JPEG.

          :param image: a PIL image.
          :param size: optional size passed to ``Image.thumbnail``; when
              given, a copy of the image is resized instead of the original.
          :returns: ``(image, buf)`` where ``buf`` is a ``BytesIO`` holding
              the JPEG-encoded data.
          """
          if image.format == 'PNG' and image.mode == 'RGBA':
              # Paste onto a white background, using the image itself as mask,
              # before dropping the alpha channel.
              background = Image.new('RGBA', image.size, (255, 255, 255))
              background.paste(image, image)
              image = background.convert('RGB')
          elif image.mode == 'P':
              # Palette images go through RGBA first and get the same
              # white-background treatment.
              image = image.convert("RGBA")
              background = Image.new('RGBA', image.size, (255, 255, 255))
              background.paste(image, image)
              image = background.convert('RGB')
          elif image.mode != 'RGB':
              image = image.convert('RGB')

          if size:
              image = image.copy()
              # ``Image.ANTIALIAS`` was removed in Pillow 10; it was an alias
              # of the LANCZOS filter, which newer Pillow exposes as
              # ``Image.Resampling.LANCZOS``. Fall back to the old name on
              # Pillow versions that predate ``Image.Resampling``.
              try:
                  resampling_filter = Image.Resampling.LANCZOS
              except AttributeError:
                  resampling_filter = Image.ANTIALIAS
              image.thumbnail(size, resampling_filter)

          buf = BytesIO()
          image.save(buf, 'JPEG')
          return image, buf

      def get_media_requests(self, item, info):
          """Return one ``Request`` per URL in the item's image URLs field.

          A missing field yields an empty list.
          """
          return [Request(x) for x in item.get(self.images_urls_field, [])]

      def item_completed(self, results, item, info):
          """Store the successful results on the item and return it.

          ``results`` is iterated as ``(ok, value)`` pairs; only values whose
          ``ok`` flag is truthy are kept. The result field is written only if
          the item is a ``dict`` or declares that field in ``item.fields``.
          """
          if isinstance(item, dict) or self.images_result_field in item.fields:
              item[self.images_result_field] = [x for ok, x in results if ok]
          return item

      def file_path(self, request, response=None, info=None):
          """Return ``full/<sha1 of request.url>.jpg``."""
          image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
          return 'full/%s.jpg' % (image_guid)

      def thumb_path(self, request, thumb_id, response=None, info=None):
          """Return ``thumbs/<thumb_id>/<sha1 of request.url>.jpg``."""
          thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
          return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)