# -*- coding: utf-8 -*-
"""
Sync Media to S3
================

Django command that scans all files in your settings.MEDIA_ROOT and
settings.STATIC_ROOT folders and uploads them to S3 with the same directory
structure.

This command can optionally do the following (all off by default):
* gzip compress any CSS and JavaScript files it finds and add the appropriate
  'Content-Encoding' header.
* set a far future 'Expires' header for optimal caching.
* upload only media or static files.
* use any other provider compatible with Amazon S3.
* set an ACL other than 'public-read'.

Note: This script requires the Python boto library and valid Amazon Web
Services API keys.

Required settings.py variables:
AWS_ACCESS_KEY_ID = ''
AWS_SECRET_ACCESS_KEY = ''
AWS_BUCKET_NAME = ''

When you call this command with the `--renamegzip` param, it will add the
'.gz' extension to the file name. But Safari just doesn't recognize '.gz'
files and your site won't work on it! To fix this problem, you can set any
other extension (like .jgz) in the `SYNC_S3_RENAME_GZIP_EXT` variable.

Command options are:
  -p PREFIX, --prefix=PREFIX
                        The prefix to prepend to the path on S3.
  --gzip                Enables gzipping CSS and JavaScript files.
  --expires             Enables setting a far future expires header.
  --force               Skip the file mtime check to force upload of all
                        files.
  --filter-list         Override default directory and file exclusion
                        filters. (enter as comma separated line)
  --renamegzip          Enables renaming of gzipped files by appending '.gz'
                        to the original file name. This way your original
                        assets will not be replaced by the gzipped ones.
                        You can change the extension by setting the
                        `SYNC_S3_RENAME_GZIP_EXT` var in your settings.py
                        file.
  --invalidate          Invalidates the objects in CloudFront after uploading
                        stuff to S3.
  --media-only          Only MEDIA_ROOT files will be uploaded to S3.
  --static-only         Only STATIC_ROOT files will be uploaded to S3.
  --s3host              Override default S3 host.
  --acl                 Override default ACL settings ('public-read' if
                        settings.AWS_DEFAULT_ACL is not defined).

TODO:
 * Use fnmatch (or regex) to allow more complex FILTER_LIST rules.
"""
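# Example configuration and invocation (illustrative only; the setting values
# are placeholders, and the flags shown are the ones documented above):
#
#   # settings.py
#   AWS_ACCESS_KEY_ID = 'your-access-key-id'
#   AWS_SECRET_ACCESS_KEY = 'your-secret-access-key'
#   AWS_BUCKET_NAME = 'my-bucket'
#
#   $ python manage.py sync_s3 --gzip --expires --prefix=assets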

import datetime
import email.utils
import gzip
import mimetypes
import os
import time
from io import BytesIO
from typing import List  # NOQA

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

from django_extensions.management.utils import signalcommand

try:
    import boto
except ImportError:
    HAS_BOTO = False
else:
    HAS_BOTO = True


class Command(BaseCommand):
    # Extra variables to avoid passing these around
    AWS_ACCESS_KEY_ID = ''
    AWS_SECRET_ACCESS_KEY = ''
    AWS_BUCKET_NAME = ''
    AWS_CLOUDFRONT_DISTRIBUTION = ''
    SYNC_S3_RENAME_GZIP_EXT = ''

    DIRECTORIES = ''
    FILTER_LIST = ['.DS_Store', '.svn', '.hg', '.git', 'Thumbs.db']
    GZIP_CONTENT_TYPES = (
        'text/css',
        'application/javascript',
        'application/x-javascript',
        'text/javascript'
    )

    uploaded_files = []  # type: List[str]
    upload_count = 0
    skip_count = 0

    help = 'Syncs the complete MEDIA_ROOT structure and files to S3 into the given bucket name.'
    args = 'bucket_name'

    can_import_settings = True

    def add_arguments(self, parser):
        super().add_arguments(parser)
        parser.add_argument(
            '-p', '--prefix',
            dest='prefix',
            default=getattr(settings, 'SYNC_S3_PREFIX', ''),
            help="The prefix to prepend to the path on S3."
        )
        parser.add_argument(
            '-d', '--dir',
            dest='dir',
            help="Custom static root directory to use"
        )
        parser.add_argument(
            '--s3host',
            dest='s3host',
            default=getattr(settings, 'AWS_S3_HOST', ''),
            help="The S3 host (enables connecting to other providers/regions)"
        )
        parser.add_argument(
            '--acl',
            dest='acl',
            default=getattr(settings, 'AWS_DEFAULT_ACL', 'public-read'),
            help="Overrides the default ACL ('public-read')."
        )
        parser.add_argument(
            '--gzip',
            action='store_true', dest='gzip', default=False,
            help="Enables gzipping CSS and JavaScript files."
        )
        parser.add_argument(
            '--renamegzip',
            action='store_true', dest='renamegzip', default=False,
            help="Enables renaming of gzipped assets to have '.gz' appended to the filename."
        )
        parser.add_argument(
            '--expires',
            action='store_true', dest='expires', default=False,
            help="Enables setting a far future expires header."
        )
        parser.add_argument(
            '--force',
            action='store_true', dest='force', default=False,
            help="Skip the file mtime check to force upload of all files."
        )
        parser.add_argument(
            '--filter-list', dest='filter_list',
            action='store', default='',
            help="Override default directory and file exclusion filters. (enter as comma separated line)"
        )
        parser.add_argument(
            '--invalidate', dest='invalidate', default=False,
            action='store_true',
            help='Invalidates the associated objects in CloudFront'
        )
        parser.add_argument(
            '--media-only', dest='media_only', default='',
            action='store_true',
            help="Only MEDIA_ROOT files will be uploaded to S3"
        )
        parser.add_argument(
            '--static-only', dest='static_only', default='',
            action='store_true',
            help="Only STATIC_ROOT files will be uploaded to S3"
        )

    @signalcommand
    def handle(self, *args, **options):
        if not HAS_BOTO:
            raise CommandError("Please install the 'boto' Python library. ($ pip install boto)")

        # Check for AWS keys in settings
        if not hasattr(settings, 'AWS_ACCESS_KEY_ID') or not hasattr(settings, 'AWS_SECRET_ACCESS_KEY'):
            raise CommandError('Missing AWS keys from settings file. Please supply both AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.')
        else:
            self.AWS_ACCESS_KEY_ID = settings.AWS_ACCESS_KEY_ID
            self.AWS_SECRET_ACCESS_KEY = settings.AWS_SECRET_ACCESS_KEY

        if not hasattr(settings, 'AWS_BUCKET_NAME'):
            raise CommandError('Missing bucket name from settings file. Please add the AWS_BUCKET_NAME to your settings file.')
        else:
            if not settings.AWS_BUCKET_NAME:
                raise CommandError('AWS_BUCKET_NAME cannot be empty.')
            self.AWS_BUCKET_NAME = settings.AWS_BUCKET_NAME

        if not hasattr(settings, 'MEDIA_ROOT'):
            raise CommandError('MEDIA_ROOT must be set in your settings.')
        else:
            if not settings.MEDIA_ROOT:
                raise CommandError('MEDIA_ROOT must be set in your settings.')

        self.AWS_CLOUDFRONT_DISTRIBUTION = getattr(settings, 'AWS_CLOUDFRONT_DISTRIBUTION', '')
        self.SYNC_S3_RENAME_GZIP_EXT = getattr(settings, 'SYNC_S3_RENAME_GZIP_EXT', '.gz')

        self.verbosity = options['verbosity']
        self.prefix = options['prefix']
        self.do_gzip = options['gzip']
        self.rename_gzip = options['renamegzip']
        self.do_expires = options['expires']
        self.do_force = options['force']
        self.invalidate = options['invalidate']
        self.DIRECTORIES = options['dir']
        self.s3host = options['s3host']
        self.default_acl = options['acl']
        self.FILTER_LIST = getattr(settings, 'FILTER_LIST', self.FILTER_LIST)
        filter_list = options['filter_list']
        if filter_list:
            # the command line option overrides the default FILTER_LIST and
            # settings.FILTER_LIST
            self.FILTER_LIST = filter_list.split(',')

        self.media_only = options['media_only']
        self.static_only = options['static_only']

        # Get directories
        if self.media_only and self.static_only:
            raise CommandError("Can't use --media-only and --static-only together. Better not use anything...")
        elif self.media_only:
            self.DIRECTORIES = [settings.MEDIA_ROOT]
        elif self.static_only:
            self.DIRECTORIES = [settings.STATIC_ROOT]
        elif self.DIRECTORIES:
            self.DIRECTORIES = [self.DIRECTORIES]
        else:
            self.DIRECTORIES = [settings.MEDIA_ROOT, settings.STATIC_ROOT]

        # Now call the syncing method to walk the selected directories and
        # upload all files found.
        self.sync_s3()

        # Send the invalidation request to CloudFront if the user requested it
        if self.invalidate:
            self.invalidate_objects_cf()

        print("")
        print("%d files uploaded." % self.upload_count)
        print("%d files skipped." % self.skip_count)

    def open_cf(self):
        """Return an open connection to CloudFront"""
        return boto.connect_cloudfront(
            self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY)

    def invalidate_objects_cf(self):
        """Split the invalidation request in groups of 1000 objects"""
        if not self.AWS_CLOUDFRONT_DISTRIBUTION:
            raise CommandError(
                'An object invalidation was requested but the variable '
                'AWS_CLOUDFRONT_DISTRIBUTION is not present in your settings.')

        # We can't send more than 1000 objects in the same invalidation
        # request.
        chunk = 1000

        # Connecting to CloudFront
        conn = self.open_cf()

        # Splitting the object list
        objs = self.uploaded_files
        chunks = [objs[i:i + chunk] for i in range(0, len(objs), chunk)]

        # Invalidation requests
        for paths in chunks:
            conn.create_invalidation_request(
                self.AWS_CLOUDFRONT_DISTRIBUTION, paths)

    def sync_s3(self):
        """Walk the media/static directories and sync files to S3"""
        bucket, key = self.open_s3()
        for directory in self.DIRECTORIES:
            for root, dirs, files in os.walk(directory):
                self.upload_s3((bucket, key, self.AWS_BUCKET_NAME, directory), root, files, dirs)

    def compress_string(self, s):
        """Gzip a given string."""
        # GzipFile writes bytes, so use an in-memory binary buffer
        zbuf = BytesIO()
        zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
        zfile.write(s)
        zfile.close()
        return zbuf.getvalue()

    def get_s3connection_kwargs(self):
        """Return connection kwargs as a dict"""
        kwargs = {}
        if self.s3host:
            kwargs['host'] = self.s3host
        return kwargs

    def open_s3(self):
        """Open connection to S3 returning bucket and key"""
        conn = boto.connect_s3(
            self.AWS_ACCESS_KEY_ID,
            self.AWS_SECRET_ACCESS_KEY,
            **self.get_s3connection_kwargs())
        try:
            bucket = conn.get_bucket(self.AWS_BUCKET_NAME)
        except boto.exception.S3ResponseError:
            bucket = conn.create_bucket(self.AWS_BUCKET_NAME)
        return bucket, boto.s3.key.Key(bucket)

    def upload_s3(self, arg, dirname, names, dirs):
        """Upload the files of a single walked directory to S3."""
        bucket, key, bucket_name, root_dir = arg

        # Skip directories we don't want to sync
        if os.path.basename(dirname) in self.FILTER_LIST and os.path.dirname(dirname) in self.DIRECTORIES:
            # prevent walk from processing subfiles/subdirs below the ignored one
            del dirs[:]
            return

        # Later we assume the root directory ends with a trailing slash
        if not root_dir.endswith(os.path.sep):
            root_dir = root_dir + os.path.sep

        for file in names:
            headers = {}

            if file in self.FILTER_LIST:
                continue  # Skip files we don't want to sync

            filename = os.path.join(dirname, file)
            if os.path.isdir(filename):
                continue  # Don't try to upload directories

            file_key = filename[len(root_dir):]
            if self.prefix:
                file_key = '%s/%s' % (self.prefix, file_key)

            # Check if file on S3 is older than local file; if so, upload
            if not self.do_force:
                s3_key = bucket.get_key(file_key)
                if s3_key:
                    s3_datetime = datetime.datetime(*time.strptime(
                        s3_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')[0:6])
                    local_datetime = datetime.datetime.utcfromtimestamp(
                        os.stat(filename).st_mtime)
                    if local_datetime < s3_datetime:
                        self.skip_count += 1
                        if self.verbosity > 1:
                            print("File %s hasn't been modified since last being uploaded" % file_key)
                        continue

            # File is newer, let's process and upload
            if self.verbosity > 0:
                print("Uploading %s..." % file_key)

            content_type = mimetypes.guess_type(filename)[0]
            if content_type:
                headers['Content-Type'] = content_type
            else:
                headers['Content-Type'] = 'application/octet-stream'

            file_obj = open(filename, 'rb')
            file_size = os.fstat(file_obj.fileno()).st_size
            filedata = file_obj.read()

            if self.do_gzip:
                # Gzip only if file is large enough (>1K is recommended)
                # and only if file is a common text type (not a binary file)
                if file_size > 1024 and content_type in self.GZIP_CONTENT_TYPES:
                    filedata = self.compress_string(filedata)

                    if self.rename_gzip:
                        # If rename_gzip is True, rename the file by appending
                        # an extension (like '.gz') to the original filename.
                        file_key = '%s.%s' % (
                            file_key, self.SYNC_S3_RENAME_GZIP_EXT)

                    headers['Content-Encoding'] = 'gzip'
                    if self.verbosity > 1:
                        print("\tgzipped: %dk to %dk" % (file_size / 1024, len(filedata) / 1024))

            if self.do_expires:
                # HTTP/1.0
                headers['Expires'] = '%s GMT' % (email.utils.formatdate(
                    time.mktime((datetime.datetime.now() +
                                 datetime.timedelta(days=365 * 2)).timetuple())))
                # HTTP/1.1
                headers['Cache-Control'] = 'max-age=%d' % (3600 * 24 * 365 * 2)
                if self.verbosity > 1:
                    print("\texpires: %s" % headers['Expires'])
                    print("\tcache-control: %s" % headers['Cache-Control'])

            try:
                key.name = file_key
                key.set_contents_from_string(filedata, headers, replace=True,
                                             policy=self.default_acl)
            except boto.exception.S3CreateError as e:
                print("Failed: %s" % e)
            except Exception as e:
                print(e)
                raise
            else:
                self.upload_count += 1
                self.uploaded_files.append(file_key)

            file_obj.close()
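
# A minimal sketch of invoking this command programmatically (assumes the
# settings described in the module docstring are configured; the keyword
# arguments map to the argparse 'dest' names defined in add_arguments above):
#
#   from django.core.management import call_command
#   call_command('sync_s3', gzip=True, expires=True, prefix='assets')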