| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628 |
- #!/usr/bin/env python
- """
- Download utility as an easy way to get file from the net
-
- python -m wget <URL>
- python wget.py <URL>
- Downloads: http://pypi.python.org/pypi/wget/
- Development: http://bitbucket.org/techtonik/python-wget/
- wget.py is not option compatible with Unix wget utility,
- to make command line interface intuitive for new people.
- Public domain by anatoly techtonik <techtonik@gmail.com>
- Also available under the terms of MIT license
- Copyright (c) 2010-2015 anatoly techtonik
- """
- __version__ = "3.2"
- import sys, shutil, os
- import tempfile
- import math
- PY3K = sys.version_info >= (3, 0)
- if PY3K:
- import urllib.request as ulib
- import urllib.parse as urlparse
- else:
- import urllib as ulib
- import urlparse
- # --- workarounds for Python misbehavior ---
- # enable passing unicode arguments from command line in Python 2.x
- # https://stackoverflow.com/questions/846850/read-unicode-characters
def win32_utf8_argv():
    """Rebuild the script arguments from the wide-character command line.

    Python 2.x on Windows does not support Unicode in sys.argv: the
    underlying Windows API replaces multi-byte characters with '?'.
    This asks shell32.CommandLineToArgvW for the arguments instead.

    :return: list of UTF-8 encoded arguments, one per entry of `sys.argv`
             (empty list if the API reports no arguments)
    """
    from ctypes import POINTER, byref, cdll, c_int, windll
    from ctypes.wintypes import LPCWSTR, LPWSTR

    get_command_line = cdll.kernel32.GetCommandLineW
    get_command_line.argtypes = []
    get_command_line.restype = LPCWSTR

    split_command_line = windll.shell32.CommandLineToArgvW
    split_command_line.argtypes = [LPCWSTR, POINTER(c_int)]
    split_command_line.restype = POINTER(LPWSTR)

    count = c_int(0)
    wide_args = split_command_line(get_command_line(), byref(count))
    total = count.value
    if total <= 0:
        return []

    # Remove Python executable and commands if present: only the trailing
    # len(sys.argv) entries belong to the script.
    first = total - len(sys.argv)
    return [wide_args[i].encode('utf-8') for i in range(first, total)]
- # enable unicode output to windows console
- # https://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash
def win32_unicode_console():
    """Replace sys.stdout / sys.stderr with wrappers that can emit Unicode.

    If at least one of the two streams is attached to a real Windows console,
    both are replaced by `UnicodeOutput` objects: a console stream is written
    with kernel32.WriteConsoleW, a redirected stream keeps its original
    stream object and receives UTF-8 encoded data.  If neither stream is a
    console, nothing is replaced.

    Any exception raised while setting this up is reported to the original
    stderr and then swallowed, so the caller never sees it.
    """
    import codecs
    from ctypes import WINFUNCTYPE, windll, POINTER, byref, c_int
    from ctypes.wintypes import BOOL, HANDLE, DWORD, LPWSTR, LPCWSTR, LPVOID

    # keep a reference to the stream that is current before any patching
    original_stderr = sys.stderr

    # Output exceptions in this code to original_stderr, so that we can at least see them
    def _complain(message):
        original_stderr.write(message if isinstance(message, str) else repr(message))
        original_stderr.write('\n')

    # make the codec name 'cp65001' resolve to the utf-8 codec
    codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)

    try:
        # kernel32 entry points and the constants they are used with
        GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(("GetStdHandle", windll.kernel32))
        STD_OUTPUT_HANDLE = DWORD(-11)
        STD_ERROR_HANDLE = DWORD(-12)
        GetFileType = WINFUNCTYPE(DWORD, DWORD)(("GetFileType", windll.kernel32))
        FILE_TYPE_CHAR = 0x0002
        FILE_TYPE_REMOTE = 0x8000
        GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(("GetConsoleMode", windll.kernel32))
        INVALID_HANDLE_VALUE = DWORD(-1).value

        def not_a_console(handle):
            """Return True unless `handle` is a character device for which
            GetConsoleMode succeeds."""
            if handle == INVALID_HANDLE_VALUE or handle is None:
                return True
            # the FILE_TYPE_REMOTE bit is masked out before comparing the type
            return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                    or GetConsoleMode(handle, byref(DWORD())) == 0)

        old_stdout_fileno = None
        old_stderr_fileno = None
        if hasattr(sys.stdout, 'fileno'):
            old_stdout_fileno = sys.stdout.fileno()
        if hasattr(sys.stderr, 'fileno'):
            old_stderr_fileno = sys.stderr.fileno()

        STDOUT_FILENO = 1
        STDERR_FILENO = 2
        # a stream counts as "real" only if it still uses its standard
        # descriptor number AND (checked below) that handle is a console
        real_stdout = (old_stdout_fileno == STDOUT_FILENO)
        real_stderr = (old_stderr_fileno == STDERR_FILENO)

        if real_stdout:
            hStdout = GetStdHandle(STD_OUTPUT_HANDLE)
            if not_a_console(hStdout):
                real_stdout = False

        if real_stderr:
            hStderr = GetStdHandle(STD_ERROR_HANDLE)
            if not_a_console(hStderr):
                real_stderr = False

        if real_stdout or real_stderr:
            WriteConsoleW = WINFUNCTYPE(BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)(("WriteConsoleW", windll.kernel32))

            class UnicodeOutput:
                """File-like writer backed either by a console handle
                (`hConsole` set, `stream` None) or by an existing stream
                (`hConsole` None, `stream` set)."""

                def __init__(self, hConsole, stream, fileno, name):
                    self._hConsole = hConsole
                    self._stream = stream
                    self._fileno = fileno
                    self.closed = False
                    self.softspace = False
                    self.mode = 'w'
                    self.encoding = 'utf-8'
                    self.name = name
                    self.flush()

                def isatty(self):
                    # always reports False, including for console-backed objects
                    return False

                def close(self):
                    # don't really close the handle, that would only cause problems
                    self.closed = True

                def fileno(self):
                    return self._fileno

                def flush(self):
                    # only the stream-backed variant has something to flush
                    if self._hConsole is None:
                        try:
                            self._stream.flush()
                        except Exception as e:
                            _complain("%s.flush: %r from %r" % (self.name, e, self._stream))
                            raise

                def write(self, text):
                    """Write `text`; failures are reported via _complain and re-raised."""
                    try:
                        if self._hConsole is None:
                            # redirected output: text strings are encoded to UTF-8
                            # NOTE(review): on Python 3 this hands bytes to the wrapped
                            # stream - presumably fails if that is a text stream; confirm
                            if not PY3K and isinstance(text, unicode):
                                text = text.encode('utf-8')
                            elif PY3K and isinstance(text, str):
                                text = text.encode('utf-8')
                            self._stream.write(text)
                        else:
                            # console output: anything that is not text is decoded from UTF-8
                            if not PY3K and not isinstance(text, unicode):
                                text = str(text).decode('utf-8')
                            elif PY3K and not isinstance(text, str):
                                text = text.decode('utf-8')
                            remaining = len(text)
                            while remaining:
                                n = DWORD(0)
                                # There is a shorter-than-documented limitation on the
                                # length of the string passed to WriteConsoleW (see
                                # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
                                retval = WriteConsoleW(self._hConsole, text, min(remaining, 10000), byref(n), None)
                                if retval == 0 or n.value == 0:
                                    raise IOError("WriteConsoleW returned %r, n.value = %r" % (retval, n.value))
                                # n.value characters were consumed; continue with the rest
                                remaining -= n.value
                                if not remaining:
                                    break
                                text = text[n.value:]
                    except Exception as e:
                        _complain("%s.write: %r" % (self.name, e))
                        raise

                def writelines(self, lines):
                    """Write every item of `lines` through write()."""
                    try:
                        for line in lines:
                            self.write(line)
                    except Exception as e:
                        _complain("%s.writelines: %r" % (self.name, e))
                        raise

            # both streams are wrapped once either one is a console; the one
            # that is not a console keeps its previous stream object
            if real_stdout:
                sys.stdout = UnicodeOutput(hStdout, None, STDOUT_FILENO, '<Unicode console stdout>')
            else:
                sys.stdout = UnicodeOutput(None, sys.stdout, old_stdout_fileno, '<Unicode redirected stdout>')

            if real_stderr:
                sys.stderr = UnicodeOutput(hStderr, None, STDERR_FILENO, '<Unicode console stderr>')
            else:
                sys.stderr = UnicodeOutput(None, sys.stderr, old_stderr_fileno, '<Unicode redirected stderr>')
    except Exception as e:
        # best effort: report the problem and leave the streams as they are
        _complain("exception %r while fixing up sys.stdout and sys.stderr" % (e,))
- # --- helpers ---
def to_unicode(filename):
    """Return `filename` as a unicode (text) string.

    Byte strings are decoded from UTF-8; anything else is returned
    unchanged.

    :param filename: byte string or unicode string
    :return: filename decoded from utf-8 to unicode
    :raises UnicodeDecodeError: if a byte string is not valid UTF-8
    """
    # [ ] port filename_from_headers once this works
    # [ ] add test to repository / Travis
    #
    # `bytes` is the plain `str` type on Python 2 and a separate type on
    # Python 3, so one check covers both interpreters without naming the
    # Python 2 only `unicode` builtin.
    if isinstance(filename, bytes):
        return filename.decode('utf-8')
    return filename
def filename_from_url(url):
    """Take the last path segment of `url` as a filename.

    :return: detected filename as unicode, or None when the segment is
             empty or consists only of spaces, newlines, tabs and dots
    """
    # [ ] test urlparse behavior with unicode url
    path = urlparse.urlparse(url).path
    candidate = os.path.basename(path)
    if not candidate.strip(" \n\t."):
        return None
    return to_unicode(candidate)
def filename_from_headers(headers):
    """Detect filename from Content-Disposition headers if present.

    http://greenbytes.de/tech/tc2231/

    :param headers: headers as a mapping with a ``get`` method, a list of
                    "Name: value" lines, or a single string of such lines
    :return: filename from content-disposition header or None
    """
    if isinstance(headers, str):
        headers = headers.splitlines()
    if isinstance(headers, list):
        parsed = {}
        for line in headers:
            # status lines and blank separators carry no "name: value" pair
            if ':' not in line:
                continue
            key, value = line.split(':', 1)
            parsed[key.strip()] = value.strip()
        headers = parsed

    cdisp = headers.get("Content-Disposition")
    if not cdisp:
        return None
    cdtype = cdisp.split(';')
    if len(cdtype) == 1:
        return None
    if cdtype[0].strip().lower() not in ('inline', 'attachment'):
        return None
    # exactly one filename param is expected: several are illegal, and
    # none means there is nothing to detect
    fnames = [x for x in cdtype[1:] if x.strip().startswith('filename=')]
    if len(fnames) != 1:
        return None
    # split on the first '=' only, the value itself may contain '='
    name = fnames[0].split('=', 1)[1].strip(' \t"')
    # drop any directory part supplied by the server
    name = os.path.basename(name)
    return name or None
def filename_fix_existing(filename):
    """Expands name portion of filename with numeric ' (x)' suffix to
    return filename that doesn't exist already.

    The directory part of `filename` (current directory if there is none)
    is scanned for entries named like ``name (N).ext``; the result uses the
    highest N found plus one, or 1 when there is no such entry.

    :param filename: path of a file, with or without extension
    :return: path in the same directory with ' (x)' inserted before the
             extension
    """
    dirname, basename = os.path.split(filename)
    if '.' in basename:
        name, ext = basename.rsplit('.', 1)
        ext = '.' + ext
    else:
        # no extension: nothing to re-attach after the suffix
        name, ext = basename, ''

    digits = set('0123456789')
    indexes = []
    for entry in os.listdir(dirname or u'.'):
        if not entry.startswith(name):
            continue
        # what follows the name once the entry's own extension is removed
        suffix = entry.rsplit('.', 1)[0][len(name):]
        # filter suffixes that match ' (x)' pattern
        if not (suffix.startswith(' (') and suffix.endswith(')')):
            continue
        number = suffix[2:-1]
        # an empty ' ()' suffix is not an index
        if number and set(number) <= digits:
            indexes.append(int(number))

    idx = max(indexes) + 1 if indexes else 1
    return os.path.join(dirname, '%s (%d)%s' % (name, idx, ext))
- # --- terminal/console output helpers ---
def get_console_width():
    """Return width of available window area.

    Autodetection works for Windows and POSIX platforms; on both, 0 is
    returned when the size of the console can not be queried. Returns 80
    for other platforms.

    Code from http://bitbucket.org/techtonik/python-pager
    """
    if os.name == 'nt':
        from ctypes import windll, Structure, byref
        try:
            from ctypes.wintypes import SHORT, WORD, DWORD
        except ImportError:
            # workaround for missing types in Python 2.5
            from ctypes import (
                c_short as SHORT, c_ushort as WORD, c_ulong as DWORD)

        # pieces of the CONSOLE_SCREEN_BUFFER_INFO structure
        class COORD(Structure):
            _fields_ = [("X", SHORT), ("Y", SHORT)]

        class SMALL_RECT(Structure):
            _fields_ = [("Left", SHORT), ("Top", SHORT),
                        ("Right", SHORT), ("Bottom", SHORT)]

        class CONSOLE_SCREEN_BUFFER_INFO(Structure):
            _fields_ = [("dwSize", COORD),
                        ("dwCursorPosition", COORD),
                        ("wAttributes", WORD),
                        ("srWindow", SMALL_RECT),
                        ("dwMaximumWindowSize", DWORD)]

        STD_OUTPUT_HANDLE = -11
        stdout_handle = windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
        info = CONSOLE_SCREEN_BUFFER_INFO()
        ok = windll.kernel32.GetConsoleScreenBufferInfo(
            stdout_handle, byref(info))
        if ok == 0:
            return 0
        # Right is the zero-based index of the last visible column
        return info.srWindow.Right + 1

    if os.name == 'posix':
        from fcntl import ioctl
        from termios import TIOCGWINSZ
        from array import array

        # four unsigned shorts filled in by the ioctl; the second one is
        # the value returned as the width
        size = array("H", [0, 0, 0, 0])
        try:
            ioctl(sys.stdout.fileno(), TIOCGWINSZ, size)
        except IOError:
            # query failed: the buffer keeps its zeros
            pass
        return size[1]

    return 80
def bar_thermometer(current, total, width=80):
    """Return thermometer style progress bar string.

    `total` argument can not be zero. The minimum size of bar returned
    is 3. Example:

        [..........            ]

    Control and trailing symbols (carriage return and spaces) are not
    included. See `bar_adaptive` for more information.
    """
    # two columns are taken by the brackets, the rest is the scale
    scale = width - 2
    filled = int(math.floor(float(current) / total * scale))
    return '[%s%s]' % ('.' * filled, ' ' * (scale - filled))
def bar_adaptive(current, total, width=80):
    """Return progress bar string for given values in one of three
    styles depending on available width:

        [..  ] downloaded / total
        downloaded / total
        [.. ]

    if total value is unknown or <= 0, show bytes counter using two
    adaptive styles:

        %s / unknown
        %s

    if there is not enough space on the screen, do not display anything
    (an empty string is returned)

    returned string doesn't include control characters like the carriage
    return used to place cursor at the beginning of the line to erase
    previous content.

    this function leaves one free character at the end of string to
    avoid automatic linefeed on Windows.

    :param current: amount transferred so far
    :param total: expected amount; None, 0 or negative when unknown
    :param width: number of columns available
    :return: progress string, possibly empty
    """
    # process special case when total size is unknown and return immediately
    if not total or total < 0:
        for text in ("%s / unknown" % current, "%s" % current):
            if len(text) < width:  # leaves one character to avoid linefeed
                return text
        # Nothing fits. The layout below divides by `total`, so it must not
        # be reached with an unknown total.
        return ''

    # --- adaptive layout algorithm ---
    #
    # [x] describe the format of the progress bar
    # [x] describe min width for each data field
    # [x] set priorities for each element
    # [x] select elements to be shown
    #   [x] choose top priority element min_width < avail_width
    #   [x] lessen avail_width by value if min_width
    #   [x] exclude element from priority list and repeat

    #  10% [.. ]  10/100
    # pppp bbbbb sssssss

    min_width = {
        'percent': 4,  # 100%
        'bar': 3,  # [.]
        'size': len("%s" % total)*2 + 3,  # 'xxxx / yyyy'
    }
    priority = ['percent', 'bar', 'size']

    # select elements to show
    selected = []
    avail = width
    for field in priority:
        if min_width[field] < avail:
            selected.append(field)
            # +1 is for separator or for reserved space at
            # the end of line to avoid linefeed on Windows
            avail -= min_width[field] + 1

    # render each selected field, then join them with the separator
    rendered = []
    for field in selected:
        if field == 'percent':
            # fixed size width for percentage
            text = ('%s%%' % (100 * current // total)).rjust(min_width['percent'])
        elif field == 'bar':  # [. ]
            # bar takes its min width + all available space
            text = bar_thermometer(current, total, min_width['bar'] + avail)
        else:
            # size field has a constant width (min == max)
            text = ("%s / %s" % (current, total)).rjust(min_width['size'])
        rendered.append(text)
    return ' '.join(rendered)
- # --/ console helpers
__current_size = 0  # global state variable, which exists solely as a
                    # workaround against Python 3.3.0 regression
                    # http://bugs.python.org/issue16409
                    # fixed in Python 3.3.1


def callback_progress(blocks, block_size, total_size, bar_function):
    """callback function for urlretrieve that is called when connection is
    created and then once for each block

    draws adaptive progress bar in terminal/console

    use sys.stdout.write() instead of "print,", because it allows one more
    symbol at the line end without linefeed on Windows

    :param blocks: number of blocks transferred so far
    :param block_size: in bytes
    :param total_size: in bytes, can be -1 if server doesn't return it
    :param bar_function: another callback function to visualize progress,
                         called as bar_function(current, total, width)
    """
    global __current_size

    width = min(100, get_console_width())

    if sys.version_info[:3] == (3, 3, 0):  # regression workaround
        if blocks == 0:  # first call
            __current_size = 0
        else:
            __current_size += block_size
        current_size = __current_size
    else:
        current_size = blocks * block_size
        # The last block is usually partial, so cap the estimate at the
        # total - but only when the total is known, otherwise the -1
        # placeholder would be reported as the amount downloaded.
        if total_size and total_size > 0:
            current_size = min(current_size, total_size)

    progress = bar_function(current_size, total_size, width)
    if progress:
        sys.stdout.write("\r" + progress)
def detect_filename(url=None, out=None, headers=None, default="download.wget"):
    """Pick the name under which a download is saved.

    Candidates are taken, in order of preference, from the `out`
    argument, from the headers and from the url; `default`
    (download.wget) is returned when none of them gives a name.
    """
    from_out = out if out else ''
    # both detectors run whenever their input is given, even if a
    # higher-priority candidate is already known
    from_url = (filename_from_url(url) or '') if url else ''
    from_headers = (filename_from_headers(headers) or '') if headers else ''
    return from_out or from_headers or from_url or default
def download(url, out=None, bar=bar_adaptive):
    """High level function, which downloads URL into tmp file in current
    directory and then renames it to filename autodetected from either URL
    or HTTP headers.

    :param url: address to download from
    :param bar: function to track download progress (visualize etc.),
                or a false value to disable progress reporting
    :param out: output filename or directory
    :return:    filename where URL is downloaded to

    Errors raised by urlretrieve propagate to the caller; the temporary
    file is removed before they do.
    """
    # detect if out is a directory
    outdir = None
    if out and os.path.isdir(out):
        outdir = out
        out = None

    # get filename for temp file in current directory
    prefix = detect_filename(url, out)
    (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".")
    os.close(fd)
    os.unlink(tmpfile)

    # set progress monitoring callback
    def callback_charged(blocks, block_size, total_size):
        # 'closure' to set bar drawing function in callback
        callback_progress(blocks, block_size, total_size, bar_function=bar)
    callback = callback_charged if bar else None

    if PY3K:
        # Python 3 can not quote URL as needed
        binurl = list(urlparse.urlsplit(url))
        # '%' is kept as is, so escapes already present in the path
        # (e.g. %20) are not encoded a second time into %2520
        binurl[2] = urlparse.quote(binurl[2], safe="/%")
        binurl = urlparse.urlunsplit(binurl)
    else:
        binurl = url

    try:
        (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
    except BaseException:
        # do not leave a partial temp file behind on errors or Ctrl-C
        if os.path.exists(tmpfile):
            os.remove(tmpfile)
        raise

    filename = detect_filename(url, out, headers)
    if outdir:
        filename = os.path.join(outdir, filename)

    # add numeric ' (x)' suffix if filename already exists
    if os.path.exists(filename):
        filename = filename_fix_existing(filename)
    shutil.move(tmpfile, filename)

    return filename
# help text, printed when the URL is missing or help is requested
usage = """\
usage: wget.py [options] URL
options:
-o --output FILE|DIR output filename or directory
-h --help
--version
"""

if __name__ == "__main__":
    if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
        sys.exit(usage)
    if "--version" in sys.argv:
        sys.exit("wget.py " + __version__)

    # patch Python 2.x to read unicode from command line
    if not PY3K and sys.platform == "win32":
        sys.argv = win32_utf8_argv()
    # patch Python to write unicode characters to console
    if sys.platform == "win32":
        win32_unicode_console()

    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-o", "--output", dest="output")
    (options, args) = parser.parse_args()
    # options alone (e.g. "wget.py -o file") leave no URL to download
    if not args:
        sys.exit(usage)

    filename = download(args[0], out=options.output)

    print("")
    print("Saved under %s" % filename)
- r"""
- features that require more tuits for urlretrieve API
- http://www.python.org/doc/2.6/library/urllib.html#urllib.urlretrieve
- [x] autodetect filename from URL
- [x] autodetect filename from headers - Content-Disposition
- http://greenbytes.de/tech/tc2231/
- [ ] make HEAD request to detect temp filename from Content-Disposition
- [ ] process HTTP status codes (i.e. 404 error)
- http://ftp.de.debian.org/debian/pool/iso-codes_3.24.2.orig.tar.bz2
- [ ] catch KeyboardInterrupt
- [ ] optionally preserve incomplete file
- [x] create temp file in current directory
- [ ] resume download (broken connection)
- [ ] resume download (incomplete file)
- [x] show progress indicator
- http://mail.python.org/pipermail/tutor/2005-May/038797.html
- [x] do not overwrite downloaded file
- [x] rename file automatically if exists
- [x] optionally specify path for downloaded file
- [ ] options plan
- [x] -h, --help, --version (CHAOS speccy)
- [ ] clpbar progress bar style
- _ 30.0Mb at 3.0 Mbps eta: 0:00:20 30% [===== ]
- [ ] test "bar \r" print with \r at the end of line on Windows
- [ ] process Python 2.x urllib.ContentTooShortError exception gracefully
- (ideally retry and continue download)
- (tmpfile, headers) = urllib.urlretrieve(url, tmpfile, callback_progress)
- File "C:\Python27\lib\urllib.py", line 93, in urlretrieve
- return _urlopener.retrieve(url, filename, reporthook, data)
- File "C:\Python27\lib\urllib.py", line 283, in retrieve
- "of %i bytes" % (read, size), result)
- urllib.ContentTooShortError: retrieval incomplete: got only 15239952 out of 24807571 bytes
- [ ] find out if urlretrieve may return unicode headers
- [ ] write files with unicode characters
- https://bitbucket.org/techtonik/python-wget/issues/7/filename-issue
- [x] Python 2, Windows
- [x] Python 3, Windows
- [ ] Linux
- [ ] add automatic tests
- [ ] specify unicode URL from command line
- [ ] specify unicode output file from command line
- [ ] test suite for unsafe filenames from url and from headers
- [ ] security checks
- [ ] filename_from_url
- [ ] filename_from_headers
- [ ] MITM redirect from https URL
- [ ] https certificate check
- [ ] size+hash check helpers
- [ ] fail if size is known and mismatch
- [ ] fail if hash mismatch
- """
|