wget.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628
  1. #!/usr/bin/env python
  2. """
  3. Download utility as an easy way to get file from the net
  4. python -m wget <URL>
  5. python wget.py <URL>
  6. Downloads: http://pypi.python.org/pypi/wget/
  7. Development: http://bitbucket.org/techtonik/python-wget/
  8. wget.py is not option compatible with Unix wget utility,
  9. to make command line interface intuitive for new people.
  10. Public domain by anatoly techtonik <techtonik@gmail.com>
  11. Also available under the terms of MIT license
  12. Copyright (c) 2010-2015 anatoly techtonik
  13. """
  14. __version__ = "3.2"
  15. import sys, shutil, os
  16. import tempfile
  17. import math
  18. PY3K = sys.version_info >= (3, 0)
  19. if PY3K:
  20. import urllib.request as ulib
  21. import urllib.parse as urlparse
  22. else:
  23. import urllib as ulib
  24. import urlparse
  25. # --- workarounds for Python misbehavior ---
  26. # enable passing unicode arguments from command line in Python 2.x
  27. # https://stackoverflow.com/questions/846850/read-unicode-characters
  28. def win32_utf8_argv():
  29. """Uses shell32.GetCommandLineArgvW to get sys.argv as a list of Unicode
  30. strings.
  31. Versions 2.x of Python don't support Unicode in sys.argv on
  32. Windows, with the underlying Windows API instead replacing multi-byte
  33. characters with '?'.
  34. """
  35. from ctypes import POINTER, byref, cdll, c_int, windll
  36. from ctypes.wintypes import LPCWSTR, LPWSTR
  37. GetCommandLineW = cdll.kernel32.GetCommandLineW
  38. GetCommandLineW.argtypes = []
  39. GetCommandLineW.restype = LPCWSTR
  40. CommandLineToArgvW = windll.shell32.CommandLineToArgvW
  41. CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
  42. CommandLineToArgvW.restype = POINTER(LPWSTR)
  43. cmd = GetCommandLineW()
  44. argc = c_int(0)
  45. argv = CommandLineToArgvW(cmd, byref(argc))
  46. argnum = argc.value
  47. sysnum = len(sys.argv)
  48. result = []
  49. if argnum > 0:
  50. # Remove Python executable and commands if present
  51. start = argnum - sysnum
  52. for i in range(start, argnum):
  53. result.append(argv[i].encode('utf-8'))
  54. return result
  55. # enable unicode output to windows console
  56. # https://stackoverflow.com/questions/878972/windows-cmd-encoding-change-causes-python-crash
def win32_unicode_console():
    """Replace sys.stdout / sys.stderr with wrappers that can emit Unicode.

    Registers a codec lookup that maps 'cp65001' to utf-8, then checks
    whether stdout/stderr are attached to a real Windows console. If at
    least one of them is, both streams are replaced with `UnicodeOutput`
    objects: console streams are written through WriteConsoleW, the others
    are wrapped around the existing stream object.

    Any exception raised while setting this up is reported to the original
    stderr and not propagated.
    """
    import codecs
    from ctypes import WINFUNCTYPE, windll, POINTER, byref, c_int
    from ctypes.wintypes import BOOL, HANDLE, DWORD, LPWSTR, LPCWSTR, LPVOID

    # keep a reference, because sys.stderr itself may be replaced below
    original_stderr = sys.stderr

    # Output exceptions in this code to original_stderr, so that we can at least see them
    def _complain(message):
        original_stderr.write(message if isinstance(message, str) else repr(message))
        original_stderr.write('\n')

    # make the 'cp65001' codec name resolve to utf-8
    codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None)

    try:
        GetStdHandle = WINFUNCTYPE(HANDLE, DWORD)(("GetStdHandle", windll.kernel32))
        STD_OUTPUT_HANDLE = DWORD(-11)
        STD_ERROR_HANDLE = DWORD(-12)
        GetFileType = WINFUNCTYPE(DWORD, DWORD)(("GetFileType", windll.kernel32))
        FILE_TYPE_CHAR = 0x0002
        FILE_TYPE_REMOTE = 0x8000
        GetConsoleMode = WINFUNCTYPE(BOOL, HANDLE, POINTER(DWORD))(("GetConsoleMode", windll.kernel32))
        INVALID_HANDLE_VALUE = DWORD(-1).value

        def not_a_console(handle):
            # True for an invalid/missing handle, for a handle whose file type
            # (ignoring the REMOTE flag) is not CHAR, or when GetConsoleMode
            # fails on it
            if handle == INVALID_HANDLE_VALUE or handle is None:
                return True
            return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                    or GetConsoleMode(handle, byref(DWORD())) == 0)

        old_stdout_fileno = None
        old_stderr_fileno = None
        if hasattr(sys.stdout, 'fileno'):
            old_stdout_fileno = sys.stdout.fileno()
        if hasattr(sys.stderr, 'fileno'):
            old_stderr_fileno = sys.stderr.fileno()

        STDOUT_FILENO = 1
        STDERR_FILENO = 2
        # a stream is a console candidate only if it still uses the standard
        # descriptor number
        real_stdout = (old_stdout_fileno == STDOUT_FILENO)
        real_stderr = (old_stderr_fileno == STDERR_FILENO)

        if real_stdout:
            hStdout = GetStdHandle(STD_OUTPUT_HANDLE)
            if not_a_console(hStdout):
                real_stdout = False

        if real_stderr:
            hStderr = GetStdHandle(STD_ERROR_HANDLE)
            if not_a_console(hStderr):
                real_stderr = False

        # nothing is patched when neither stream is a console
        if real_stdout or real_stderr:
            WriteConsoleW = WINFUNCTYPE(BOOL, HANDLE, LPWSTR, DWORD, POINTER(DWORD), LPVOID)(("WriteConsoleW", windll.kernel32))

            class UnicodeOutput:
                """File-like object writing either to a console handle
                (`hConsole` is set) or to a wrapped `stream` (`hConsole`
                is None)."""

                def __init__(self, hConsole, stream, fileno, name):
                    self._hConsole = hConsole
                    self._stream = stream
                    self._fileno = fileno
                    self.closed = False
                    self.softspace = False
                    self.mode = 'w'
                    self.encoding = 'utf-8'
                    self.name = name
                    self.flush()

                def isatty(self):
                    return False

                def close(self):
                    # don't really close the handle, that would only cause problems
                    self.closed = True

                def fileno(self):
                    return self._fileno

                def flush(self):
                    # only the wrapped stream is flushed; nothing is done for
                    # a console handle
                    if self._hConsole is None:
                        try:
                            self._stream.flush()
                        except Exception as e:
                            _complain("%s.flush: %r from %r" % (self.name, e, self._stream))
                            raise

                def write(self, text):
                    try:
                        if self._hConsole is None:
                            # redirected stream: text is encoded to utf-8
                            # before being passed on
                            # NOTE(review): on Python 3 this hands bytes to
                            # self._stream; presumably that fails if the
                            # stream is a text stream - confirm
                            if not PY3K and isinstance(text, unicode):
                                text = text.encode('utf-8')
                            elif PY3K and isinstance(text, str):
                                text = text.encode('utf-8')
                            self._stream.write(text)
                        else:
                            # console: WriteConsoleW is given text, so byte
                            # input is decoded from utf-8 first
                            if not PY3K and not isinstance(text, unicode):
                                text = str(text).decode('utf-8')
                            elif PY3K and not isinstance(text, str):
                                text = text.decode('utf-8')
                            remaining = len(text)
                            while remaining:
                                n = DWORD(0)
                                # There is a shorter-than-documented limitation on the
                                # length of the string passed to WriteConsoleW (see
                                # <http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232>.
                                retval = WriteConsoleW(self._hConsole, text, min(remaining, 10000), byref(n), None)
                                if retval == 0 or n.value == 0:
                                    raise IOError("WriteConsoleW returned %r, n.value = %r" % (retval, n.value))
                                remaining -= n.value
                                if not remaining:
                                    break
                                # drop the part that was written and continue
                                text = text[n.value:]
                    except Exception as e:
                        _complain("%s.write: %r" % (self.name, e))
                        raise

                def writelines(self, lines):
                    try:
                        for line in lines:
                            self.write(line)
                    except Exception as e:
                        _complain("%s.writelines: %r" % (self.name, e))
                        raise

            if real_stdout:
                sys.stdout = UnicodeOutput(hStdout, None, STDOUT_FILENO, '<Unicode console stdout>')
            else:
                sys.stdout = UnicodeOutput(None, sys.stdout, old_stdout_fileno, '<Unicode redirected stdout>')

            if real_stderr:
                sys.stderr = UnicodeOutput(hStderr, None, STDERR_FILENO, '<Unicode console stderr>')
            else:
                sys.stderr = UnicodeOutput(None, sys.stderr, old_stderr_fileno, '<Unicode redirected stderr>')
    except Exception as e:
        # setup problems are reported, but never abort the program
        _complain("exception %r while fixing up sys.stdout and sys.stderr" % (e,))
  172. # --- helpers ---
  173. def to_unicode(filename):
  174. """:return: filename decoded from utf-8 to unicode"""
  175. #
  176. if PY3K:
  177. # [ ] test this on Python 3 + (Windows, Linux)
  178. # [ ] port filename_from_headers once this works
  179. # [ ] add test to repository / Travis
  180. return filename
  181. else:
  182. if isinstance(filename, unicode):
  183. return filename
  184. else:
  185. return unicode(filename, 'utf-8')
  186. def filename_from_url(url):
  187. """:return: detected filename as unicode or None"""
  188. # [ ] test urlparse behavior with unicode url
  189. fname = os.path.basename(urlparse.urlparse(url).path)
  190. if len(fname.strip(" \n\t.")) == 0:
  191. return None
  192. return to_unicode(fname)
  193. def filename_from_headers(headers):
  194. """Detect filename from Content-Disposition headers if present.
  195. http://greenbytes.de/tech/tc2231/
  196. :param: headers as dict, list or string
  197. :return: filename from content-disposition header or None
  198. """
  199. if type(headers) == str:
  200. headers = headers.splitlines()
  201. if type(headers) == list:
  202. headers = dict([x.split(':', 1) for x in headers])
  203. cdisp = headers.get("Content-Disposition")
  204. if not cdisp:
  205. return None
  206. cdtype = cdisp.split(';')
  207. if len(cdtype) == 1:
  208. return None
  209. if cdtype[0].strip().lower() not in ('inline', 'attachment'):
  210. return None
  211. # several filename params is illegal, but just in case
  212. fnames = [x for x in cdtype[1:] if x.strip().startswith('filename=')]
  213. if len(fnames) > 1:
  214. return None
  215. name = fnames[0].split('=')[1].strip(' \t"')
  216. name = os.path.basename(name)
  217. if not name:
  218. return None
  219. return name
  220. def filename_fix_existing(filename):
  221. """Expands name portion of filename with numeric ' (x)' suffix to
  222. return filename that doesn't exist already.
  223. """
  224. dirname = u'.'
  225. name, ext = filename.rsplit('.', 1)
  226. names = [x for x in os.listdir(dirname) if x.startswith(name)]
  227. names = [x.rsplit('.', 1)[0] for x in names]
  228. suffixes = [x.replace(name, '') for x in names]
  229. # filter suffixes that match ' (x)' pattern
  230. suffixes = [x[2:-1] for x in suffixes
  231. if x.startswith(' (') and x.endswith(')')]
  232. indexes = [int(x) for x in suffixes
  233. if set(x) <= set('0123456789')]
  234. idx = 1
  235. if indexes:
  236. idx += sorted(indexes)[-1]
  237. return '%s (%d).%s' % (name, idx, ext)
  238. # --- terminal/console output helpers ---
  239. def get_console_width():
  240. """Return width of available window area. Autodetection works for
  241. Windows and POSIX platforms. Returns 80 for others
  242. Code from http://bitbucket.org/techtonik/python-pager
  243. """
  244. if os.name == 'nt':
  245. STD_INPUT_HANDLE = -10
  246. STD_OUTPUT_HANDLE = -11
  247. STD_ERROR_HANDLE = -12
  248. # get console handle
  249. from ctypes import windll, Structure, byref
  250. try:
  251. from ctypes.wintypes import SHORT, WORD, DWORD
  252. except ImportError:
  253. # workaround for missing types in Python 2.5
  254. from ctypes import (
  255. c_short as SHORT, c_ushort as WORD, c_ulong as DWORD)
  256. console_handle = windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
  257. # CONSOLE_SCREEN_BUFFER_INFO Structure
  258. class COORD(Structure):
  259. _fields_ = [("X", SHORT), ("Y", SHORT)]
  260. class SMALL_RECT(Structure):
  261. _fields_ = [("Left", SHORT), ("Top", SHORT),
  262. ("Right", SHORT), ("Bottom", SHORT)]
  263. class CONSOLE_SCREEN_BUFFER_INFO(Structure):
  264. _fields_ = [("dwSize", COORD),
  265. ("dwCursorPosition", COORD),
  266. ("wAttributes", WORD),
  267. ("srWindow", SMALL_RECT),
  268. ("dwMaximumWindowSize", DWORD)]
  269. sbi = CONSOLE_SCREEN_BUFFER_INFO()
  270. ret = windll.kernel32.GetConsoleScreenBufferInfo(
  271. console_handle, byref(sbi))
  272. if ret == 0:
  273. return 0
  274. return sbi.srWindow.Right+1
  275. elif os.name == 'posix':
  276. from fcntl import ioctl
  277. from termios import TIOCGWINSZ
  278. from array import array
  279. winsize = array("H", [0] * 4)
  280. try:
  281. ioctl(sys.stdout.fileno(), TIOCGWINSZ, winsize)
  282. except IOError:
  283. pass
  284. return (winsize[1], winsize[0])[0]
  285. return 80
  286. def bar_thermometer(current, total, width=80):
  287. """Return thermometer style progress bar string. `total` argument
  288. can not be zero. The minimum size of bar returned is 3. Example:
  289. [.......... ]
  290. Control and trailing symbols (\r and spaces) are not included.
  291. See `bar_adaptive` for more information.
  292. """
  293. # number of dots on thermometer scale
  294. avail_dots = width-2
  295. shaded_dots = int(math.floor(float(current) / total * avail_dots))
  296. return '[' + '.'*shaded_dots + ' '*(avail_dots-shaded_dots) + ']'
  297. def bar_adaptive(current, total, width=80):
  298. """Return progress bar string for given values in one of three
  299. styles depending on available width:
  300. [.. ] downloaded / total
  301. downloaded / total
  302. [.. ]
  303. if total value is unknown or <= 0, show bytes counter using two
  304. adaptive styles:
  305. %s / unknown
  306. %s
  307. if there is not enough space on the screen, do not display anything
  308. returned string doesn't include control characters like \r used to
  309. place cursor at the beginning of the line to erase previous content.
  310. this function leaves one free character at the end of string to
  311. avoid automatic linefeed on Windows.
  312. """
  313. # process special case when total size is unknown and return immediately
  314. if not total or total < 0:
  315. msg = "%s / unknown" % current
  316. if len(msg) < width: # leaves one character to avoid linefeed
  317. return msg
  318. if len("%s" % current) < width:
  319. return "%s" % current
  320. # --- adaptive layout algorithm ---
  321. #
  322. # [x] describe the format of the progress bar
  323. # [x] describe min width for each data field
  324. # [x] set priorities for each element
  325. # [x] select elements to be shown
  326. # [x] choose top priority element min_width < avail_width
  327. # [x] lessen avail_width by value if min_width
  328. # [x] exclude element from priority list and repeat
  329. # 10% [.. ] 10/100
  330. # pppp bbbbb sssssss
  331. min_width = {
  332. 'percent': 4, # 100%
  333. 'bar': 3, # [.]
  334. 'size': len("%s" % total)*2 + 3, # 'xxxx / yyyy'
  335. }
  336. priority = ['percent', 'bar', 'size']
  337. # select elements to show
  338. selected = []
  339. avail = width
  340. for field in priority:
  341. if min_width[field] < avail:
  342. selected.append(field)
  343. avail -= min_width[field]+1 # +1 is for separator or for reserved space at
  344. # the end of line to avoid linefeed on Windows
  345. # render
  346. output = ''
  347. for field in selected:
  348. if field == 'percent':
  349. # fixed size width for percentage
  350. output += ('%s%%' % (100 * current // total)).rjust(min_width['percent'])
  351. elif field == 'bar': # [. ]
  352. # bar takes its min width + all available space
  353. output += bar_thermometer(current, total, min_width['bar']+avail)
  354. elif field == 'size':
  355. # size field has a constant width (min == max)
  356. output += ("%s / %s" % (current, total)).rjust(min_width['size'])
  357. selected = selected[1:]
  358. if selected:
  359. output += ' ' # add field separator
  360. return output
  361. # --/ console helpers
  362. __current_size = 0 # global state variable, which exists solely as a
  363. # workaround against Python 3.3.0 regression
  364. # http://bugs.python.org/issue16409
  365. # fixed in Python 3.3.1
  366. def callback_progress(blocks, block_size, total_size, bar_function):
  367. """callback function for urlretrieve that is called when connection is
  368. created and when once for each block
  369. draws adaptive progress bar in terminal/console
  370. use sys.stdout.write() instead of "print,", because it allows one more
  371. symbol at the line end without linefeed on Windows
  372. :param blocks: number of blocks transferred so far
  373. :param block_size: in bytes
  374. :param total_size: in bytes, can be -1 if server doesn't return it
  375. :param bar_function: another callback function to visualize progress
  376. """
  377. global __current_size
  378. width = min(100, get_console_width())
  379. if sys.version_info[:3] == (3, 3, 0): # regression workaround
  380. if blocks == 0: # first call
  381. __current_size = 0
  382. else:
  383. __current_size += block_size
  384. current_size = __current_size
  385. else:
  386. current_size = min(blocks*block_size, total_size)
  387. progress = bar_function(current_size, total_size, width)
  388. if progress:
  389. sys.stdout.write("\r" + progress)
  390. def detect_filename(url=None, out=None, headers=None, default="download.wget"):
  391. """Return filename for saving file. If no filename is detected from output
  392. argument, url or headers, return default (download.wget)
  393. """
  394. names = dict(out='', url='', headers='')
  395. if out:
  396. names["out"] = out or ''
  397. if url:
  398. names["url"] = filename_from_url(url) or ''
  399. if headers:
  400. names["headers"] = filename_from_headers(headers) or ''
  401. return names["out"] or names["headers"] or names["url"] or default
  402. def download(url, out=None, bar=bar_adaptive):
  403. """High level function, which downloads URL into tmp file in current
  404. directory and then renames it to filename autodetected from either URL
  405. or HTTP headers.
  406. :param bar: function to track download progress (visualize etc.)
  407. :param out: output filename or directory
  408. :return: filename where URL is downloaded to
  409. """
  410. # detect of out is a directory
  411. outdir = None
  412. if out and os.path.isdir(out):
  413. outdir = out
  414. out = None
  415. # get filename for temp file in current directory
  416. prefix = detect_filename(url, out)
  417. (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir=".")
  418. os.close(fd)
  419. os.unlink(tmpfile)
  420. # set progress monitoring callback
  421. def callback_charged(blocks, block_size, total_size):
  422. # 'closure' to set bar drawing function in callback
  423. callback_progress(blocks, block_size, total_size, bar_function=bar)
  424. if bar:
  425. callback = callback_charged
  426. else:
  427. callback = None
  428. if PY3K:
  429. # Python 3 can not quote URL as needed
  430. binurl = list(urlparse.urlsplit(url))
  431. binurl[2] = urlparse.quote(binurl[2])
  432. binurl = urlparse.urlunsplit(binurl)
  433. else:
  434. binurl = url
  435. (tmpfile, headers) = ulib.urlretrieve(binurl, tmpfile, callback)
  436. filename = detect_filename(url, out, headers)
  437. if outdir:
  438. filename = outdir + "/" + filename
  439. # add numeric ' (x)' suffix if filename already exists
  440. if os.path.exists(filename):
  441. filename = filename_fix_existing(filename)
  442. shutil.move(tmpfile, filename)
  443. #print headers
  444. return filename
  445. usage = """\
  446. usage: wget.py [options] URL
  447. options:
  448. -o --output FILE|DIR output filename or directory
  449. -h --help
  450. --version
  451. """
  452. if __name__ == "__main__":
  453. if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
  454. sys.exit(usage)
  455. if "--version" in sys.argv:
  456. sys.exit("wget.py " + __version__)
  457. # patch Python 2.x to read unicode from command line
  458. if not PY3K and sys.platform == "win32":
  459. sys.argv = win32_utf8_argv()
  460. # patch Python to write unicode characters to console
  461. if sys.platform == "win32":
  462. win32_unicode_console()
  463. from optparse import OptionParser
  464. parser = OptionParser()
  465. parser.add_option("-o", "--output", dest="output")
  466. (options, args) = parser.parse_args()
  467. url = sys.argv[1]
  468. filename = download(args[0], out=options.output)
  469. print("")
  470. print("Saved under %s" % filename)
  471. r"""
  472. features that require more tuits for urlretrieve API
  473. http://www.python.org/doc/2.6/library/urllib.html#urllib.urlretrieve
  474. [x] autodetect filename from URL
  475. [x] autodetect filename from headers - Content-Disposition
  476. http://greenbytes.de/tech/tc2231/
  477. [ ] make HEAD request to detect temp filename from Content-Disposition
  478. [ ] process HTTP status codes (i.e. 404 error)
  479. http://ftp.de.debian.org/debian/pool/iso-codes_3.24.2.orig.tar.bz2
  480. [ ] catch KeyboardInterrupt
  481. [ ] optionally preserve incomplete file
  482. [x] create temp file in current directory
  483. [ ] resume download (broken connection)
  484. [ ] resume download (incomplete file)
  485. [x] show progress indicator
  486. http://mail.python.org/pipermail/tutor/2005-May/038797.html
  487. [x] do not overwrite downloaded file
  488. [x] rename file automatically if exists
  489. [x] optionally specify path for downloaded file
  490. [ ] options plan
  491. [x] -h, --help, --version (CHAOS speccy)
  492. [ ] clpbar progress bar style
  493. _ 30.0Mb at 3.0 Mbps eta: 0:00:20 30% [===== ]
  494. [ ] test "bar \r" print with \r at the end of line on Windows
  495. [ ] process Python 2.x urllib.ContentTooShortError exception gracefully
  496. (ideally retry and continue download)
  497. (tmpfile, headers) = urllib.urlretrieve(url, tmpfile, callback_progress)
  498. File "C:\Python27\lib\urllib.py", line 93, in urlretrieve
  499. return _urlopener.retrieve(url, filename, reporthook, data)
  500. File "C:\Python27\lib\urllib.py", line 283, in retrieve
  501. "of %i bytes" % (read, size), result)
  502. urllib.ContentTooShortError: retrieval incomplete: got only 15239952 out of 24807571 bytes
  503. [ ] find out if urlretrieve may return unicode headers
  504. [ ] write files with unicode characters
  505. https://bitbucket.org/techtonik/python-wget/issues/7/filename-issue
  506. [x] Python 2, Windows
  507. [x] Python 3, Windows
  508. [ ] Linux
  509. [ ] add automatic tests
  510. [ ] specify unicode URL from command line
  511. [ ] specify unicode output file from command line
  512. [ ] test suite for unsafe filenames from url and from headers
  513. [ ] security checks
  514. [ ] filename_from_url
  515. [ ] filename_from_headers
  516. [ ] MITM redirect from https URL
  517. [ ] https certificate check
  518. [ ] size+hash check helpers
  519. [ ] fail if size is known and mismatch
  520. [ ] fail if hash mismatch
  521. """