download.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. """Download files with progress indicators.
  2. """
  3. import cgi
  4. import logging
  5. import mimetypes
  6. import os
  7. from pip._vendor.requests.models import CONTENT_CHUNK_SIZE
  8. from pip._internal.cli.progress_bars import DownloadProgressProvider
  9. from pip._internal.exceptions import NetworkConnectionError
  10. from pip._internal.models.index import PyPI
  11. from pip._internal.network.cache import is_from_cache
  12. from pip._internal.network.utils import HEADERS, raise_for_status, response_chunks
  13. from pip._internal.utils.misc import format_size, redact_auth_from_url, splitext
  14. from pip._internal.utils.typing import MYPY_CHECK_RUNNING
  15. if MYPY_CHECK_RUNNING:
  16. from typing import Iterable, Optional, Tuple
  17. from pip._vendor.requests.models import Response
  18. from pip._internal.models.link import Link
  19. from pip._internal.network.session import PipSession
  20. logger = logging.getLogger(__name__)
  def _get_http_response_size(resp):
      # type: (Response) -> Optional[int]
      """Return the size announced by the response's content-length header.

      ``None`` is returned when the header is missing or when its value
      cannot be converted to an integer.
      """
      try:
          announced = resp.headers['content-length']
          size = int(announced)  # type: Optional[int]
      except (KeyError, TypeError, ValueError):
          size = None
      return size
  def _prepare_download(
      resp,  # type: Response
      link,  # type: Link
      progress_bar  # type: str
  ):
      # type: (...) -> Iterable[bytes]
      """Log the upcoming download and return an iterable over its chunks.

      The chunk iterable is wrapped in a progress indicator only when the
      logger's effective level is INFO or lower, the response was not served
      from the cache, and the announced content length is either unknown or
      greater than 40 * 1000.
      """
      total_length = _get_http_response_size(resp)

      # Links on the PyPI file storage domain are logged via their show_url;
      # every other link is logged as its URL without the fragment.
      on_pypi_storage = link.netloc == PyPI.file_storage_domain
      url = link.show_url if on_pypi_storage else link.url_without_fragment

      logged_url = redact_auth_from_url(url)
      if total_length:
          logged_url = '{} ({})'.format(logged_url, format_size(total_length))

      message = "Using cached %s" if is_from_cache(resp) else "Downloading %s"
      logger.info(message, logged_url)

      if logger.getEffectiveLevel() > logging.INFO or is_from_cache(resp):
          show_progress = False
      else:
          show_progress = not total_length or total_length > (40 * 1000)

      chunks = response_chunks(resp, CONTENT_CHUNK_SIZE)
      if show_progress:
          return DownloadProgressProvider(
              progress_bar, max=total_length
          )(chunks)
      return chunks
  def sanitize_content_filename(filename):
      # type: (str) -> str
      """
      Reduce the "filename" value of a Content-Disposition header to its
      final path component, dropping any directory part.
      """
      _directory, final_component = os.path.split(filename)
      return final_component
  def parse_content_disposition(content_disposition, default_filename):
      # type: (str, str) -> str
      """
      Extract the "filename" parameter from a Content-Disposition header.

      The default filename is returned when the parameter is missing or
      empty, or when nothing is left of it after sanitizing.
      """
      header_params = cgi.parse_header(content_disposition)[1]
      raw_name = header_params.get('filename')
      if not raw_name:
          return default_filename
      # The value is sanitized so that ".." path parts in it cannot be used
      # for directory traversal.
      return sanitize_content_filename(raw_name) or default_filename
  def _get_http_response_filename(resp, link):
      # type: (Response, Link) -> str
      """Choose a filename for the downloaded response.

      The link's filename is the starting point and is replaced by the
      Content-Disposition filename when that header is present. If the
      chosen name has no extension, one is appended from the Content-Type
      header or, failing that, from the response URL when it differs from
      the link URL.
      """
      name = link.filename  # used unless the header says otherwise

      disposition = resp.headers.get('content-disposition')
      if disposition:
          name = parse_content_disposition(disposition, name)

      ext = splitext(name)[1]  # type: Optional[str]
      if ext:
          return name

      # No extension yet: first ask the declared content type.
      ext = mimetypes.guess_extension(resp.headers.get('content-type', ''))
      if ext:
          return name + ext

      # Still nothing: use the response URL, but only if it is not the same
      # as the link URL.
      if link.url != resp.url:
          ext = os.path.splitext(resp.url)[1]
          if ext:
              return name + ext
      return name
  def _http_get_download(session, link):
      # type: (PipSession, Link) -> Response
      """Issue a streaming GET for the link and return the response.

      Everything from the first '#' in the link URL onwards is left out of
      the requested URL. raise_for_status() is applied to the response
      before it is handed back.
      """
      target_url, _separator, _fragment = link.url.partition('#')
      resp = session.get(target_url, headers=HEADERS, stream=True)
      raise_for_status(resp)
      return resp
  class Downloader(object):
      """Callable that downloads a single link into a directory."""

      def __init__(
          self,
          session,  # type: PipSession
          progress_bar,  # type: str
      ):
          # type: (...) -> None
          """Store the session and the progress bar setting for later calls."""
          self._progress_bar = progress_bar
          self._session = session

      def __call__(self, link, location):
          # type: (Link, str) -> Tuple[str, str]
          """Fetch ``link`` and write its content to a file under ``location``.

          Returns the path of the written file together with the response's
          Content-Type header ('' when the header is absent). A
          NetworkConnectionError raised by the request is logged and
          re-raised.
          """
          try:
              resp = _http_get_download(self._session, link)
          except NetworkConnectionError as e:
              assert e.response is not None
              status = e.response.status_code
              logger.critical("HTTP error %s while getting %s", status, link)
              raise

          filepath = os.path.join(
              location, _get_http_response_filename(resp, link)
          )
          chunks = _prepare_download(resp, link, self._progress_bar)

          with open(filepath, 'wb') as content_file:
              write_chunk = content_file.write
              for chunk in chunks:
                  write_chunk(chunk)

          return filepath, resp.headers.get('Content-Type', '')
  class BatchDownloader(object):
      """Callable that downloads several links, one after another."""

      def __init__(
          self,
          session,  # type: PipSession
          progress_bar,  # type: str
      ):
          # type: (...) -> None
          """Store the session and the progress bar setting for later calls."""
          self._session, self._progress_bar = session, progress_bar

      def __call__(self, links, location):
          # type: (Iterable[Link], str) -> Iterable[Tuple[str, Tuple[str, str]]]
          """Fetch each link in turn and write it to a file under ``location``.

          This is a generator: for every link it yields the link URL paired
          with a (file path, Content-Type) tuple once that file is written.
          A NetworkConnectionError raised by a request is logged and
          re-raised.
          """
          session = self._session
          for link in links:
              try:
                  resp = _http_get_download(session, link)
              except NetworkConnectionError as e:
                  assert e.response is not None
                  logger.critical(
                      "HTTP error %s while getting %s",
                      e.response.status_code, link,
                  )
                  raise

              target_name = _get_http_response_filename(resp, link)
              filepath = os.path.join(location, target_name)

              chunks = _prepare_download(resp, link, self._progress_bar)
              with open(filepath, 'wb') as content_file:
                  for piece in chunks:
                      content_file.write(piece)

              download_info = (filepath, resp.headers.get('Content-Type', ''))
              yield link.url, download_info