webclient.py

from time import time

from six.moves.urllib.parse import urlparse, urlunparse, urldefrag

from twisted.web.client import HTTPClientFactory
from twisted.web.http import HTTPClient
from twisted.internet import defer

from scrapy.http import Headers
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
from scrapy.responsetypes import responsetypes


def _parsed_url_args(parsed):
    # Assume parsed is urlparse-d from Request.url,
    # which was passed via safe_url_string and is ascii-only.
    b = lambda s: to_bytes(s, encoding='ascii')
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    path = b(path)
    host = b(parsed.hostname)
    port = parsed.port
    scheme = b(parsed.scheme)
    netloc = b(parsed.netloc)
    if port is None:
        port = 443 if scheme == b'https' else 80
    return scheme, netloc, host, port, path
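
# Illustrative sketch of what the helper returns (example values assumed, not
# taken from a real run): a parsed "http://example.com:8080/a?x=1" would yield
#   (b'http', b'example.com:8080', b'example.com', 8080, b'/a?x=1')
# and the port only falls back to 80/443 when the URL does not name one.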


def _parse(url):
    """ Return tuple of (scheme, netloc, host, port, path),
    all in bytes except for port which is int.
    Assume url is from Request.url, which was passed via safe_url_string
    and is ascii-only.
    """
    url = url.strip()
    parsed = urlparse(url)
    return _parsed_url_args(parsed)
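
# Minimal usage sketch (assumed example, not executed here):
#   _parse("https://example.com")
#   -> (b'https', b'example.com', b'example.com', 443, b'/')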


class ScrapyHTTPPageGetter(HTTPClient):

    delimiter = b'\n'

    def connectionMade(self):
        self.headers = Headers()  # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)
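
    # Rough sketch of what connectionMade puts on the wire for a plain GET
    # (illustrative only; the actual header set and order depend on the
    # factory's Headers object):
    #
    #   GET / HTTP/1.0
    #   Host: www.example.com
    #
    #   <optional body bytes>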

    def lineReceived(self, line):
        return HTTPClient.lineReceived(self, line.rstrip())

    def handleHeader(self, key, value):
        self.headers.appendlist(key, value)

    def handleStatus(self, version, status, message):
        self.factory.gotStatus(version, status, message)

    def handleEndHeaders(self):
        self.factory.gotHeaders(self.headers)

    def connectionLost(self, reason):
        self._connection_lost_reason = reason
        HTTPClient.connectionLost(self, reason)
        self.factory.noPage(reason)

    def handleResponse(self, response):
        if self.factory.method.upper() == b'HEAD':
            self.factory.page(b'')
        elif self.length is not None and self.length > 0:
            # fewer bytes arrived than Content-Length announced: treat the
            # truncated response as an error, reporting the connection loss
            self.factory.noPage(self._connection_lost_reason)
        else:
            self.factory.page(response)
        self.transport.loseConnection()

    def timeout(self):
        self.transport.loseConnection()

        # transport cleanup needed for HTTPS connections
        if self.factory.url.startswith(b'https'):
            self.transport.stopProducing()

        self.factory.noPage(
            defer.TimeoutError("Getting %s took longer than %s seconds."
                               % (self.factory.url, self.factory.timeout)))


class ScrapyHTTPClientFactory(HTTPClientFactory):
    """Scrapy implementation of the HTTPClientFactory overriding the
    setUrl method to make use of our Url object, which caches the parse
    result.
    """

    protocol = ScrapyHTTPPageGetter
    waiting = 1
    noisy = False
    followRedirect = False
    afterFoundGet = False

    def __init__(self, request, timeout=180):
        self._url = urldefrag(request.url)[0]
        # converting to bytes to comply with the Twisted interface
        self.url = to_bytes(self._url, encoding='ascii')
        self.method = to_bytes(request.method, encoding='ascii')
        self.body = request.body or None
        self.headers = Headers(request.headers)
        self.response_headers = None
        self.timeout = request.meta.get('download_timeout') or timeout
        self.start_time = time()
        self.deferred = defer.Deferred().addCallback(self._build_response, request)

        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
        # to have _disconnectedDeferred. See Twisted r32329.
        # As Scrapy implements its own redirect handling, there is no need
        # to add the _waitForDisconnect callback.
        # Specifically this avoids the AttributeError exception when the
        # clientConnectionFailed method is called.
        self._disconnectedDeferred = defer.Deferred()

        self._set_connection_attributes(request)

        # set Host header based on url
        self.headers.setdefault('Host', self.netloc)

        # set Content-Length based on the length of the body
        if self.body is not None:
            self.headers['Content-Length'] = len(self.body)
            # just in case a broken http/1.1 decides to keep connection alive
            self.headers.setdefault("Connection", "close")
        # Content-Length must be specified in POST method even with no body
        elif self.method == b'POST':
            self.headers['Content-Length'] = 0
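
    # Rough illustration of the resulting header defaults (assumed values,
    # sketch only): a bodyless GET for http://www.example.com/ gets at least
    # "Host: www.example.com"; a request with body b'a=1' additionally gets
    # "Content-Length: 3" and, unless already set, "Connection: close"; a
    # bodyless POST still gets "Content-Length: 0".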

    def _build_response(self, body, request):
        # latency is measured up to the arrival of the response headers,
        # not the full body
        request.meta['download_latency'] = self.headers_time - self.start_time
        status = int(self.status)
        headers = Headers(self.response_headers)
        respcls = responsetypes.from_args(headers=headers, url=self._url)
        return respcls(url=self._url, status=status, headers=headers, body=body)

    def _set_connection_attributes(self, request):
        parsed = urlparse_cached(request)
        self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
        proxy = request.meta.get('proxy')
        if proxy:
            # when going through an HTTP proxy, connect to the proxy instead
            # and send the absolute URL as the request path
            self.scheme, _, self.host, self.port, _ = _parse(proxy)
            self.path = self.url

    def gotHeaders(self, headers):
        self.headers_time = time()
        self.response_headers = headers
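

# Minimal usage sketch (illustrative, assuming a running Twisted reactor; the
# wiring shown here is an assumption about how a download handler would drive
# the factory, not part of this module):
#
#   from twisted.internet import reactor
#   from scrapy.http import Request
#
#   request = Request('http://www.example.com/')
#   factory = ScrapyHTTPClientFactory(request)
#   reactor.connectTCP(factory.host.decode('ascii'), factory.port, factory)
#   factory.deferred.addCallback(lambda response: print(response.status))
#   # https URLs would instead be connected with reactor.connectSSL and a
#   # client TLS context factory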