# httpproxy.py
  1. import base64
  2. from six.moves.urllib.parse import unquote, urlunparse
  3. from six.moves.urllib.request import getproxies, proxy_bypass
  4. try:
  5. from urllib2 import _parse_proxy
  6. except ImportError:
  7. from urllib.request import _parse_proxy
  8. from scrapy.exceptions import NotConfigured
  9. from scrapy.utils.httpobj import urlparse_cached
  10. from scrapy.utils.python import to_bytes
  11. class HttpProxyMiddleware(object):
  12. def __init__(self, auth_encoding='latin-1'):
  13. self.auth_encoding = auth_encoding
  14. self.proxies = {}
  15. for type_, url in getproxies().items():
  16. self.proxies[type_] = self._get_proxy(url, type_)
  17. @classmethod
  18. def from_crawler(cls, crawler):
  19. if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
  20. raise NotConfigured
  21. auth_encoding = crawler.settings.get('HTTPPROXY_AUTH_ENCODING')
  22. return cls(auth_encoding)
  23. def _basic_auth_header(self, username, password):
  24. user_pass = to_bytes(
  25. '%s:%s' % (unquote(username), unquote(password)),
  26. encoding=self.auth_encoding)
  27. return base64.b64encode(user_pass)
  28. def _get_proxy(self, url, orig_type):
  29. proxy_type, user, password, hostport = _parse_proxy(url)
  30. proxy_url = urlunparse((proxy_type or orig_type, hostport, '', '', '', ''))
  31. if user:
  32. creds = self._basic_auth_header(user, password)
  33. else:
  34. creds = None
  35. return creds, proxy_url
  36. def process_request(self, request, spider):
  37. # ignore if proxy is already set
  38. if 'proxy' in request.meta:
  39. if request.meta['proxy'] is None:
  40. return
  41. # extract credentials if present
  42. creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
  43. request.meta['proxy'] = proxy_url
  44. if creds and not request.headers.get('Proxy-Authorization'):
  45. request.headers['Proxy-Authorization'] = b'Basic ' + creds
  46. return
  47. elif not self.proxies:
  48. return
  49. parsed = urlparse_cached(request)
  50. scheme = parsed.scheme
  51. # 'no_proxy' is only supported by http schemes
  52. if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
  53. return
  54. if scheme in self.proxies:
  55. self._set_proxy(request, scheme)
  56. def _set_proxy(self, request, scheme):
  57. creds, proxy = self.proxies[scheme]
  58. request.meta['proxy'] = proxy
  59. if creds:
  60. request.headers['Proxy-Authorization'] = b'Basic ' + creds