fetchers.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. # -*- test-case-name: openid.test.test_fetchers -*-
  2. """
  3. This module contains the HTTP fetcher interface and several implementations.
  4. """
  5. __all__ = ['fetch', 'getDefaultFetcher', 'setDefaultFetcher', 'HTTPResponse',
  6. 'HTTPFetcher', 'createHTTPFetcher', 'HTTPFetchingError',
  7. 'HTTPError']
  8. import urllib2
  9. import time
  10. import cStringIO
  11. import sys
  12. import openid
  13. import openid.urinorm
  14. # Try to import httplib2 for caching support
  15. # http://bitworking.org/projects/httplib2/
  16. try:
  17. import httplib2
  18. except ImportError:
  19. # httplib2 not available
  20. httplib2 = None
  21. # try to import pycurl, which will let us use CurlHTTPFetcher
  22. try:
  23. import pycurl
  24. except ImportError:
  25. pycurl = None
  26. USER_AGENT = "python-openid/%s (%s)" % (openid.__version__, sys.platform)
  27. MAX_RESPONSE_KB = 1024
  28. def fetch(url, body=None, headers=None):
  29. """Invoke the fetch method on the default fetcher. Most users
  30. should need only this method.
  31. @raises Exception: any exceptions that may be raised by the default fetcher
  32. """
  33. fetcher = getDefaultFetcher()
  34. return fetcher.fetch(url, body, headers)
  35. def createHTTPFetcher():
  36. """Create a default HTTP fetcher instance
  37. prefers Curl to urllib2."""
  38. if pycurl is None:
  39. fetcher = Urllib2Fetcher()
  40. else:
  41. fetcher = CurlHTTPFetcher()
  42. return fetcher
# Contains the currently set HTTP fetcher. If it is set to None, the
# library will call createHTTPFetcher() to set it. Do not access this
# variable outside of this module.
_default_fetcher = None
  47. def getDefaultFetcher():
  48. """Return the default fetcher instance
  49. if no fetcher has been set, it will create a default fetcher.
  50. @return: the default fetcher
  51. @rtype: HTTPFetcher
  52. """
  53. global _default_fetcher
  54. if _default_fetcher is None:
  55. setDefaultFetcher(createHTTPFetcher())
  56. return _default_fetcher
  57. def setDefaultFetcher(fetcher, wrap_exceptions=True):
  58. """Set the default fetcher
  59. @param fetcher: The fetcher to use as the default HTTP fetcher
  60. @type fetcher: HTTPFetcher
  61. @param wrap_exceptions: Whether to wrap exceptions thrown by the
  62. fetcher wil HTTPFetchingError so that they may be caught
  63. easier. By default, exceptions will be wrapped. In general,
  64. unwrapped fetchers are useful for debugging of fetching errors
  65. or if your fetcher raises well-known exceptions that you would
  66. like to catch.
  67. @type wrap_exceptions: bool
  68. """
  69. global _default_fetcher
  70. if fetcher is None or not wrap_exceptions:
  71. _default_fetcher = fetcher
  72. else:
  73. _default_fetcher = ExceptionWrappingFetcher(fetcher)
  74. def usingCurl():
  75. """Whether the currently set HTTP fetcher is a Curl HTTP fetcher."""
  76. return isinstance(getDefaultFetcher(), CurlHTTPFetcher)
  77. class HTTPResponse(object):
  78. """XXX document attributes"""
  79. headers = None
  80. status = None
  81. body = None
  82. final_url = None
  83. def __init__(self, final_url=None, status=None, headers=None, body=None):
  84. self.final_url = final_url
  85. self.status = status
  86. self.headers = headers
  87. self.body = body
  88. def __repr__(self):
  89. return "<%s status %s for %s>" % (self.__class__.__name__,
  90. self.status,
  91. self.final_url)
  92. class HTTPFetcher(object):
  93. """
  94. This class is the interface for openid HTTP fetchers. This
  95. interface is only important if you need to write a new fetcher for
  96. some reason.
  97. """
  98. def fetch(self, url, body=None, headers=None):
  99. """
  100. This performs an HTTP POST or GET, following redirects along
  101. the way. If a body is specified, then the request will be a
  102. POST. Otherwise, it will be a GET.
  103. @param headers: HTTP headers to include with the request
  104. @type headers: {str:str}
  105. @return: An object representing the server's HTTP response. If
  106. there are network or protocol errors, an exception will be
  107. raised. HTTP error responses, like 404 or 500, do not
  108. cause exceptions.
  109. @rtype: L{HTTPResponse}
  110. @raise Exception: Different implementations will raise
  111. different errors based on the underlying HTTP library.
  112. """
  113. raise NotImplementedError
  114. def _allowedURL(url):
  115. return url.startswith('http://') or url.startswith('https://')
  116. class HTTPFetchingError(Exception):
  117. """Exception that is wrapped around all exceptions that are raised
  118. by the underlying fetcher when using the ExceptionWrappingFetcher
  119. @ivar why: The exception that caused this exception
  120. """
  121. def __init__(self, why=None):
  122. Exception.__init__(self, why)
  123. self.why = why
  124. class ExceptionWrappingFetcher(HTTPFetcher):
  125. """Fetcher that wraps another fetcher, causing all exceptions
  126. @cvar uncaught_exceptions: Exceptions that should be exposed to the
  127. user if they are raised by the fetch call
  128. """
  129. uncaught_exceptions = (SystemExit, KeyboardInterrupt, MemoryError)
  130. def __init__(self, fetcher):
  131. self.fetcher = fetcher
  132. def fetch(self, *args, **kwargs):
  133. try:
  134. return self.fetcher.fetch(*args, **kwargs)
  135. except self.uncaught_exceptions:
  136. raise
  137. except:
  138. exc_cls, exc_inst = sys.exc_info()[:2]
  139. if exc_inst is None:
  140. # string exceptions
  141. exc_inst = exc_cls
  142. raise HTTPFetchingError(why=exc_inst)
class Urllib2Fetcher(HTTPFetcher):
    """An C{L{HTTPFetcher}} that uses urllib2.
    """

    # Parameterized for the benefit of testing frameworks, see
    # http://trac.openidenabled.com/trac/ticket/85
    urlopen = staticmethod(urllib2.urlopen)

    def fetch(self, url, body=None, headers=None):
        """Fetch C{url} with urllib2 and return an L{HTTPResponse}.

        @raises ValueError: if the URL scheme is not http or https
        """
        if not _allowedURL(url):
            raise ValueError('Bad URL scheme: %r' % (url,))

        if headers is None:
            headers = {}

        # NOTE(review): setdefault mutates a caller-supplied headers
        # dict; callers sharing a dict across requests will see the
        # User-Agent persist.
        headers.setdefault(
            'User-Agent',
            "%s Python-urllib/%s" % (USER_AGENT, urllib2.__version__,))

        req = urllib2.Request(url, data=body, headers=headers)
        try:
            f = self.urlopen(req)
            try:
                return self._makeResponse(f)
            finally:
                f.close()
        except urllib2.HTTPError, why:
            # urllib2 raises HTTPError for error statuses (404, 500,
            # ...), but this interface treats them as ordinary
            # responses; the HTTPError object doubles as the file-like
            # response and is converted here rather than propagated.
            try:
                return self._makeResponse(why)
            finally:
                why.close()

    def _makeResponse(self, urllib2_response):
        """Translate a urllib2 response object into an L{HTTPResponse}."""
        resp = HTTPResponse()
        # Read at most MAX_RESPONSE_KB kilobytes; a longer body is
        # silently truncated at that limit.
        resp.body = urllib2_response.read(MAX_RESPONSE_KB * 1024)
        resp.final_url = urllib2_response.geturl()
        resp.headers = dict(urllib2_response.info().items())

        if hasattr(urllib2_response, 'code'):
            resp.status = urllib2_response.code
        else:
            # No 'code' attribute on the response object: assume the
            # fetch succeeded (e.g. non-HTTP handlers).
            resp.status = 200

        return resp
  179. class HTTPError(HTTPFetchingError):
  180. """
  181. This exception is raised by the C{L{CurlHTTPFetcher}} when it
  182. encounters an exceptional situation fetching a URL.
  183. """
  184. pass
  185. # XXX: define what we mean by paranoid, and make sure it is.
  186. class CurlHTTPFetcher(HTTPFetcher):
  187. """
  188. An C{L{HTTPFetcher}} that uses pycurl for fetching.
  189. See U{http://pycurl.sourceforge.net/}.
  190. """
  191. ALLOWED_TIME = 20 # seconds
  192. def __init__(self):
  193. HTTPFetcher.__init__(self)
  194. if pycurl is None:
  195. raise RuntimeError('Cannot find pycurl library')
  196. def _parseHeaders(self, header_file):
  197. header_file.seek(0)
  198. # Remove the status line from the beginning of the input
  199. unused_http_status_line = header_file.readline().lower ()
  200. if unused_http_status_line.startswith('http/1.1 100 '):
  201. unused_http_status_line = header_file.readline()
  202. unused_http_status_line = header_file.readline()
  203. lines = [line.strip() for line in header_file]
  204. # and the blank line from the end
  205. empty_line = lines.pop()
  206. if empty_line:
  207. raise HTTPError("No blank line at end of headers: %r" % (line,))
  208. headers = {}
  209. for line in lines:
  210. try:
  211. name, value = line.split(':', 1)
  212. except ValueError:
  213. raise HTTPError(
  214. "Malformed HTTP header line in response: %r" % (line,))
  215. value = value.strip()
  216. # HTTP headers are case-insensitive
  217. name = name.lower()
  218. headers[name] = value
  219. return headers
  220. def _checkURL(self, url):
  221. # XXX: document that this can be overridden to match desired policy
  222. # XXX: make sure url is well-formed and routeable
  223. return _allowedURL(url)
  224. def fetch(self, url, body=None, headers=None):
  225. stop = int(time.time()) + self.ALLOWED_TIME
  226. off = self.ALLOWED_TIME
  227. if headers is None:
  228. headers = {}
  229. headers.setdefault('User-Agent',
  230. "%s %s" % (USER_AGENT, pycurl.version,))
  231. header_list = []
  232. if headers is not None:
  233. for header_name, header_value in headers.iteritems():
  234. header_list.append('%s: %s' % (header_name, header_value))
  235. c = pycurl.Curl()
  236. try:
  237. c.setopt(pycurl.NOSIGNAL, 1)
  238. if header_list:
  239. c.setopt(pycurl.HTTPHEADER, header_list)
  240. # Presence of a body indicates that we should do a POST
  241. if body is not None:
  242. c.setopt(pycurl.POST, 1)
  243. c.setopt(pycurl.POSTFIELDS, body)
  244. while off > 0:
  245. if not self._checkURL(url):
  246. raise HTTPError("Fetching URL not allowed: %r" % (url,))
  247. data = cStringIO.StringIO()
  248. def write_data(chunk):
  249. if data.tell() > 1024*MAX_RESPONSE_KB:
  250. return 0
  251. else:
  252. return data.write(chunk)
  253. response_header_data = cStringIO.StringIO()
  254. c.setopt(pycurl.WRITEFUNCTION, write_data)
  255. c.setopt(pycurl.HEADERFUNCTION, response_header_data.write)
  256. c.setopt(pycurl.TIMEOUT, off)
  257. c.setopt(pycurl.URL, openid.urinorm.urinorm(url))
  258. c.perform()
  259. response_headers = self._parseHeaders(response_header_data)
  260. code = c.getinfo(pycurl.RESPONSE_CODE)
  261. if code in [301, 302, 303, 307]:
  262. url = response_headers.get('location')
  263. if url is None:
  264. raise HTTPError(
  265. 'Redirect (%s) returned without a location' % code)
  266. # Redirects are always GETs
  267. c.setopt(pycurl.POST, 0)
  268. # There is no way to reset POSTFIELDS to empty and
  269. # reuse the connection, but we only use it once.
  270. else:
  271. resp = HTTPResponse()
  272. resp.headers = response_headers
  273. resp.status = code
  274. resp.final_url = url
  275. resp.body = data.getvalue()
  276. return resp
  277. off = stop - int(time.time())
  278. raise HTTPError("Timed out fetching: %r" % (url,))
  279. finally:
  280. c.close()
  281. class HTTPLib2Fetcher(HTTPFetcher):
  282. """A fetcher that uses C{httplib2} for performing HTTP
  283. requests. This implementation supports HTTP caching.
  284. @see: http://bitworking.org/projects/httplib2/
  285. """
  286. def __init__(self, cache=None):
  287. """@param cache: An object suitable for use as an C{httplib2}
  288. cache. If a string is passed, it is assumed to be a
  289. directory name.
  290. """
  291. if httplib2 is None:
  292. raise RuntimeError('Cannot find httplib2 library. '
  293. 'See http://bitworking.org/projects/httplib2/')
  294. super(HTTPLib2Fetcher, self).__init__()
  295. # An instance of the httplib2 object that performs HTTP requests
  296. self.httplib2 = httplib2.Http(cache)
  297. # We want httplib2 to raise exceptions for errors, just like
  298. # the other fetchers.
  299. self.httplib2.force_exception_to_status_code = False
  300. def fetch(self, url, body=None, headers=None):
  301. """Perform an HTTP request
  302. @raises Exception: Any exception that can be raised by httplib2
  303. @see: C{L{HTTPFetcher.fetch}}
  304. """
  305. if body:
  306. method = 'POST'
  307. else:
  308. method = 'GET'
  309. if headers is None:
  310. headers = {}
  311. # httplib2 doesn't check to make sure that the URL's scheme is
  312. # 'http' so we do it here.
  313. if not (url.startswith('http://') or url.startswith('https://')):
  314. raise ValueError('URL is not a HTTP URL: %r' % (url,))
  315. httplib2_response, content = self.httplib2.request(
  316. url, method, body=body, headers=headers)
  317. # Translate the httplib2 response to our HTTP response abstraction
  318. # When a 400 is returned, there is no "content-location"
  319. # header set. This seems like a bug to me. I can't think of a
  320. # case where we really care about the final URL when it is an
  321. # error response, but being careful about it can't hurt.
  322. try:
  323. final_url = httplib2_response['content-location']
  324. except KeyError:
  325. # We're assuming that no redirects occurred
  326. assert not httplib2_response.previous
  327. # And this should never happen for a successful response
  328. assert httplib2_response.status != 200
  329. final_url = url
  330. return HTTPResponse(
  331. body=content,
  332. final_url=final_url,
  333. headers=dict(httplib2_response.items()),
  334. status=httplib2_response.status,
  335. )