request.py 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. """
  2. This module provides some useful functions for working with
  3. scrapy.http.Request objects
  4. """
  5. from __future__ import print_function
  6. import hashlib
  7. import weakref
  8. from six.moves.urllib.parse import urlunparse
  9. from w3lib.http import basic_auth_header
  10. from scrapy.utils.python import to_bytes, to_native_str
  11. from w3lib.url import canonicalize_url
  12. from scrapy.utils.httpobj import urlparse_cached
  13. _fingerprint_cache = weakref.WeakKeyDictionary()
  14. def request_fingerprint(request, include_headers=None):
  15. """
  16. Return the request fingerprint.
  17. The request fingerprint is a hash that uniquely identifies the resource the
  18. request points to. For example, take the following two urls:
  19. http://www.example.com/query?id=111&cat=222
  20. http://www.example.com/query?cat=222&id=111
  21. Even though those are two different URLs both point to the same resource
  22. and are equivalent (ie. they should return the same response).
  23. Another example are cookies used to store session ids. Suppose the
  24. following page is only accessible to authenticated users:
  25. http://www.example.com/members/offers.html
  26. Lot of sites use a cookie to store the session id, which adds a random
  27. component to the HTTP Request and thus should be ignored when calculating
  28. the fingerprint.
  29. For this reason, request headers are ignored by default when calculating
  30. the fingeprint. If you want to include specific headers use the
  31. include_headers argument, which is a list of Request headers to include.
  32. """
  33. if include_headers:
  34. include_headers = tuple(to_bytes(h.lower())
  35. for h in sorted(include_headers))
  36. cache = _fingerprint_cache.setdefault(request, {})
  37. if include_headers not in cache:
  38. fp = hashlib.sha1()
  39. fp.update(to_bytes(request.method))
  40. fp.update(to_bytes(canonicalize_url(request.url)))
  41. fp.update(request.body or b'')
  42. if include_headers:
  43. for hdr in include_headers:
  44. if hdr in request.headers:
  45. fp.update(hdr)
  46. for v in request.headers.getlist(hdr):
  47. fp.update(v)
  48. cache[include_headers] = fp.hexdigest()
  49. return cache[include_headers]
  50. def request_authenticate(request, username, password):
  51. """Autenticate the given request (in place) using the HTTP basic access
  52. authentication mechanism (RFC 2617) and the given username and password
  53. """
  54. request.headers['Authorization'] = basic_auth_header(username, password)
  55. def request_httprepr(request):
  56. """Return the raw HTTP representation (as bytes) of the given request.
  57. This is provided only for reference since it's not the actual stream of
  58. bytes that will be send when performing the request (that's controlled
  59. by Twisted).
  60. """
  61. parsed = urlparse_cached(request)
  62. path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
  63. s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
  64. s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
  65. if request.headers:
  66. s += request.headers.to_string() + b"\r\n"
  67. s += b"\r\n"
  68. s += request.body
  69. return s
  70. def referer_str(request):
  71. """ Return Referer HTTP header suitable for logging. """
  72. referrer = request.headers.get('Referer')
  73. if referrer is None:
  74. return referrer
  75. return to_native_str(referrer, errors='replace')