| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- """
- This module provides some useful functions for working with
- scrapy.http.Request objects
- """
- from __future__ import print_function
- import hashlib
- import weakref
- from six.moves.urllib.parse import urlunparse
- from w3lib.http import basic_auth_header
- from scrapy.utils.python import to_bytes, to_native_str
- from w3lib.url import canonicalize_url
- from scrapy.utils.httpobj import urlparse_cached
# Cache of already-computed fingerprints, keyed weakly by request object so
# that an entry goes away together with its request. Each value is a dict
# mapping the normalized ``include_headers`` tuple (or None) to a hex digest.
_fingerprint_cache = weakref.WeakKeyDictionary()


def request_fingerprint(request, include_headers=None):
    """
    Return the request fingerprint.

    The request fingerprint is a hash that uniquely identifies the resource the
    request points to. For example, take the following two urls:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    Even though those are two different URLs both point to the same resource
    and are equivalent (ie. they should return the same response).

    Another example are cookies used to store session ids. Suppose the
    following page is only accessible to authenticated users:

    http://www.example.com/members/offers.html

    Lots of sites use a cookie to store the session id, which adds a random
    component to the HTTP Request and thus should be ignored when calculating
    the fingerprint.

    For this reason, request headers are ignored by default when calculating
    the fingerprint. If you want to include specific headers use the
    include_headers argument, which is a list of Request headers to include.

    :param request: the request to fingerprint; its ``method``, ``url``,
        ``body`` and (optionally) ``headers`` are read.
    :param include_headers: iterable of header names to mix into the hash.
        ``None`` or any empty iterable means "no headers".
    :return: hexadecimal SHA1 digest. The result is cached per request and
        per normalized ``include_headers`` value.
    """
    if include_headers:
        # NOTE: names are sorted *before* being lowercased. That order is kept
        # on purpose: changing it would change fingerprints already computed
        # for mixed-case header lists.
        include_headers = tuple(to_bytes(h.lower())
                                for h in sorted(include_headers))
    else:
        # Collapse every "no headers" spelling ([], (), None, ...) into None.
        # An empty list is unhashable and used to raise TypeError in the cache
        # lookup below; an empty tuple used to create a duplicate cache entry.
        include_headers = None

    cache = _fingerprint_cache.setdefault(request, {})
    if include_headers not in cache:
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                # Headers missing from the request contribute nothing.
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[include_headers] = fp.hexdigest()
    return cache[include_headers]
def request_authenticate(request, username, password):
    """Authenticate the given request (in place) using the HTTP basic access
    authentication mechanism (RFC 2617).

    The request's ``Authorization`` header is set to the value that
    ``basic_auth_header`` builds from the given username and password.
    Nothing is returned; the request is modified directly.
    """
    auth_value = basic_auth_header(username, password)
    request.headers['Authorization'] = auth_value
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.

    The result is for reference only: it is not the actual stream of bytes
    sent when the request is performed (that is controlled by Twisted).

    Layout: request line, ``Host`` header, the request's own headers (when it
    has any), an empty line, then the request body.
    """
    url_parts = urlparse_cached(request)
    # Request target keeps only path (defaulting to "/"), params and query;
    # scheme, netloc and fragment are blanked out.
    target = urlunparse(
        ('', '', url_parts.path or '/', url_parts.params, url_parts.query, ''))

    request_line = (to_bytes(request.method) + b" " + to_bytes(target)
                    + b" HTTP/1.1\r\n")
    # URLs without a hostname produce an empty Host value.
    host_line = b"Host: " + to_bytes(url_parts.hostname or b'') + b"\r\n"
    if request.headers:
        header_block = request.headers.to_string() + b"\r\n"
    else:
        header_block = b""

    return request_line + host_line + header_block + b"\r\n" + request.body
def referer_str(request):
    """Return the request's Referer HTTP header in a form suitable for logging.

    Gives back None when the request has no Referer header; otherwise the
    header value is passed through ``to_native_str`` with ``errors='replace'``.
    """
    raw_referer = request.headers.get('Referer')
    return (None if raw_referer is None
            else to_native_str(raw_referer, errors='replace'))
|