123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360 |
- """
- RefererMiddleware: populates Request referer field, based on the Response which
- originated it.
- """
- from six.moves.urllib.parse import urlparse
- import warnings
- from w3lib.url import safe_url_string
- from scrapy.http import Request, Response
- from scrapy.exceptions import NotConfigured
- from scrapy import signals
- from scrapy.utils.python import to_native_str
- from scrapy.utils.httpobj import urlparse_cached
- from scrapy.utils.misc import load_object
- from scrapy.utils.url import strip_url
# URL schemes treated as "local": the policies below never derive a referrer
# from a parent URL that uses one of them.
LOCAL_SCHEMES = (
    'about',
    'blob',
    'data',
    'filesystem',
)

# Policy names from https://www.w3.org/TR/referrer-policy/#referrer-policies
POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"

# Scrapy-specific policy name (not part of the W3C list above).
POLICY_SCRAPY_DEFAULT = "scrapy-default"
class ReferrerPolicy(object):
    """Base class for referrer policies.

    Concrete policies override :meth:`referrer`; the remaining methods are
    shared helpers for stripping URLs and classifying their schemes.
    """

    # Parent URLs using one of these schemes never yield a referrer through
    # ``stripped_referrer`` / ``origin_referrer``.
    NOREFERRER_SCHEMES = LOCAL_SCHEMES

    def referrer(self, response_url, request_url):
        """Return the referrer value for ``request_url`` given its parent
        ``response_url``, or ``None`` when no referrer should be sent.

        Must be implemented by subclasses.
        """
        raise NotImplementedError()

    def stripped_referrer(self, url):
        """Return ``url`` stripped for use as a referrer, or ``None`` when
        its scheme is listed in ``NOREFERRER_SCHEMES``."""
        scheme = urlparse(url).scheme
        if scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.strip_url(url)

    def origin_referrer(self, url):
        """Return the origin of ``url`` for use as a referrer, or ``None``
        when its scheme is listed in ``NOREFERRER_SCHEMES``."""
        scheme = urlparse(url).scheme
        if scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.origin(url)

    def strip_url(self, url, origin_only=False):
        """Strip ``url`` as described in
        https://www.w3.org/TR/referrer-policy/#strip-url

        The spec's steps are:

        - if url is null, return no referrer;
        - if url's scheme is a local scheme, return no referrer
          (this method does not check the scheme itself; see
          ``stripped_referrer`` and ``origin_referrer``);
        - clear url's username, password and fragment;
        - if the origin-only flag is true, also clear url's path and query.

        Returns ``None`` for an empty/``None`` ``url``; otherwise delegates
        to the module-level ``strip_url`` helper, asking it to drop
        credentials, fragment and default port.
        """
        if url:
            return strip_url(
                url,
                origin_only=origin_only,
                strip_credentials=True,
                strip_default_port=True,
                strip_fragment=True,
            )
        return None

    def origin(self, url):
        """Return the serialized origin of a request or response URL, i.e.
        ``url`` stripped with the origin-only flag set (no path, no query)."""
        return self.strip_url(url, origin_only=True)

    def potentially_trustworthy(self, url):
        """Return ``True`` when ``url`` is considered potentially trustworthy.

        Note: this does not follow
        https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
        -- ``data`` URLs are rejected and everything else is judged by
        :meth:`tls_protected` alone.
        """
        return (urlparse(url).scheme not in ('data',)
                and self.tls_protected(url))

    def tls_protected(self, url):
        """Return ``True`` when the scheme of ``url`` is ``https`` or ``ftps``."""
        scheme = urlparse(url).scheme
        return scheme in ('https', 'ftps')
class NoReferrerPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

    "no-referrer" is the simplest policy: no referrer information is sent
    with any request, whatever its origin. The header is omitted entirely.
    """

    name = POLICY_NO_REFERRER

    def referrer(self, response_url, request_url):
        """Always return ``None``: this policy never provides a referrer."""
        return None
class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade

    "no-referrer-when-downgrade" sends the full (stripped) URL:

    - from a TLS-protected parent to a potentially trustworthy URL, and
    - from a parent that is not TLS-protected to any origin.

    Requests from a TLS-protected parent to a non-potentially-trustworthy
    URL carry no referrer information: no Referer header is sent.

    This is a user agent's default behavior when no policy is specified.
    """

    name = POLICY_NO_REFERRER_WHEN_DOWNGRADE

    def referrer(self, response_url, request_url):
        """Return the stripped parent URL, or ``None`` on a downgrade
        (TLS-protected parent, non-TLS-protected request)."""
        is_downgrade = (self.tls_protected(response_url) and
                        not self.tls_protected(request_url))
        if is_downgrade:
            return None
        return self.stripped_referrer(response_url)
class SameOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin

    "same-origin" sends the full URL, stripped for use as a referrer, with
    same-origin requests only. Cross-origin requests carry no referrer
    information: no Referer header is sent.
    """

    name = POLICY_SAME_ORIGIN

    def referrer(self, response_url, request_url):
        """Return the stripped parent URL when both URLs share an origin,
        ``None`` otherwise."""
        if self.origin(response_url) != self.origin(request_url):
            return None
        return self.stripped_referrer(response_url)
class OriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin

    "origin" sends only the ASCII serialization of the parent's origin,
    for same-origin and cross-origin requests alike.
    """

    name = POLICY_ORIGIN

    def referrer(self, response_url, request_url):
        """Return the origin of the parent URL, whatever the request URL
        (``None`` for schemes in ``NOREFERRER_SCHEMES``)."""
        return self.origin_referrer(response_url)
class StrictOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin

    "strict-origin" sends the ASCII serialization of the parent's origin:

    - from a TLS-protected parent to a potentially trustworthy URL, and
    - from a parent that is not TLS-protected to any origin.

    Requests from a TLS-protected parent to a non-potentially-trustworthy
    URL carry no referrer information: no Referer header is sent.
    """

    name = POLICY_STRICT_ORIGIN

    def referrer(self, response_url, request_url):
        """Return the parent's origin, or ``None`` when a TLS-protected
        parent requests a URL that is not potentially trustworthy."""
        # Same truth table as "(tls and trustworthy) or not tls".
        is_downgrade = (self.tls_protected(response_url) and
                        not self.potentially_trustworthy(request_url))
        if is_downgrade:
            return None
        return self.origin_referrer(response_url)
class OriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin

    The "origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information
    when making same-origin requests from a particular request client,
    and only the ASCII serialization of the origin of the request client
    is sent as referrer information when making cross-origin requests
    from a particular request client.
    """

    name = POLICY_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url, request_url):
        """Return the referrer to send with ``request_url``.

        :param response_url: URL of the parent response
        :param request_url: URL of the request about to be sent
        :return: the stripped parent URL for same-origin requests, the
            parent's origin for cross-origin requests, or ``None`` when the
            parent URL uses a scheme listed in ``NOREFERRER_SCHEMES``
        """
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        # Go through origin_referrer() rather than returning the raw origin,
        # so that the NOREFERRER_SCHEMES filter also applies to the
        # cross-origin branch.
        return self.origin_referrer(response_url)
class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin

    "strict-origin-when-cross-origin" sends the full URL, stripped for use
    as a referrer, with same-origin requests, and only the ASCII
    serialization of the parent's origin with cross-origin requests:

    - from a TLS-protected parent to a potentially trustworthy URL, and
    - from a parent that is not TLS-protected to any origin.

    Requests from a TLS-protected parent to a non-potentially-trustworthy
    URL carry no referrer information: no Referer header is sent.
    """

    name = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url, request_url):
        """Return the stripped parent URL (same origin), the parent's origin
        (cross origin, no downgrade) or ``None`` (cross-origin downgrade)."""
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        # Same truth table as "(tls and trustworthy) or not tls".
        is_downgrade = (self.tls_protected(response_url) and
                        not self.potentially_trustworthy(request_url))
        if is_downgrade:
            return None
        return self.origin_referrer(response_url)
class UnsafeUrlPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url

    "unsafe-url" sends the full URL, stripped for use as a referrer, with
    both cross-origin and same-origin requests.

    Note: the policy's name doesn't lie; it is unsafe. It will leak origins
    and paths from TLS-protected resources to insecure origins. Carefully
    consider the impact of setting such a policy for potentially sensitive
    documents.
    """

    name = POLICY_UNSAFE_URL

    def referrer(self, response_url, request_url):
        """Return the stripped parent URL, whatever the request URL
        (``None`` for schemes in ``NOREFERRER_SCHEMES``)."""
        return self.stripped_referrer(response_url)
class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
    """
    Scrapy's default policy: "no-referrer-when-downgrade", except that
    "Referer" is also not sent when the parent request was using the
    ``file://`` or ``s3://`` scheme.
    """

    name = POLICY_SCRAPY_DEFAULT

    # On top of the local schemes, never derive a referrer from these.
    NOREFERRER_SCHEMES = LOCAL_SCHEMES + ('file', 's3')
# Registry of the policy classes defined in this module, keyed by the policy
# name each class declares in its ``name`` attribute.
_policy_classes = {
    policy_cls.name: policy_cls
    for policy_cls in (
        NoReferrerPolicy,
        NoReferrerWhenDowngradePolicy,
        SameOriginPolicy,
        OriginPolicy,
        StrictOriginPolicy,
        OriginWhenCrossOriginPolicy,
        StrictOriginWhenCrossOriginPolicy,
        UnsafeUrlPolicy,
        DefaultReferrerPolicy,
    )
}

# The empty string maps to "no-referrer-when-downgrade".
# Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
_policy_classes[''] = NoReferrerWhenDowngradePolicy
def _load_policy_class(policy, warning_only=False):
    """
    Resolve ``policy`` to a policy class.

    ``policy`` is first handed to ``load_object`` as the path to a policy
    class; if that raises ``ValueError``, the lower-cased string is looked
    up among the standard values from
    https://www.w3.org/TR/referrer-policy/#referrer-policies

    When neither works, a ``RuntimeError`` is raised, unless
    ``warning_only`` is true, in which case a ``RuntimeWarning`` is issued
    and ``None`` is returned.
    """
    try:
        return load_object(policy)
    except ValueError:
        # Not loadable as an object path: fall back to the policy registry.
        try:
            return _policy_classes[policy.lower()]
        except KeyError:
            message = "Could not load referrer policy %r" % policy
            if warning_only:
                warnings.warn(message, RuntimeWarning)
                return None
            raise RuntimeError(message)
class RefererMiddleware(object):
    """Spider middleware that sets the "Referer" header of requests produced
    by a spider, according to a referrer policy."""

    def __init__(self, settings=None):
        """
        :param settings: optional settings object; when given, its
            ``REFERRER_POLICY`` value selects the default policy class,
            otherwise ``DefaultReferrerPolicy`` is used
        :raises RuntimeError: if the configured policy cannot be loaded
        """
        self.default_policy = DefaultReferrerPolicy
        if settings is not None:
            self.default_policy = _load_policy_class(
                settings.get('REFERRER_POLICY'))

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from ``crawler``.

        :raises NotConfigured: if the ``REFERER_ENABLED`` setting is false
        """
        if not crawler.settings.getbool('REFERER_ENABLED'):
            raise NotConfigured
        mw = cls(crawler.settings)

        # Note: this hook is a bit of a hack to intercept redirections
        crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)

        return mw

    def policy(self, resp_or_url, request):
        """
        Determine Referrer-Policy to use from a parent Response (or URL),
        and a Request to be sent.

        - if a valid policy is set in Request meta, it is used.
        - if the policy is set in meta but is wrong (e.g. a typo error),
          the policy from settings is used
        - if the policy is not set in Request meta,
          but there is a Referrer-policy header in the parent response,
          it is used if valid
        - otherwise, the policy from settings is used.

        A policy set in Request meta may be a standard policy name or the
        path to a policy class. A policy coming from a response header is
        only accepted when it is one of the registered policy names: the
        header is controlled by the remote server, so it must never be
        treated as the path of an object to import.

        :return: a policy instance
        """
        policy_name = request.meta.get('referrer_policy')
        if policy_name is not None:
            # Set by the spider/user: dotted class paths are allowed here.
            cls = _load_policy_class(policy_name, warning_only=True)
            return cls() if cls else self.default_policy()

        if isinstance(resp_or_url, Response):
            policy_header = resp_or_url.headers.get('Referrer-Policy')
            if policy_header is not None:
                header_policy = to_native_str(policy_header.decode('latin1'))
                # Untrusted input: resolve against the registry only.
                cls = _policy_classes.get(header_policy.lower())
                if cls is not None:
                    return cls()
                warnings.warn(
                    "Could not load referrer policy %r" % header_policy,
                    RuntimeWarning)

        return self.default_policy()

    def process_spider_output(self, response, result, spider):
        """Lazily yield the items of ``result``, setting the "Referer"
        header on each ``Request`` that does not already have one, when the
        applicable policy provides a referrer for it."""
        def _set_referer(r):
            if isinstance(r, Request):
                referrer = self.policy(response, r).referrer(response.url, r.url)
                if referrer is not None:
                    # setdefault: an explicitly set Referer is left alone
                    r.headers.setdefault('Referer', referrer)
            return r
        return (_set_referer(r) for r in result or ())

    def request_scheduled(self, request, spider):
        """Signal handler: re-evaluate the "Referer" header of a redirected
        request against the referrer policy, patching or removing it."""
        # check redirected request to patch "Referer" header if necessary
        redirected_urls = request.meta.get('redirect_urls', [])
        if not redirected_urls:
            return

        request_referrer = request.headers.get('Referer')
        # we don't patch the referrer value if there is none
        if request_referrer is None:
            return

        # the request's referrer header value acts as a surrogate
        # for the parent response URL
        #
        # Note: if the 3xx response contained a Referrer-Policy header,
        # the information is not available using this hook
        parent_url = safe_url_string(request_referrer)
        policy_referrer = self.policy(parent_url, request).referrer(
            parent_url, request.url)

        # Decode the header value before comparing: a bytes header value
        # never equals the native-str policy referrer under Python 3, which
        # would make this check always true.
        current_referrer = to_native_str(request_referrer, encoding='latin1')
        if policy_referrer != current_referrer:
            if policy_referrer is None:
                request.headers.pop('Referer')
            else:
                request.headers['Referer'] = policy_referrer
|