referer.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. """
  2. RefererMiddleware: populates Request referer field, based on the Response which
  3. originated it.
  4. """
  5. from six.moves.urllib.parse import urlparse
  6. import warnings
  7. from w3lib.url import safe_url_string
  8. from scrapy.http import Request, Response
  9. from scrapy.exceptions import NotConfigured
  10. from scrapy import signals
  11. from scrapy.utils.python import to_native_str
  12. from scrapy.utils.httpobj import urlparse_cached
  13. from scrapy.utils.misc import load_object
  14. from scrapy.utils.url import strip_url
  15. LOCAL_SCHEMES = ('about', 'blob', 'data', 'filesystem',)
  16. POLICY_NO_REFERRER = "no-referrer"
  17. POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
  18. POLICY_SAME_ORIGIN = "same-origin"
  19. POLICY_ORIGIN = "origin"
  20. POLICY_STRICT_ORIGIN = "strict-origin"
  21. POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
  22. POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
  23. POLICY_UNSAFE_URL = "unsafe-url"
  24. POLICY_SCRAPY_DEFAULT = "scrapy-default"
  25. class ReferrerPolicy(object):
  26. NOREFERRER_SCHEMES = LOCAL_SCHEMES
  27. def referrer(self, response_url, request_url):
  28. raise NotImplementedError()
  29. def stripped_referrer(self, url):
  30. if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
  31. return self.strip_url(url)
  32. def origin_referrer(self, url):
  33. if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
  34. return self.origin(url)
  35. def strip_url(self, url, origin_only=False):
  36. """
  37. https://www.w3.org/TR/referrer-policy/#strip-url
  38. If url is null, return no referrer.
  39. If url's scheme is a local scheme, then return no referrer.
  40. Set url's username to the empty string.
  41. Set url's password to null.
  42. Set url's fragment to null.
  43. If the origin-only flag is true, then:
  44. Set url's path to null.
  45. Set url's query to null.
  46. Return url.
  47. """
  48. if not url:
  49. return None
  50. return strip_url(url,
  51. strip_credentials=True,
  52. strip_fragment=True,
  53. strip_default_port=True,
  54. origin_only=origin_only)
  55. def origin(self, url):
  56. """Return serialized origin (scheme, host, path) for a request or response URL."""
  57. return self.strip_url(url, origin_only=True)
  58. def potentially_trustworthy(self, url):
  59. # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
  60. parsed_url = urlparse(url)
  61. if parsed_url.scheme in ('data',):
  62. return False
  63. return self.tls_protected(url)
  64. def tls_protected(self, url):
  65. return urlparse(url).scheme in ('https', 'ftps')
  66. class NoReferrerPolicy(ReferrerPolicy):
  67. """
  68. https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer
  69. The simplest policy is "no-referrer", which specifies that no referrer information
  70. is to be sent along with requests made from a particular request client to any origin.
  71. The header will be omitted entirely.
  72. """
  73. name = POLICY_NO_REFERRER
  74. def referrer(self, response_url, request_url):
  75. return None
  76. class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
  77. """
  78. https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade
  79. The "no-referrer-when-downgrade" policy sends a full URL along with requests
  80. from a TLS-protected environment settings object to a potentially trustworthy URL,
  81. and requests from clients which are not TLS-protected to any origin.
  82. Requests from TLS-protected clients to non-potentially trustworthy URLs,
  83. on the other hand, will contain no referrer information.
  84. A Referer HTTP header will not be sent.
  85. This is a user agent's default behavior, if no policy is otherwise specified.
  86. """
  87. name = POLICY_NO_REFERRER_WHEN_DOWNGRADE
  88. def referrer(self, response_url, request_url):
  89. if not self.tls_protected(response_url) or self.tls_protected(request_url):
  90. return self.stripped_referrer(response_url)
  91. class SameOriginPolicy(ReferrerPolicy):
  92. """
  93. https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin
  94. The "same-origin" policy specifies that a full URL, stripped for use as a referrer,
  95. is sent as referrer information when making same-origin requests from a particular request client.
  96. Cross-origin requests, on the other hand, will contain no referrer information.
  97. A Referer HTTP header will not be sent.
  98. """
  99. name = POLICY_SAME_ORIGIN
  100. def referrer(self, response_url, request_url):
  101. if self.origin(response_url) == self.origin(request_url):
  102. return self.stripped_referrer(response_url)
  103. class OriginPolicy(ReferrerPolicy):
  104. """
  105. https://www.w3.org/TR/referrer-policy/#referrer-policy-origin
  106. The "origin" policy specifies that only the ASCII serialization
  107. of the origin of the request client is sent as referrer information
  108. when making both same-origin requests and cross-origin requests
  109. from a particular request client.
  110. """
  111. name = POLICY_ORIGIN
  112. def referrer(self, response_url, request_url):
  113. return self.origin_referrer(response_url)
  114. class StrictOriginPolicy(ReferrerPolicy):
  115. """
  116. https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin
  117. The "strict-origin" policy sends the ASCII serialization
  118. of the origin of the request client when making requests:
  119. - from a TLS-protected environment settings object to a potentially trustworthy URL, and
  120. - from non-TLS-protected environment settings objects to any origin.
  121. Requests from TLS-protected request clients to non- potentially trustworthy URLs,
  122. on the other hand, will contain no referrer information.
  123. A Referer HTTP header will not be sent.
  124. """
  125. name = POLICY_STRICT_ORIGIN
  126. def referrer(self, response_url, request_url):
  127. if ((self.tls_protected(response_url) and
  128. self.potentially_trustworthy(request_url))
  129. or not self.tls_protected(response_url)):
  130. return self.origin_referrer(response_url)
  131. class OriginWhenCrossOriginPolicy(ReferrerPolicy):
  132. """
  133. https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin
  134. The "origin-when-cross-origin" policy specifies that a full URL,
  135. stripped for use as a referrer, is sent as referrer information
  136. when making same-origin requests from a particular request client,
  137. and only the ASCII serialization of the origin of the request client
  138. is sent as referrer information when making cross-origin requests
  139. from a particular request client.
  140. """
  141. name = POLICY_ORIGIN_WHEN_CROSS_ORIGIN
  142. def referrer(self, response_url, request_url):
  143. origin = self.origin(response_url)
  144. if origin == self.origin(request_url):
  145. return self.stripped_referrer(response_url)
  146. else:
  147. return origin
  148. class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
  149. """
  150. https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin
  151. The "strict-origin-when-cross-origin" policy specifies that a full URL,
  152. stripped for use as a referrer, is sent as referrer information
  153. when making same-origin requests from a particular request client,
  154. and only the ASCII serialization of the origin of the request client
  155. when making cross-origin requests:
  156. - from a TLS-protected environment settings object to a potentially trustworthy URL, and
  157. - from non-TLS-protected environment settings objects to any origin.
  158. Requests from TLS-protected clients to non- potentially trustworthy URLs,
  159. on the other hand, will contain no referrer information.
  160. A Referer HTTP header will not be sent.
  161. """
  162. name = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN
  163. def referrer(self, response_url, request_url):
  164. origin = self.origin(response_url)
  165. if origin == self.origin(request_url):
  166. return self.stripped_referrer(response_url)
  167. elif ((self.tls_protected(response_url) and
  168. self.potentially_trustworthy(request_url))
  169. or not self.tls_protected(response_url)):
  170. return self.origin_referrer(response_url)
  171. class UnsafeUrlPolicy(ReferrerPolicy):
  172. """
  173. https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url
  174. The "unsafe-url" policy specifies that a full URL, stripped for use as a referrer,
  175. is sent along with both cross-origin requests
  176. and same-origin requests made from a particular request client.
  177. Note: The policy's name doesn't lie; it is unsafe.
  178. This policy will leak origins and paths from TLS-protected resources
  179. to insecure origins.
  180. Carefully consider the impact of setting such a policy for potentially sensitive documents.
  181. """
  182. name = POLICY_UNSAFE_URL
  183. def referrer(self, response_url, request_url):
  184. return self.stripped_referrer(response_url)
  185. class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
  186. """
  187. A variant of "no-referrer-when-downgrade",
  188. with the addition that "Referer" is not sent if the parent request was
  189. using ``file://`` or ``s3://`` scheme.
  190. """
  191. NOREFERRER_SCHEMES = LOCAL_SCHEMES + ('file', 's3')
  192. name = POLICY_SCRAPY_DEFAULT
  193. _policy_classes = {p.name: p for p in (
  194. NoReferrerPolicy,
  195. NoReferrerWhenDowngradePolicy,
  196. SameOriginPolicy,
  197. OriginPolicy,
  198. StrictOriginPolicy,
  199. OriginWhenCrossOriginPolicy,
  200. StrictOriginWhenCrossOriginPolicy,
  201. UnsafeUrlPolicy,
  202. DefaultReferrerPolicy,
  203. )}
  204. # Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
  205. _policy_classes[''] = NoReferrerWhenDowngradePolicy
  206. def _load_policy_class(policy, warning_only=False):
  207. """
  208. Expect a string for the path to the policy class,
  209. otherwise try to interpret the string as a standard value
  210. from https://www.w3.org/TR/referrer-policy/#referrer-policies
  211. """
  212. try:
  213. return load_object(policy)
  214. except ValueError:
  215. try:
  216. return _policy_classes[policy.lower()]
  217. except KeyError:
  218. msg = "Could not load referrer policy %r" % policy
  219. if not warning_only:
  220. raise RuntimeError(msg)
  221. else:
  222. warnings.warn(msg, RuntimeWarning)
  223. return None
  224. class RefererMiddleware(object):
  225. def __init__(self, settings=None):
  226. self.default_policy = DefaultReferrerPolicy
  227. if settings is not None:
  228. self.default_policy = _load_policy_class(
  229. settings.get('REFERRER_POLICY'))
  230. @classmethod
  231. def from_crawler(cls, crawler):
  232. if not crawler.settings.getbool('REFERER_ENABLED'):
  233. raise NotConfigured
  234. mw = cls(crawler.settings)
  235. # Note: this hook is a bit of a hack to intercept redirections
  236. crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)
  237. return mw
  238. def policy(self, resp_or_url, request):
  239. """
  240. Determine Referrer-Policy to use from a parent Response (or URL),
  241. and a Request to be sent.
  242. - if a valid policy is set in Request meta, it is used.
  243. - if the policy is set in meta but is wrong (e.g. a typo error),
  244. the policy from settings is used
  245. - if the policy is not set in Request meta,
  246. but there is a Referrer-policy header in the parent response,
  247. it is used if valid
  248. - otherwise, the policy from settings is used.
  249. """
  250. policy_name = request.meta.get('referrer_policy')
  251. if policy_name is None:
  252. if isinstance(resp_or_url, Response):
  253. policy_header = resp_or_url.headers.get('Referrer-Policy')
  254. if policy_header is not None:
  255. policy_name = to_native_str(policy_header.decode('latin1'))
  256. if policy_name is None:
  257. return self.default_policy()
  258. cls = _load_policy_class(policy_name, warning_only=True)
  259. return cls() if cls else self.default_policy()
  260. def process_spider_output(self, response, result, spider):
  261. def _set_referer(r):
  262. if isinstance(r, Request):
  263. referrer = self.policy(response, r).referrer(response.url, r.url)
  264. if referrer is not None:
  265. r.headers.setdefault('Referer', referrer)
  266. return r
  267. return (_set_referer(r) for r in result or ())
  268. def request_scheduled(self, request, spider):
  269. # check redirected request to patch "Referer" header if necessary
  270. redirected_urls = request.meta.get('redirect_urls', [])
  271. if redirected_urls:
  272. request_referrer = request.headers.get('Referer')
  273. # we don't patch the referrer value if there is none
  274. if request_referrer is not None:
  275. # the request's referrer header value acts as a surrogate
  276. # for the parent response URL
  277. #
  278. # Note: if the 3xx response contained a Referrer-Policy header,
  279. # the information is not available using this hook
  280. parent_url = safe_url_string(request_referrer)
  281. policy_referrer = self.policy(parent_url, request).referrer(
  282. parent_url, request.url)
  283. if policy_referrer != request_referrer:
  284. if policy_referrer is None:
  285. request.headers.pop('Referer')
  286. else:
  287. request.headers['Referer'] = policy_referrer