"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""
import warnings
from urllib.parse import urlparse
from w3lib.url import safe_url_string
from scrapy.http import Request, Response
from scrapy.exceptions import NotConfigured
from scrapy import signals
from scrapy.utils.python import to_unicode
from scrapy.utils.misc import load_object
from scrapy.utils.url import strip_url
LOCAL_SCHEMES = ('about', 'blob', 'data', 'filesystem',)
POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"
POLICY_SCRAPY_DEFAULT = "scrapy-default"
class ReferrerPolicy(object):
NOREFERRER_SCHEMES = LOCAL_SCHEMES
def referrer(self, response_url, request_url):
raise NotImplementedError()
def stripped_referrer(self, url):
if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
return self.strip_url(url)
def origin_referrer(self, url):
if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
return self.origin(url)
def strip_url(self, url, origin_only=False):
"""
https://www.w3.org/TR/referrer-policy/#strip-url
If url is null, return no referrer.
If url's scheme is a local scheme, then return no referrer.
Set url's username to the empty string.
Set url's password to null.
Set url's fragment to null.
If the origin-only flag is true, then:
Set url's path to null.
Set url's query to null.
Return url.
"""
if not url:
return None
return strip_url(url,
strip_credentials=True,
strip_fragment=True,
strip_default_port=True,
origin_only=origin_only)
def origin(self, url):
"""Return serialized origin (scheme, host, path) for a request or response URL."""
return self.strip_url(url, origin_only=True)
def potentially_trustworthy(self, url):
# Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
parsed_url = urlparse(url)
if parsed_url.scheme in ('data',):
return False
return self.tls_protected(url)
def tls_protected(self, url):
return urlparse(url).scheme in ('https', 'ftps')
[docs]class NoReferrerPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer
The simplest policy is "no-referrer", which specifies that no referrer information
is to be sent along with requests made from a particular request client to any origin.
The header will be omitted entirely.
"""
name = POLICY_NO_REFERRER
def referrer(self, response_url, request_url):
return None
[docs]class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade
The "no-referrer-when-downgrade" policy sends a full URL along with requests
from a TLS-protected environment settings object to a potentially trustworthy URL,
and requests from clients which are not TLS-protected to any origin.
Requests from TLS-protected clients to non-potentially trustworthy URLs,
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
This is a user agent's default behavior, if no policy is otherwise specified.
"""
name = POLICY_NO_REFERRER_WHEN_DOWNGRADE
def referrer(self, response_url, request_url):
if not self.tls_protected(response_url) or self.tls_protected(request_url):
return self.stripped_referrer(response_url)
[docs]class SameOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin
The "same-origin" policy specifies that a full URL, stripped for use as a referrer,
is sent as referrer information when making same-origin requests from a particular request client.
Cross-origin requests, on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
name = POLICY_SAME_ORIGIN
def referrer(self, response_url, request_url):
if self.origin(response_url) == self.origin(request_url):
return self.stripped_referrer(response_url)
[docs]class OriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-origin
The "origin" policy specifies that only the ASCII serialization
of the origin of the request client is sent as referrer information
when making both same-origin requests and cross-origin requests
from a particular request client.
"""
name = POLICY_ORIGIN
def referrer(self, response_url, request_url):
return self.origin_referrer(response_url)
[docs]class StrictOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin
The "strict-origin" policy sends the ASCII serialization
of the origin of the request client when making requests:
- from a TLS-protected environment settings object to a potentially trustworthy URL, and
- from non-TLS-protected environment settings objects to any origin.
Requests from TLS-protected request clients to non- potentially trustworthy URLs,
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
name = POLICY_STRICT_ORIGIN
def referrer(self, response_url, request_url):
if ((self.tls_protected(response_url) and
self.potentially_trustworthy(request_url))
or not self.tls_protected(response_url)):
return self.origin_referrer(response_url)
[docs]class OriginWhenCrossOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin
The "origin-when-cross-origin" policy specifies that a full URL,
stripped for use as a referrer, is sent as referrer information
when making same-origin requests from a particular request client,
and only the ASCII serialization of the origin of the request client
is sent as referrer information when making cross-origin requests
from a particular request client.
"""
name = POLICY_ORIGIN_WHEN_CROSS_ORIGIN
def referrer(self, response_url, request_url):
origin = self.origin(response_url)
if origin == self.origin(request_url):
return self.stripped_referrer(response_url)
else:
return origin
[docs]class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin
The "strict-origin-when-cross-origin" policy specifies that a full URL,
stripped for use as a referrer, is sent as referrer information
when making same-origin requests from a particular request client,
and only the ASCII serialization of the origin of the request client
when making cross-origin requests:
- from a TLS-protected environment settings object to a potentially trustworthy URL, and
- from non-TLS-protected environment settings objects to any origin.
Requests from TLS-protected clients to non- potentially trustworthy URLs,
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
name = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN
def referrer(self, response_url, request_url):
origin = self.origin(response_url)
if origin == self.origin(request_url):
return self.stripped_referrer(response_url)
elif ((self.tls_protected(response_url) and
self.potentially_trustworthy(request_url))
or not self.tls_protected(response_url)):
return self.origin_referrer(response_url)
[docs]class UnsafeUrlPolicy(ReferrerPolicy):
"""
https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url
The "unsafe-url" policy specifies that a full URL, stripped for use as a referrer,
is sent along with both cross-origin requests
and same-origin requests made from a particular request client.
Note: The policy's name doesn't lie; it is unsafe.
This policy will leak origins and paths from TLS-protected resources
to insecure origins.
Carefully consider the impact of setting such a policy for potentially sensitive documents.
"""
name = POLICY_UNSAFE_URL
def referrer(self, response_url, request_url):
return self.stripped_referrer(response_url)
[docs]class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
"""
A variant of "no-referrer-when-downgrade",
with the addition that "Referer" is not sent if the parent request was
using ``file://`` or ``s3://`` scheme.
"""
NOREFERRER_SCHEMES = LOCAL_SCHEMES + ('file', 's3')
name = POLICY_SCRAPY_DEFAULT
_policy_classes = {p.name: p for p in (
NoReferrerPolicy,
NoReferrerWhenDowngradePolicy,
SameOriginPolicy,
OriginPolicy,
StrictOriginPolicy,
OriginWhenCrossOriginPolicy,
StrictOriginWhenCrossOriginPolicy,
UnsafeUrlPolicy,
DefaultReferrerPolicy,
)}
# Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
_policy_classes[''] = NoReferrerWhenDowngradePolicy
def _load_policy_class(policy, warning_only=False):
"""
Expect a string for the path to the policy class,
otherwise try to interpret the string as a standard value
from https://www.w3.org/TR/referrer-policy/#referrer-policies
"""
try:
return load_object(policy)
except ValueError:
try:
return _policy_classes[policy.lower()]
except KeyError:
msg = "Could not load referrer policy %r" % policy
if not warning_only:
raise RuntimeError(msg)
else:
warnings.warn(msg, RuntimeWarning)
return None
[docs]class RefererMiddleware(object):
def __init__(self, settings=None):
self.default_policy = DefaultReferrerPolicy
if settings is not None:
self.default_policy = _load_policy_class(
settings.get('REFERRER_POLICY'))
@classmethod
def from_crawler(cls, crawler):
if not crawler.settings.getbool('REFERER_ENABLED'):
raise NotConfigured
mw = cls(crawler.settings)
# Note: this hook is a bit of a hack to intercept redirections
crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)
return mw
def policy(self, resp_or_url, request):
"""
Determine Referrer-Policy to use from a parent Response (or URL),
and a Request to be sent.
- if a valid policy is set in Request meta, it is used.
- if the policy is set in meta but is wrong (e.g. a typo error),
the policy from settings is used
- if the policy is not set in Request meta,
but there is a Referrer-policy header in the parent response,
it is used if valid
- otherwise, the policy from settings is used.
"""
policy_name = request.meta.get('referrer_policy')
if policy_name is None:
if isinstance(resp_or_url, Response):
policy_header = resp_or_url.headers.get('Referrer-Policy')
if policy_header is not None:
policy_name = to_unicode(policy_header.decode('latin1'))
if policy_name is None:
return self.default_policy()
cls = _load_policy_class(policy_name, warning_only=True)
return cls() if cls else self.default_policy()
def process_spider_output(self, response, result, spider):
def _set_referer(r):
if isinstance(r, Request):
referrer = self.policy(response, r).referrer(response.url, r.url)
if referrer is not None:
r.headers.setdefault('Referer', referrer)
return r
return (_set_referer(r) for r in result or ())
def request_scheduled(self, request, spider):
# check redirected request to patch "Referer" header if necessary
redirected_urls = request.meta.get('redirect_urls', [])
if redirected_urls:
request_referrer = request.headers.get('Referer')
# we don't patch the referrer value if there is none
if request_referrer is not None:
# the request's referrer header value acts as a surrogate
# for the parent response URL
#
# Note: if the 3xx response contained a Referrer-Policy header,
# the information is not available using this hook
parent_url = safe_url_string(request_referrer)
policy_referrer = self.policy(parent_url, request).referrer(
parent_url, request.url)
if policy_referrer != request_referrer:
if policy_referrer is None:
request.headers.pop('Referer')
else:
request.headers['Referer'] = policy_referrer