import time

from six.moves.http_cookiejar import (
    CookieJar as _CookieJar, DefaultCookiePolicy, IPV4_RE
)

from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_native_str

class CookieJar(object):
    def __init__(self, policy=None, check_expired_frequency=10000):
        self.policy = policy or DefaultCookiePolicy()
        self.jar = _CookieJar(self.policy)
        self.jar._cookies_lock = _DummyLock()
        self.check_expired_frequency = check_expired_frequency
        self.processed = 0

    def extract_cookies(self, response, request):
        wreq = WrappedRequest(request)
        wrsp = WrappedResponse(response)
        return self.jar.extract_cookies(wrsp, wreq)

    def add_cookie_header(self, request):
        wreq = WrappedRequest(request)
        self.policy._now = self.jar._now = int(time.time())

        # The stdlib cookiejar implementation iterates through all stored
        # domains; instead, restrict the lookup to potential matches on the
        # request domain.
        req_host = urlparse_cached(request).hostname
        if not req_host:
            return

        if not IPV4_RE.search(req_host):
            hosts = potential_domain_matches(req_host)
            if '.' not in req_host:
                hosts += [req_host + ".local"]
        else:
            hosts = [req_host]

        cookies = []
        for host in hosts:
            if host in self.jar._cookies:
                cookies += self.jar._cookies_for_domain(host, wreq)

        attrs = self.jar._cookie_attrs(cookies)
        if attrs:
            if not wreq.has_header("Cookie"):
                wreq.add_unredirected_header("Cookie", "; ".join(attrs))

        self.processed += 1
        if self.processed % self.check_expired_frequency == 0:
            # This is still quite inefficient for a large number of cookies
            self.jar.clear_expired_cookies()

    @property
    def _cookies(self):
        return self.jar._cookies

    def clear_session_cookies(self, *args, **kwargs):
        return self.jar.clear_session_cookies(*args, **kwargs)

    def clear(self, domain=None, path=None, name=None):
        return self.jar.clear(domain, path, name)

    def __iter__(self):
        return iter(self.jar)

    def __len__(self):
        return len(self.jar)

    def set_policy(self, pol):
        return self.jar.set_policy(pol)

    def make_cookies(self, response, request):
        wreq = WrappedRequest(request)
        wrsp = WrappedResponse(response)
        return self.jar.make_cookies(wrsp, wreq)

    def set_cookie(self, cookie):
        self.jar.set_cookie(cookie)

    def set_cookie_if_ok(self, cookie, request):
        self.jar.set_cookie_if_ok(cookie, WrappedRequest(request))
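
# A minimal usage sketch (not part of the original module), assuming scrapy's
# Request and Response classes: extract a cookie from one response, then
# re-attach it to a follow-up request.
#
#     from scrapy.http import Request, Response
#
#     jar = CookieJar()
#     request = Request('http://www.example.com/')
#     response = Response('http://www.example.com/',
#                         headers={'Set-Cookie': 'session=abc123; Path=/'},
#                         request=request)
#     jar.extract_cookies(response, request)
#
#     follow_up = Request('http://www.example.com/page')
#     jar.add_cookie_header(follow_up)
#     # follow_up.headers should now carry "Cookie: session=abc123"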

def potential_domain_matches(domain):
    """Potential domain matches for a cookie

    >>> potential_domain_matches('www.example.com')
    ['www.example.com', 'example.com', '.www.example.com', '.example.com']

    """
    matches = [domain]
    try:
        # Strip one label at a time, keeping every parent domain except the
        # bare top-level domain (e.g. 'com'), then add the dot-prefixed
        # variant of each match.
        start = domain.index('.') + 1
        end = domain.rindex('.')
        while start < end:
            matches.append(domain[start:])
            start = domain.index('.', start) + 1
    except ValueError:
        pass
    return matches + ['.' + d for d in matches]

class _DummyLock(object):
    """No-op replacement for the stdlib cookiejar's lock: scrapy accesses
    the jar from a single thread, so locking is unnecessary overhead."""

    def acquire(self):
        pass

    def release(self):
        pass

class WrappedRequest(object):
    """Wraps a scrapy Request with the methods defined by the urllib2.Request
    class, so that it can interact with the stdlib CookieJar.

    see http://docs.python.org/library/urllib2.html#urllib2.Request
    """

    def __init__(self, request):
        self.request = request

    def get_full_url(self):
        return self.request.url

    def get_host(self):
        return urlparse_cached(self.request).netloc

    def get_type(self):
        return urlparse_cached(self.request).scheme

    def is_unverifiable(self):
        """Unverifiable should indicate whether the request is unverifiable,
        as defined by RFC 2965.

        It defaults to False. An unverifiable request is one whose URL the
        user did not have the option to approve. For example, if the request
        is for an image in an HTML document, and the user had no option to
        approve the automatic fetching of the image, this should be True.
        """
        return self.request.meta.get('is_unverifiable', False)

    def get_origin_req_host(self):
        return urlparse_cached(self.request).hostname

    # Python 3 cookiejars use attributes instead of methods
    @property
    def full_url(self):
        return self.get_full_url()

    @property
    def host(self):
        return self.get_host()

    @property
    def type(self):
        return self.get_type()

    @property
    def unverifiable(self):
        return self.is_unverifiable()

    @property
    def origin_req_host(self):
        return self.get_origin_req_host()

    def has_header(self, name):
        return name in self.request.headers

    def get_header(self, name, default=None):
        # Guard against a missing header: to_native_str raises TypeError
        # when given None.
        value = self.request.headers.get(name, default)
        return to_native_str(value, errors='replace') if value is not None else None

    def header_items(self):
        return [
            (to_native_str(k, errors='replace'),
             [to_native_str(x, errors='replace') for x in v])
            for k, v in self.request.headers.items()
        ]

    def add_unredirected_header(self, name, value):
        self.request.headers.appendlist(name, value)
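
# Illustrative sketch (assumption, not in the original module): the wrapper
# exposes a scrapy Request through the urllib2.Request interface that the
# stdlib cookiejar expects.
#
#     wreq = WrappedRequest(Request('http://www.example.com/index.html'))
#     wreq.get_full_url()   # 'http://www.example.com/index.html'
#     wreq.host             # 'www.example.com'
#     wreq.type             # 'http'
#     wreq.unverifiable     # False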

class WrappedResponse(object):

    def __init__(self, response):
        self.response = response

    def info(self):
        # The stdlib cookiejar reads headers via response.info(), so this
        # wrapper stands in as its own message object.
        return self

    # Python 3 cookiejars call get_all (the unused `default` argument
    # matches the stdlib call signature)
    def get_all(self, name, default=None):
        return [to_native_str(v, errors='replace')
                for v in self.response.headers.getlist(name)]

    # Python 2 cookiejars call getheaders
    getheaders = get_all
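
# Illustrative sketch (assumption, not in the original module): how the
# response wrapper looks to the stdlib cookiejar.
#
#     wrsp = WrappedResponse(response)
#     wrsp.info() is wrsp         # True: stands in for the message object
#                                 # that urllib responses normally return
#     wrsp.get_all('Set-Cookie')  # ['session=abc123; Path=/']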