cookies.py 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. import os
  2. import six
  3. import logging
  4. from collections import defaultdict
  5. from scrapy.exceptions import NotConfigured
  6. from scrapy.http import Response
  7. from scrapy.http.cookies import CookieJar
  8. from scrapy.utils.python import to_native_str
  9. logger = logging.getLogger(__name__)
  class CookiesMiddleware(object):
      """This middleware enables working with sites that need cookies.

      Cookies are kept in one ``CookieJar`` per "cookiejar" key.  The key is
      read from ``request.meta["cookiejar"]``; requests that do not set it
      share the jar stored under ``None``.  Requests whose meta sets a truthy
      ``dont_merge_cookies`` are left completely untouched.
      """

      def __init__(self, debug=False):
          """Create the middleware.

          :param debug: when true, the cookies sent with each request and
              received with each response are logged at DEBUG level.
          """
          # defaultdict creates a fresh jar the first time a key is looked up.
          self.jars = defaultdict(CookieJar)
          self.debug = debug

      @classmethod
      def from_crawler(cls, crawler):
          """Build the middleware from the crawler settings.

          :raises NotConfigured: if the ``COOKIES_ENABLED`` setting is false.
          :returns: an instance whose debug flag is the ``COOKIES_DEBUG``
              setting.
          """
          if not crawler.settings.getbool('COOKIES_ENABLED'):
              raise NotConfigured
          return cls(crawler.settings.getbool('COOKIES_DEBUG'))

      def process_request(self, request, spider):
          """Store the request's own cookies in its jar and set its Cookie header.

          Returns ``None`` in every case; the request is modified in place.
          """
          if request.meta.get('dont_merge_cookies', False):
              return

          cookiejarkey = request.meta.get("cookiejar")
          jar = self.jars[cookiejarkey]
          # Cookies given explicitly on the request go into the jar first, so
          # that they are taken into account when the header is built below.
          for cookie in self._get_request_cookies(jar, request):
              jar.set_cookie_if_ok(cookie, request)

          # set Cookie header: any pre-existing value is discarded and the
          # header is rebuilt from the jar contents
          request.headers.pop('Cookie', None)
          jar.add_cookie_header(request)
          self._debug_cookie(request, spider)

      def process_response(self, request, response, spider):
          """Record the cookies set by ``response`` in the request's jar.

          :returns: ``response``, unchanged.
          """
          if request.meta.get('dont_merge_cookies', False):
              return response

          # extract cookies from Set-Cookie and drop invalid/expired cookies
          cookiejarkey = request.meta.get("cookiejar")
          jar = self.jars[cookiejarkey]
          jar.extract_cookies(response, request)
          self._debug_set_cookie(response, spider)

          return response

      def _debug_cookie(self, request, spider):
          """Log the ``Cookie`` headers of ``request`` when debugging is on."""
          if not self.debug:
              return
          cl = [to_native_str(c, errors='replace')
                for c in request.headers.getlist('Cookie')]
          if cl:
              cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
              msg = "Sending cookies to: {}\n{}".format(request, cookies)
              logger.debug(msg, extra={'spider': spider})

      def _debug_set_cookie(self, response, spider):
          """Log the ``Set-Cookie`` headers of ``response`` when debugging is on."""
          if not self.debug:
              return
          cl = [to_native_str(c, errors='replace')
                for c in response.headers.getlist('Set-Cookie')]
          if cl:
              cookies = "\n".join("Set-Cookie: {}\n".format(c) for c in cl)
              msg = "Received cookies from: {}\n{}".format(response, cookies)
              logger.debug(msg, extra={'spider': spider})

      def _format_cookie(self, cookie):
          """Render a cookie mapping as a ``Set-Cookie`` style string.

          :param cookie: mapping with ``'name'`` and ``'value'`` entries and
              optional ``'path'`` / ``'domain'`` entries; the optional ones are
              left out of the result when missing or falsy.
          :returns: ``'name=value'`` followed by ``'; Path=...'`` and
              ``'; Domain=...'`` when those are present.
          :raises KeyError: if a dict is given without ``'name'`` or ``'value'``.
          """
          # build cookie string
          cookie_str = '%s=%s' % (cookie['name'], cookie['value'])

          if cookie.get('path', None):
              cookie_str += '; Path=%s' % cookie['path']
          if cookie.get('domain', None):
              cookie_str += '; Domain=%s' % cookie['domain']

          return cookie_str

      def _get_request_cookies(self, jar, request):
          """Turn ``request.cookies`` into cookie objects produced by ``jar``.

          ``request.cookies`` may be a dict mapping names to values, or an
          iterable of cookie mappings accepted by ``_format_cookie``.

          :returns: the result of ``jar.make_cookies`` for the request's
              cookies, or an empty list when the request has none.
          """
          # Nothing to convert: skip building the throwaway response and the
          # cookie parsing, which would only yield an empty list anyway.
          if not request.cookies:
              return []

          if isinstance(request.cookies, dict):
              cookie_list = [{'name': k, 'value': v}
                             for k, v in six.iteritems(request.cookies)]
          else:
              cookie_list = request.cookies

          # The jar builds cookies from response headers, so the request's
          # cookies are presented to it as Set-Cookie headers of a synthetic
          # response for the same URL.
          cookies = [self._format_cookie(x) for x in cookie_list]
          headers = {'Set-Cookie': cookies}
          response = Response(request.url, headers=headers)

          return jar.make_cookies(response, request)