text.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176
  1. """
  2. This module implements the TextResponse class which adds encoding handling and
  3. discovering (through HTTP headers) to base Response class.
  4. See documentation in docs/topics/request-response.rst
  5. """
  6. import six
  7. from six.moves.urllib.parse import urljoin
  8. import parsel
  9. from w3lib.encoding import html_to_unicode, resolve_encoding, \
  10. html_body_declared_encoding, http_content_type_encoding
  11. from w3lib.html import strip_html5_whitespace
  12. from scrapy.http.request import Request
  13. from scrapy.http.response import Response
  14. from scrapy.utils.response import get_base_url
  15. from scrapy.utils.python import memoizemethod_noargs, to_native_str
  16. class TextResponse(Response):
  17. _DEFAULT_ENCODING = 'ascii'
  18. def __init__(self, *args, **kwargs):
  19. self._encoding = kwargs.pop('encoding', None)
  20. self._cached_benc = None
  21. self._cached_ubody = None
  22. self._cached_selector = None
  23. super(TextResponse, self).__init__(*args, **kwargs)
  24. def _set_url(self, url):
  25. if isinstance(url, six.text_type):
  26. if six.PY2 and self.encoding is None:
  27. raise TypeError("Cannot convert unicode url - %s "
  28. "has no encoding" % type(self).__name__)
  29. self._url = to_native_str(url, self.encoding)
  30. else:
  31. super(TextResponse, self)._set_url(url)
  32. def _set_body(self, body):
  33. self._body = b'' # used by encoding detection
  34. if isinstance(body, six.text_type):
  35. if self._encoding is None:
  36. raise TypeError('Cannot convert unicode body - %s has no encoding' %
  37. type(self).__name__)
  38. self._body = body.encode(self._encoding)
  39. else:
  40. super(TextResponse, self)._set_body(body)
  41. def replace(self, *args, **kwargs):
  42. kwargs.setdefault('encoding', self.encoding)
  43. return Response.replace(self, *args, **kwargs)
  44. @property
  45. def encoding(self):
  46. return self._declared_encoding() or self._body_inferred_encoding()
  47. def _declared_encoding(self):
  48. return self._encoding or self._headers_encoding() \
  49. or self._body_declared_encoding()
  50. def body_as_unicode(self):
  51. """Return body as unicode"""
  52. return self.text
  53. @property
  54. def text(self):
  55. """ Body as unicode """
  56. # access self.encoding before _cached_ubody to make sure
  57. # _body_inferred_encoding is called
  58. benc = self.encoding
  59. if self._cached_ubody is None:
  60. charset = 'charset=%s' % benc
  61. self._cached_ubody = html_to_unicode(charset, self.body)[1]
  62. return self._cached_ubody
  63. def urljoin(self, url):
  64. """Join this Response's url with a possible relative url to form an
  65. absolute interpretation of the latter."""
  66. return urljoin(get_base_url(self), url)
  67. @memoizemethod_noargs
  68. def _headers_encoding(self):
  69. content_type = self.headers.get(b'Content-Type', b'')
  70. return http_content_type_encoding(to_native_str(content_type))
  71. def _body_inferred_encoding(self):
  72. if self._cached_benc is None:
  73. content_type = to_native_str(self.headers.get(b'Content-Type', b''))
  74. benc, ubody = html_to_unicode(content_type, self.body,
  75. auto_detect_fun=self._auto_detect_fun,
  76. default_encoding=self._DEFAULT_ENCODING)
  77. self._cached_benc = benc
  78. self._cached_ubody = ubody
  79. return self._cached_benc
  80. def _auto_detect_fun(self, text):
  81. for enc in (self._DEFAULT_ENCODING, 'utf-8', 'cp1252'):
  82. try:
  83. text.decode(enc)
  84. except UnicodeError:
  85. continue
  86. return resolve_encoding(enc)
  87. @memoizemethod_noargs
  88. def _body_declared_encoding(self):
  89. return html_body_declared_encoding(self.body)
  90. @property
  91. def selector(self):
  92. from scrapy.selector import Selector
  93. if self._cached_selector is None:
  94. self._cached_selector = Selector(self)
  95. return self._cached_selector
  96. def xpath(self, query, **kwargs):
  97. return self.selector.xpath(query, **kwargs)
  98. def css(self, query):
  99. return self.selector.css(query)
  100. def follow(self, url, callback=None, method='GET', headers=None, body=None,
  101. cookies=None, meta=None, encoding=None, priority=0,
  102. dont_filter=False, errback=None, cb_kwargs=None):
  103. # type: (...) -> Request
  104. """
  105. Return a :class:`~.Request` instance to follow a link ``url``.
  106. It accepts the same arguments as ``Request.__init__`` method,
  107. but ``url`` can be not only an absolute URL, but also
  108. * a relative URL;
  109. * a scrapy.link.Link object (e.g. a link extractor result);
  110. * an attribute Selector (not SelectorList) - e.g.
  111. ``response.css('a::attr(href)')[0]`` or
  112. ``response.xpath('//img/@src')[0]``.
  113. * a Selector for ``<a>`` or ``<link>`` element, e.g.
  114. ``response.css('a.my_link')[0]``.
  115. See :ref:`response-follow-example` for usage examples.
  116. """
  117. if isinstance(url, parsel.Selector):
  118. url = _url_from_selector(url)
  119. elif isinstance(url, parsel.SelectorList):
  120. raise ValueError("SelectorList is not supported")
  121. encoding = self.encoding if encoding is None else encoding
  122. return super(TextResponse, self).follow(url, callback,
  123. method=method,
  124. headers=headers,
  125. body=body,
  126. cookies=cookies,
  127. meta=meta,
  128. encoding=encoding,
  129. priority=priority,
  130. dont_filter=dont_filter,
  131. errback=errback,
  132. cb_kwargs=cb_kwargs,
  133. )
  134. def _url_from_selector(sel):
  135. # type: (parsel.Selector) -> str
  136. if isinstance(sel.root, six.string_types):
  137. # e.g. ::attr(href) result
  138. return strip_html5_whitespace(sel.root)
  139. if not hasattr(sel.root, 'tag'):
  140. raise ValueError("Unsupported selector: %s" % sel)
  141. if sel.root.tag not in ('a', 'link'):
  142. raise ValueError("Only <a> and <link> elements are supported; got <%s>" %
  143. sel.root.tag)
  144. href = sel.root.get('href')
  145. if href is None:
  146. raise ValueError("<%s> element has no href attribute: %s" %
  147. (sel.root.tag, sel))
  148. return strip_html5_whitespace(href)