# unified.py
"""
XPath selectors based on lxml
"""
  4. import warnings
  5. from parsel import Selector as _ParselSelector
  6. from scrapy.utils.trackref import object_ref
  7. from scrapy.utils.python import to_bytes
  8. from scrapy.http import HtmlResponse, XmlResponse
  9. from scrapy.utils.decorators import deprecated
  10. __all__ = ['Selector', 'SelectorList']
  11. def _st(response, st):
  12. if st is None:
  13. return 'xml' if isinstance(response, XmlResponse) else 'html'
  14. return st
  15. def _response_from_text(text, st):
  16. rt = XmlResponse if st == 'xml' else HtmlResponse
  17. return rt(url='about:blank', encoding='utf-8',
  18. body=to_bytes(text, 'utf-8'))
  19. class SelectorList(_ParselSelector.selectorlist_cls, object_ref):
  20. """
  21. The :class:`SelectorList` class is a subclass of the builtin ``list``
  22. class, which provides a few additional methods.
  23. """
  24. class Selector(_ParselSelector, object_ref):
  25. """
  26. An instance of :class:`Selector` is a wrapper over response to select
  27. certain parts of its content.
  28. ``response`` is an :class:`~scrapy.http.HtmlResponse` or an
  29. :class:`~scrapy.http.XmlResponse` object that will be used for selecting
  30. and extracting data.
  31. ``text`` is a unicode string or utf-8 encoded text for cases when a
  32. ``response`` isn't available. Using ``text`` and ``response`` together is
  33. undefined behavior.
  34. ``type`` defines the selector type, it can be ``"html"``, ``"xml"``
  35. or ``None`` (default).
  36. If ``type`` is ``None``, the selector automatically chooses the best type
  37. based on ``response`` type (see below), or defaults to ``"html"`` in case it
  38. is used together with ``text``.
  39. If ``type`` is ``None`` and a ``response`` is passed, the selector type is
  40. inferred from the response type as follows:
  41. * ``"html"`` for :class:`~scrapy.http.HtmlResponse` type
  42. * ``"xml"`` for :class:`~scrapy.http.XmlResponse` type
  43. * ``"html"`` for anything else
  44. Otherwise, if ``type`` is set, the selector type will be forced and no
  45. detection will occur.
  46. """
  47. __slots__ = ['response']
  48. selectorlist_cls = SelectorList
  49. def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
  50. if not(response is None or text is None):
  51. raise ValueError('%s.__init__() received both response and text'
  52. % self.__class__.__name__)
  53. st = _st(response, type or self._default_type)
  54. if text is not None:
  55. response = _response_from_text(text, st)
  56. if response is not None:
  57. text = response.text
  58. kwargs.setdefault('base_url', response.url)
  59. self.response = response
  60. super(Selector, self).__init__(text=text, type=st, root=root, **kwargs)