123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405 |
- """
- XPath selectors based on lxml
- """
- import sys
- import six
- from lxml import etree, html
- from .utils import flatten, iflatten, extract_regex, shorten
- from .csstranslator import HTMLTranslator, GenericTranslator
class CannotRemoveElementWithoutRoot(Exception):
    """Raised by ``Selector.remove()`` when the selected value has no
    ``getparent()`` method (for example a text or attribute result)."""
class CannotRemoveElementWithoutParent(Exception):
    """Raised by ``Selector.remove()`` when the selected node has no parent
    to be removed from (for example the document's root element)."""
class SafeXMLParser(etree.XMLParser):
    """``etree.XMLParser`` whose ``resolve_entities`` option defaults to
    ``False`` instead of the parser's own default.

    All positional and keyword arguments are forwarded unchanged; a caller
    that passes ``resolve_entities`` explicitly keeps its own value.
    """

    def __init__(self, *args, **kwargs):
        # Only fill in the default when the caller did not choose a value.
        if 'resolve_entities' not in kwargs:
            kwargs['resolve_entities'] = False
        super(SafeXMLParser, self).__init__(*args, **kwargs)
# Per-selector-type settings, keyed by type name:
#   _parser          -- parser class handed to ``create_root_node``
#   _csstranslator   -- translator instance used to turn CSS into XPath
#   _tostring_method -- ``method`` argument for ``etree.tostring``
_ctgroup = dict(
    html=dict(
        _parser=html.HTMLParser,
        _csstranslator=HTMLTranslator(),
        _tostring_method='html'
    ),
    xml=dict(
        _parser=SafeXMLParser,
        _csstranslator=GenericTranslator(),
        _tostring_method='xml'
    )
)
def _st(st):
    """Normalize a selector type name.

    ``None`` is mapped to ``'html'``; a value that is a key of ``_ctgroup``
    is returned unchanged; anything else raises ``ValueError``.
    """
    if st is None:
        return 'html'
    if st not in _ctgroup:
        raise ValueError('Invalid type: %s' % st)
    return st
def create_root_node(text, parser_cls, base_url=None):
    """Parse ``text`` with a fresh ``parser_cls`` instance and return the root.

    Surrounding whitespace and NUL characters are dropped from ``text`` and
    the result is encoded as UTF-8 before parsing. The parser is created
    with ``recover=True`` and ``encoding='utf8'``. If nothing is left to
    parse, or the parse yields no root, an empty ``<html/>`` document is
    parsed instead.
    """
    placeholder = b'<html/>'
    cleaned = text.strip().replace('\x00', '')
    body = cleaned.encode('utf8') or placeholder
    parser = parser_cls(recover=True, encoding='utf8')
    root = etree.fromstring(body, parser=parser, base_url=base_url)
    if root is not None:
        return root
    # The parser produced no root at all; fall back to the placeholder.
    return etree.fromstring(placeholder, parser=parser, base_url=base_url)
class SelectorList(list):
    """
    The :class:`SelectorList` class is a subclass of the builtin ``list``
    class, which provides a few additional methods.
    """

    # __getslice__ is deprecated but `list` builtin implements it only in Py2
    def __getslice__(self, i, j):
        sliced = super(SelectorList, self).__getslice__(i, j)
        return self.__class__(sliced)

    def __getitem__(self, pos):
        item = super(SelectorList, self).__getitem__(pos)
        if isinstance(pos, slice):
            # Keep slices typed as the same list class as ``self``.
            return self.__class__(item)
        return item

    def __getstate__(self):
        raise TypeError("can't pickle SelectorList objects")

    def xpath(self, xpath, namespaces=None, **kwargs):
        """
        Call the ``.xpath()`` method for each element in this list and return
        their results flattened as another :class:`SelectorList`.

        ``xpath`` is the same argument as ``query`` in :meth:`Selector.xpath`

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        for additional prefixes to those registered with
        ``register_namespace(prefix, uri)``.
        Contrary to ``register_namespace()``, these prefixes are not
        saved for future calls.

        Any additional named arguments can be used to pass values for XPath
        variables in the XPath expression, e.g.::

            selector.xpath('//a[@href=$url]', url="http://www.example.com")
        """
        per_element = [sel.xpath(xpath, namespaces=namespaces, **kwargs)
                       for sel in self]
        return self.__class__(flatten(per_element))

    def css(self, query):
        """
        Call the ``.css()`` method for each element in this list and return
        their results flattened as another :class:`SelectorList`.

        ``query`` is the same argument as the one in :meth:`Selector.css`
        """
        per_element = [sel.css(query) for sel in self]
        return self.__class__(flatten(per_element))

    def re(self, regex, replace_entities=True):
        """
        Call the ``.re()`` method for each element in this list and return
        their results flattened, as a list of unicode strings.

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        per_element = [sel.re(regex, replace_entities=replace_entities)
                       for sel in self]
        return flatten(per_element)

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Call the ``.re()`` method on the elements of this list, in order,
        and return the first match found as an unicode string. If the list
        is empty or the regex doesn't match anything, return the default
        value (``None`` if the argument is not provided).

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        matches = iflatten(sel.re(regex, replace_entities=replace_entities)
                           for sel in self)
        return next(matches, default)

    def getall(self):
        """
        Call the ``.get()`` method for each element in this list and return
        their results, as a list of unicode strings.
        """
        return [sel.get() for sel in self]
    extract = getall

    def get(self, default=None):
        """
        Return the result of ``.get()`` for the first element in this list.
        If the list is empty, return the default value.
        """
        if not self:
            return default
        return self[0].get()
    extract_first = get

    @property
    def attrib(self):
        """Return the attributes dictionary for the first element.
        If the list is empty, return an empty dict.
        """
        if not self:
            return {}
        return self[0].attrib

    def remove(self):
        """
        Call the ``.remove()`` method for each element in this list, removing
        each matched node from its parent.
        """
        for sel in self:
            sel.remove()
class Selector(object):
    """
    :class:`Selector` allows you to select parts of an XML or HTML text using CSS
    or XPath expressions and extract data from it.

    ``text`` is a ``unicode`` object in Python 2 or a ``str`` object in Python 3

    ``type`` defines the selector type, it can be ``"html"``, ``"xml"`` or ``None`` (default).
    If ``type`` is ``None``, the selector defaults to ``"html"``.

    ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
    merged on top of the default namespaces of the class.

    ``root`` is an already available node to wrap; it is only used when
    ``text`` is ``None``. Either ``text`` or ``root`` must be given.

    ``base_url`` allows setting a URL for the document. This is needed when looking up external entities with relative paths.
    See [`lxml` documentation](https://lxml.de/api/index.html) ``lxml.etree.fromstring`` for more information.
    """

    __slots__ = ['text', 'namespaces', 'type', '_expr', 'root',
                 '__weakref__', '_parser', '_csstranslator', '_tostring_method']

    _default_type = None
    _default_namespaces = {
        "re": "http://exslt.org/regular-expressions",

        # supported in libxslt:
        # set:difference
        # set:has-same-node
        # set:intersection
        # set:leading
        # set:trailing
        "set": "http://exslt.org/sets"
    }
    _lxml_smart_strings = False
    selectorlist_cls = SelectorList

    def __init__(self, text=None, type=None, namespaces=None, root=None,
                 base_url=None, _expr=None):
        self.type = st = _st(type or self._default_type)
        self._parser = _ctgroup[st]['_parser']
        self._csstranslator = _ctgroup[st]['_csstranslator']
        self._tostring_method = _ctgroup[st]['_tostring_method']

        if text is not None:
            if not isinstance(text, six.text_type):
                msg = "text argument should be of type %s, got %s" % (
                    six.text_type, text.__class__)
                raise TypeError(msg)
            # A given ``text`` always wins over a given ``root``.
            root = self._get_root(text, base_url)
        elif root is None:
            raise ValueError("Selector needs either text or root argument")

        # Copy so that per-instance registrations never touch the class dict.
        self.namespaces = dict(self._default_namespaces)
        if namespaces is not None:
            self.namespaces.update(namespaces)
        self.root = root
        self._expr = _expr

    def __getstate__(self):
        raise TypeError("can't pickle Selector objects")

    def _get_root(self, text, base_url=None):
        """Parse ``text`` with this selector's parser and return the root."""
        return create_root_node(text, self._parser, base_url=base_url)

    def xpath(self, query, namespaces=None, **kwargs):
        """
        Find nodes matching the xpath ``query`` and return the result as a
        :class:`SelectorList` instance with all elements flattened. List
        elements implement :class:`Selector` interface too.

        ``query`` is a string containing the XPATH query to apply.

        ``namespaces`` is an optional ``prefix: namespace-uri`` mapping (dict)
        for additional prefixes to those registered with ``register_namespace(prefix, uri)``.
        Contrary to ``register_namespace()``, these prefixes are not
        saved for future calls.

        Any additional named arguments can be used to pass values for XPath
        variables in the XPath expression, e.g.::

            selector.xpath('//a[@href=$url]', url="http://www.example.com")

        Raises ``ValueError`` when the underlying evaluation raises
        ``etree.XPathError``.
        """
        try:
            xpathev = self.root.xpath
        except AttributeError:
            # The wrapped value (e.g. a string result) cannot be queried.
            return self.selectorlist_cls([])

        nsp = dict(self.namespaces)
        if namespaces is not None:
            nsp.update(namespaces)
        try:
            result = xpathev(query, namespaces=nsp,
                             smart_strings=self._lxml_smart_strings,
                             **kwargs)
        except etree.XPathError as exc:
            msg = u"XPath error: %s in %s" % (exc, query)
            msg = msg if six.PY3 else msg.encode('unicode_escape')
            # Re-raise as ValueError while keeping the original traceback.
            six.reraise(ValueError, ValueError(msg), sys.exc_info()[2])

        # Scalar results (string, number, boolean) are wrapped in a list so
        # that every result is handled uniformly below.
        if type(result) is not list:
            result = [result]

        result = [self.__class__(root=x, _expr=query,
                                 namespaces=self.namespaces,
                                 type=self.type)
                  for x in result]
        return self.selectorlist_cls(result)

    def css(self, query):
        """
        Apply the given CSS selector and return a :class:`SelectorList` instance.

        ``query`` is a string containing the CSS selector to apply.

        In the background, CSS queries are translated into XPath queries using
        `cssselect`_ library and run ``.xpath()`` method.

        .. _cssselect: https://pypi.python.org/pypi/cssselect/
        """
        return self.xpath(self._css2xpath(query))

    def _css2xpath(self, query):
        """Translate the CSS ``query`` into an XPath expression."""
        return self._csstranslator.css_to_xpath(query)

    def re(self, regex, replace_entities=True):
        """
        Apply the given regex and return a list of unicode strings with the
        matches.

        ``regex`` can be either a compiled regular expression or a string which
        will be compiled to a regular expression using ``re.compile(regex)``.

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        return extract_regex(regex, self.get(), replace_entities=replace_entities)

    def re_first(self, regex, default=None, replace_entities=True):
        """
        Apply the given regex and return the first unicode string which
        matches. If there is no match, return the default value (``None`` if
        the argument is not provided).

        By default, character entity references are replaced by their
        corresponding character (except for ``&`` and ``<``).
        Passing ``replace_entities`` as ``False`` switches off these
        replacements.
        """
        return next(iflatten(self.re(regex, replace_entities=replace_entities)), default)

    def get(self):
        """
        Serialize and return the matched nodes in a single unicode string.
        Percent encoded content is unquoted.
        """
        try:
            return etree.tostring(self.root,
                                  method=self._tostring_method,
                                  encoding='unicode',
                                  with_tail=False)
        except (AttributeError, TypeError):
            # The root is not a serializable node: booleans are rendered as
            # u'1' / u'0', anything else through its text representation.
            if self.root is True:
                return u'1'
            elif self.root is False:
                return u'0'
            else:
                return six.text_type(self.root)
    extract = get

    def getall(self):
        """
        Serialize and return the matched node in a 1-element list of unicode strings.
        """
        return [self.get()]

    def register_namespace(self, prefix, uri):
        """
        Register the given namespace to be used in this :class:`Selector`.
        Without registering namespaces you can't select or extract data from
        non-standard namespaces. See :ref:`selector-examples-xml`.
        """
        self.namespaces[prefix] = uri

    def remove_namespaces(self):
        """
        Remove all namespaces, allowing to traverse the document using
        namespace-less xpaths. See :ref:`removing-namespaces`.
        """
        for el in self.root.iter('*'):
            if el.tag.startswith('{'):
                el.tag = el.tag.split('}', 1)[1]
            # loop on element attributes also; iterate over a snapshot of the
            # names because the mapping is modified inside the loop
            for an in list(el.attrib.keys()):
                if an.startswith('{'):
                    el.attrib[an.split('}', 1)[1]] = el.attrib.pop(an)
        # remove namespace declarations
        etree.cleanup_namespaces(self.root)

    def remove(self):
        """
        Remove matched nodes from the parent element.

        Text that follows the removed node (its "tail") is not part of the
        match and is kept: it is appended to the previous sibling's tail, or
        to the parent's text when there is no previous sibling.

        Raises :class:`CannotRemoveElementWithoutRoot` when the selected
        value has no ``getparent()`` method, and
        :class:`CannotRemoveElementWithoutParent` when the node has no parent.
        """
        try:
            parent = self.root.getparent()
        except AttributeError:
            # 'str' object has no attribute 'getparent'
            raise CannotRemoveElementWithoutRoot(
                "The node you're trying to remove has no root, "
                "are you trying to remove a pseudo-element? "
                "Try to use 'li' as a selector instead of 'li::text' or "
                "'//li' instead of '//li/text()', for example."
            )

        # Test for the missing parent explicitly rather than catching the
        # AttributeError of ``None.remove``, which would also mislabel any
        # unrelated AttributeError.
        if parent is None:
            raise CannotRemoveElementWithoutParent(
                "The node you're trying to remove has no parent, "
                "are you trying to remove a root element?"
            )

        # ``parent.remove()`` discards the node together with its tail text,
        # so move the tail to a surviving neighbour first.
        tail = getattr(self.root, 'tail', None)
        if tail:
            previous = self.root.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + tail
            else:
                previous.tail = (previous.tail or '') + tail
        parent.remove(self.root)

    @property
    def attrib(self):
        """Return the attributes dictionary for underlying element.
        """
        return dict(self.root.attrib)

    def __bool__(self):
        """
        Return ``True`` if there is any real content selected or ``False``
        otherwise.  In other words, the boolean value of a :class:`Selector` is
        given by the contents it selects.
        """
        return bool(self.get())
    __nonzero__ = __bool__

    def __str__(self):
        data = repr(shorten(self.get(), width=40))
        return "<%s xpath=%r data=%s>" % (type(self).__name__, self._expr, data)
    __repr__ = __str__
|