#-*- coding:utf-8 -*- # # Copyright (C) 2008 - Olivier Lauzanne # # Distributed under the BSD license, see LICENSE.txt from .cssselectpatch import JQueryTranslator from .openers import url_opener from copy import deepcopy from lxml import etree import lxml.html import inspect import types import sys PY3k = sys.version_info >= (3,) if PY3k: from urllib.parse import urlencode from urllib.parse import urljoin basestring = (str, bytes) unicode = str else: from urllib import urlencode # NOQA from urlparse import urljoin # NOQA def func_globals(f): return f.__globals__ if PY3k else f.func_globals def func_code(f): return f.__code__ if PY3k else f.func_code def with_camel_case_alias(func): """decorator for methods who required a camelcase alias""" _camel_case_aliases.add(func.__name__) return func _camel_case_aliases = set() def build_camel_case_aliases(PyQuery): """add camelcase aliases to PyQuery""" for alias in _camel_case_aliases: parts = list(alias.split('_')) name = parts[0] + ''.join([p.title() for p in parts[1:]]) func = getattr(PyQuery, alias) f = types.FunctionType(func_code(func), func_globals(func), name, inspect.getargspec(func).defaults) f.__doc__ = ( 'Alias for :func:`~pyquery.pyquery.PyQuery.%s`') % func.__name__ setattr(PyQuery, name, f.__get__(None, PyQuery)) def fromstring(context, parser=None, custom_parser=None): """use html parser if we don't have clean xml """ if hasattr(context, 'read') and hasattr(context.read, '__call__'): meth = 'parse' else: meth = 'fromstring' if custom_parser is None: if parser is None: try: result = getattr(etree, meth)(context) except etree.XMLSyntaxError: if hasattr(context, 'seek'): context.seek(0) result = getattr(lxml.html, meth)(context) if isinstance(result, etree._ElementTree): return [result.getroot()] else: return [result] elif parser == 'xml': custom_parser = getattr(etree, meth) elif parser == 'html': custom_parser = getattr(lxml.html, meth) elif parser == 'html5': from lxml.html import html5parser custom_parser = getattr(html5parser, meth) elif parser == 'soup': from lxml.html import soupparser custom_parser = getattr(soupparser, meth) elif parser == 'html_fragments': custom_parser = lxml.html.fragments_fromstring else: raise ValueError('No such parser: "%s"' % parser) result = custom_parser(context) if type(result) is list: return result elif isinstance(result, etree._ElementTree): return [result.getroot()] elif result is not None: return [result] else: return [] def callback(func, *args): return func(*args[:func_code(func).co_argcount]) class NoDefault(object): def __repr__(self): """clean representation in Sphinx""" return '' no_default = NoDefault() del NoDefault class FlexibleElement(object): """property to allow a flexible api""" def __init__(self, pget, pset=no_default, pdel=no_default): self.pget = pget self.pset = pset self.pdel = pdel def __get__(self, instance, klass): class _element(object): """real element to support set/get/del attr and item and js call style""" def __call__(prop, *args, **kwargs): return self.pget(instance, *args, **kwargs) __getattr__ = __getitem__ = __setattr__ = __setitem__ = __call__ def __delitem__(prop, name): if self.pdel is not no_default: return self.pdel(instance, name) else: raise NotImplementedError() __delattr__ = __delitem__ def __repr__(prop): return '' % self.pget.__name__ return _element() def __set__(self, instance, value): if self.pset is not no_default: self.pset(instance, value) else: raise NotImplementedError() class PyQuery(list): """The main class """ _translator_class = JQueryTranslator def __init__(self, *args, **kwargs): html = None elements = [] self._base_url = None self.parser = kwargs.pop('parser', None) if (len(args) >= 1 and (not PY3k and isinstance(args[0], basestring) or (PY3k and isinstance(args[0], str))) and args[0].split('://', 1)[0] in ('http', 'https')): kwargs['url'] = args[0] if len(args) >= 2: kwargs['data'] = args[1] args = [] if 'parent' in kwargs: self._parent = kwargs.pop('parent') else: self._parent = no_default if 'css_translator' in kwargs: self._translator = kwargs.pop('css_translator') elif self.parser in ('xml',): self._translator = self._translator_class(xhtml=True) elif self._parent is not no_default: self._translator = self._parent._translator else: self._translator = self._translator_class(xhtml=False) self.namespaces = kwargs.pop('namespaces', None) if kwargs: # specific case to get the dom if 'filename' in kwargs: html = open(kwargs['filename']) elif 'url' in kwargs: url = kwargs.pop('url') if 'opener' in kwargs: opener = kwargs.pop('opener') html = opener(url, **kwargs) else: html = url_opener(url, kwargs) if not self.parser: self.parser = 'html' self._base_url = url else: raise ValueError('Invalid keyword arguments %s' % kwargs) elements = fromstring(html, self.parser) # close open descriptor if possible if hasattr(html, 'close'): try: html.close() except: pass else: # get nodes # determine context and selector if any selector = context = no_default length = len(args) if length == 1: context = args[0] elif length == 2: selector, context = args else: raise ValueError( "You can't do that. Please, provide arguments") # get context if isinstance(context, basestring): try: elements = fromstring(context, self.parser) except Exception: raise elif isinstance(context, self.__class__): # copy elements = context[:] elif isinstance(context, list): elements = context elif isinstance(context, etree._Element): elements = [context] # select nodes if elements and selector is not no_default: xpath = self._css_to_xpath(selector) results = [] for tag in elements: results.extend(tag.xpath(xpath, namespaces=self.namespaces)) elements = results list.__init__(self, elements) def _css_to_xpath(self, selector, prefix='descendant-or-self::'): selector = selector.replace('[@', '[') return self._translator.css_to_xpath(selector, prefix) def _copy(self, *args, **kwargs): kwargs.setdefault('namespaces', self.namespaces) return self.__class__(*args, **kwargs) def __call__(self, *args, **kwargs): """return a new PyQuery instance """ length = len(args) if length == 0: raise ValueError('You must provide at least a selector') if args[0] == '': return self._copy([]) if (len(args) == 1 and (not PY3k and isinstance(args[0], basestring) or (PY3k and isinstance(args[0], str))) and not args[0].startswith('<')): args += (self,) result = self._copy(*args, parent=self, **kwargs) return result # keep original list api prefixed with _ _append = list.append _extend = list.extend # improve pythonic api def __add__(self, other): assert isinstance(other, self.__class__) return self._copy(self[:] + other[:]) def extend(self, other): """Extend with anoter PyQuery object""" assert isinstance(other, self.__class__) self._extend(other[:]) return self def items(self, selector=None): """Iter over elements. Return PyQuery objects: >>> d = PyQuery('
foobar
') >>> [i.text() for i in d.items('span')] ['foo', 'bar'] >>> [i.text() for i in d('span').items()] ['foo', 'bar'] >>> list(d.items('a')) == list(d('a').items()) True """ if selector: elems = self(selector) or [] else: elems = self for elem in elems: yield self._copy(elem, parent=self) def xhtml_to_html(self): """Remove xhtml namespace: >>> doc = PyQuery( ... '') >>> doc [<{http://www.w3.org/1999/xhtml}html>] >>> doc.xhtml_to_html() [] """ try: root = self[0].getroottree() except IndexError: pass else: lxml.html.xhtml_to_html(root) return self def remove_namespaces(self): """Remove all namespaces: >>> doc = PyQuery('') >>> doc [<{http://example.com/foo}foo>] >>> doc.remove_namespaces() [] """ try: root = self[0].getroottree() except IndexError: pass else: for el in root.iter('{*}*'): if el.tag.startswith('{'): el.tag = el.tag.split('}', 1)[1] return self def __str__(self): """xml representation of current nodes:: >>> xml = PyQuery( ... '', parser='html_fragments') >>> print(str(xml)) """ if PY3k: return ''.join([etree.tostring(e, encoding=str) for e in self]) else: return ''.join([etree.tostring(e) for e in self]) def __unicode__(self): """xml representation of current nodes""" return unicode('').join([etree.tostring(e, encoding=unicode) for e in self]) def __html__(self): """html representation of current nodes:: >>> html = PyQuery( ... '', parser='html_fragments') >>> print(html.__html__()) """ return unicode('').join([lxml.html.tostring(e, encoding=unicode) for e in self]) def __repr__(self): r = [] try: for el in self: c = el.get('class') c = c and '.' + '.'.join(c.split(' ')) or '' id = el.get('id') id = id and '#' + id or '' r.append('<%s%s%s>' % (el.tag, id, c)) return '[' + (', '.join(r)) + ']' except AttributeError: if PY3k: return list.__repr__(self) else: for el in self: if isinstance(el, unicode): r.append(el.encode('utf-8')) else: r.append(el) return repr(r) @property def root(self): """return the xml root element """ if self._parent is not no_default: return self._parent.getroottree() return self[0].getroottree() @property def encoding(self): """return the xml encoding of the root element """ root = self.root if root is not None: return self.root.docinfo.encoding ############## # Traversing # ############## def _filter_only(self, selector, elements, reverse=False, unique=False): """Filters the selection set only, as opposed to also including descendants. """ if selector is None: results = elements else: xpath = self._css_to_xpath(selector, 'self::') results = [] for tag in elements: results.extend(tag.xpath(xpath, namespaces=self.namespaces)) if reverse: results.reverse() if unique: result_list = results results = [] for item in result_list: if not item in results: results.append(item) return self._copy(results, parent=self) def parent(self, selector=None): return self._filter_only( selector, [e.getparent() for e in self if e.getparent() is not None], unique=True) def prev(self, selector=None): return self._filter_only( selector, [e.getprevious() for e in self if e.getprevious() is not None]) def next(self, selector=None): return self._filter_only( selector, [e.getnext() for e in self if e.getnext() is not None]) def _traverse(self, method): for e in self: current = getattr(e, method)() while current is not None: yield current current = getattr(current, method)() def _traverse_parent_topdown(self): for e in self: this_list = [] current = e.getparent() while current is not None: this_list.append(current) current = current.getparent() this_list.reverse() for j in this_list: yield j def _next_all(self): return [e for e in self._traverse('getnext')] @with_camel_case_alias def next_all(self, selector=None): """ >>> h = '

Hi

Bye

' >>> d = PyQuery(h) >>> d('p:last').next_all() [] >>> d('p:last').nextAll() [] """ return self._filter_only(selector, self._next_all()) def _prev_all(self): return [e for e in self._traverse('getprevious')] @with_camel_case_alias def prev_all(self, selector=None): """ >>> h = '

Hi

Bye

' >>> d = PyQuery(h) >>> d('p:last').prev_all() [] >>> d('p:last').prevAll() [] """ return self._filter_only(selector, self._prev_all(), reverse=True) def siblings(self, selector=None): """ >>> h = '

Hi

Bye

' >>> d = PyQuery(h) >>> d('.hello').siblings() [

, ] >>> d('.hello').siblings('img') [] """ return self._filter_only(selector, self._prev_all() + self._next_all()) def parents(self, selector=None): """ >>> d = PyQuery('

Hi

Bye

') >>> d('p').parents() [] >>> d('.hello').parents('span') [] >>> d('.hello').parents('p') [] """ return self._filter_only( selector, [e for e in self._traverse_parent_topdown()], unique=True ) def children(self, selector=None): """Filter elements that are direct children of self using optional selector: >>> d = PyQuery('

Hi

Bye

') >>> d [] >>> d.children() [,

] >>> d.children('.hello') [] """ elements = [child for tag in self for child in tag.getchildren()] return self._filter_only(selector, elements) def closest(self, selector=None): """ >>> d = PyQuery( ... '

This is a ' ... 'test

') >>> d('strong').closest('div') [] >>> d('strong').closest('.hello') [] >>> d('strong').closest('form') [] """ result = [] for current in self: while (current is not None and not self._copy(current).is_(selector)): current = current.getparent() if current is not None: result.append(current) return self._copy(result, parent=self) def contents(self): """ Return contents (with text nodes): >>> d = PyQuery('hello bold') >>> d.contents() # doctest: +ELLIPSIS ['hello ', ] """ results = [] for elem in self: results.extend(elem.xpath('child::text()|child::*', namespaces=self.namespaces)) return self._copy(results, parent=self) def filter(self, selector): """Filter elements in self using selector (string or function): >>> d = PyQuery('

Hi

Bye

') >>> d('p') [,

] >>> d('p').filter('.hello') [] >>> d('p').filter(lambda i: i == 1) [

] >>> d('p').filter(lambda i: PyQuery(this).text() == 'Hi') [] >>> d('p').filter(lambda i, this: PyQuery(this).text() == 'Hi') [] """ if not hasattr(selector, '__call__'): return self._filter_only(selector, self) else: elements = [] args = inspect.getargspec(callback).args try: for i, this in enumerate(self): if len(args) == 1: func_globals(selector)['this'] = this if callback(selector, i, this): elements.append(this) finally: f_globals = func_globals(selector) if 'this' in f_globals: del f_globals['this'] return self._copy(elements, parent=self) def not_(self, selector): """Return elements that don't match the given selector: >>> d = PyQuery('

Hi

Bye

') >>> d('p').not_('.hello') [

] """ exclude = set(self._copy(selector, self)) return self._copy([e for e in self if e not in exclude], parent=self) def is_(self, selector): """Returns True if selector matches at least one current element, else False: >>> d = PyQuery('

Hi

Bye

') >>> d('p').eq(0).is_('.hello') True >>> d('p').eq(0).is_('span') False >>> d('p').eq(1).is_('.hello') False .. """ return bool(self._filter_only(selector, self)) def find(self, selector): """Find elements using selector traversing down from self: >>> m = '

Whoah!

there

' >>> d = PyQuery(m) >>> d('p').find('em') [, ] >>> d('p').eq(1).find('em') [] """ xpath = self._css_to_xpath(selector) results = [child.xpath(xpath, namespaces=self.namespaces) for tag in self for child in tag.getchildren()] # Flatten the results elements = [] for r in results: elements.extend(r) return self._copy(elements, parent=self) def eq(self, index): """Return PyQuery of only the element with the provided index:: >>> d = PyQuery('

Hi

Bye

') >>> d('p').eq(0) [] >>> d('p').eq(1) [

] >>> d('p').eq(2) [] .. """ # Slicing will return empty list when index=-1 # we should handle out of bound by ourselves try: items = self[index] except IndexError: items = [] return self._copy(items, parent=self) def each(self, func): """apply func on each nodes """ try: for i, element in enumerate(self): func_globals(func)['this'] = element if callback(func, i, element) is False: break finally: f_globals = func_globals(func) if 'this' in f_globals: del f_globals['this'] return self def map(self, func): """Returns a new PyQuery after transforming current items with func. func should take two arguments - 'index' and 'element'. Elements can also be referred to as 'this' inside of func:: >>> d = PyQuery('

Hi there

Bye


') >>> d('p').map(lambda i, e: PyQuery(e).text()) ['Hi there', 'Bye'] >>> d('p').map(lambda i, e: len(PyQuery(this).text())) [8, 3] >>> d('p').map(lambda i, e: PyQuery(this).text().split()) ['Hi', 'there', 'Bye'] """ items = [] try: for i, element in enumerate(self): func_globals(func)['this'] = element result = callback(func, i, element) if result is not None: if not isinstance(result, list): items.append(result) else: items.extend(result) finally: f_globals = func_globals(func) if 'this' in f_globals: del f_globals['this'] return self._copy(items, parent=self) @property def length(self): return len(self) def size(self): return len(self) def end(self): """Break out of a level of traversal and return to the parent level. >>> m = '

Whoah!

there

' >>> d = PyQuery(m) >>> d('p').eq(1).find('em').end().end() [

,

] """ return self._parent ############## # Attributes # ############## def attr(self, *args, **kwargs): """Attributes manipulation """ mapping = {'class_': 'class', 'for_': 'for'} attr = value = no_default length = len(args) if length == 1: attr = args[0] attr = mapping.get(attr, attr) elif length == 2: attr, value = args attr = mapping.get(attr, attr) elif kwargs: attr = {} for k, v in kwargs.items(): attr[mapping.get(k, k)] = v else: raise ValueError('Invalid arguments %s %s' % (args, kwargs)) if not self: return None elif isinstance(attr, dict): for tag in self: for key, value in attr.items(): tag.set(key, value) elif value is no_default: return self[0].get(attr) elif value is None: return self.remove_attr(attr) else: for tag in self: tag.set(attr, value) return self @with_camel_case_alias def remove_attr(self, name): """Remove an attribute:: >>> d = PyQuery('

') >>> d.remove_attr('id') [
] >>> d.removeAttr('id') [
] .. """ for tag in self: try: del tag.attrib[name] except KeyError: pass return self attr = FlexibleElement(pget=attr, pdel=remove_attr) ####### # CSS # ####### def height(self, value=no_default): """set/get height of element """ return self.attr('height', value) def width(self, value=no_default): """set/get width of element """ return self.attr('width', value) @with_camel_case_alias def has_class(self, name): """Return True if element has class:: >>> d = PyQuery('
') >>> d.has_class('myclass') True >>> d.hasClass('myclass') True .. """ return self.is_('.%s' % name) @with_camel_case_alias def add_class(self, value): """Add a css class to elements:: >>> d = PyQuery('
') >>> d.add_class('myclass') [] >>> d.addClass('myclass') [] .. """ for tag in self: values = value.split(' ') classes = (tag.get('class') or '').split() classes += [v for v in values if v not in classes] tag.set('class', ' '.join(classes)) return self @with_camel_case_alias def remove_class(self, value): """Remove a css class to elements:: >>> d = PyQuery('
') >>> d.remove_class('myclass') [
] >>> d.removeClass('myclass') [
] .. """ for tag in self: values = value.split(' ') classes = set((tag.get('class') or '').split()) classes.difference_update(values) classes.difference_update(['']) classes = ' '.join(classes) if classes.strip(): tag.set('class', classes) elif tag.get('class'): tag.set('class', classes) return self @with_camel_case_alias def toggle_class(self, value): """Toggle a css class to elements >>> d = PyQuery('
') >>> d.toggle_class('myclass') [] >>> d.toggleClass('myclass') [
] """ for tag in self: values = value.split(' ') classes = (tag.get('class') or '').split() values_to_add = [v for v in values if v not in classes] values_to_del = [v for v in values if v in classes] classes = [v for v in classes if v not in values_to_del] classes += values_to_add tag.set('class', ' '.join(classes)) return self def css(self, *args, **kwargs): """css attributes manipulation """ attr = value = no_default length = len(args) if length == 1: attr = args[0] elif length == 2: attr, value = args elif kwargs: attr = kwargs else: raise ValueError('Invalid arguments %s %s' % (args, kwargs)) if isinstance(attr, dict): for tag in self: stripped_keys = [key.strip().replace('_', '-') for key in attr.keys()] current = [el.strip() for el in (tag.get('style') or '').split(';') if el.strip() and not el.split(':')[0].strip() in stripped_keys] for key, value in attr.items(): key = key.replace('_', '-') current.append('%s: %s' % (key, value)) tag.set('style', '; '.join(current)) elif isinstance(value, basestring): attr = attr.replace('_', '-') for tag in self: current = [ el.strip() for el in (tag.get('style') or '').split(';') if (el.strip() and not el.split(':')[0].strip() == attr.strip())] current.append('%s: %s' % (attr, value)) tag.set('style', '; '.join(current)) return self css = FlexibleElement(pget=css, pset=css) ################### # CORE UI EFFECTS # ################### def hide(self): """remove display:none to elements style >>> print(PyQuery('
').hide())
""" return self.css('display', 'none') def show(self): """add display:block to elements style >>> print(PyQuery('
').show())
""" return self.css('display', 'block') ######## # HTML # ######## def val(self, value=no_default): """Set the attribute value:: >>> d = PyQuery('') >>> d.val('Youhou') [] Get the attribute value:: >>> d.val() 'Youhou' """ def _get_value(tag): #