pyquery.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499
  1. #-*- coding:utf-8 -*-
  2. #
  3. # Copyright (C) 2008 - Olivier Lauzanne <olauzanne@gmail.com>
  4. #
  5. # Distributed under the BSD license, see LICENSE.txt
  6. from .cssselectpatch import JQueryTranslator
  7. from .openers import url_opener
  8. from copy import deepcopy
  9. from lxml import etree
  10. import lxml.html
  11. import inspect
  12. import types
  13. import sys
  14. PY3k = sys.version_info >= (3,)
  15. if PY3k:
  16. from urllib.parse import urlencode
  17. from urllib.parse import urljoin
  18. basestring = (str, bytes)
  19. unicode = str
  20. else:
  21. from urllib import urlencode # NOQA
  22. from urlparse import urljoin # NOQA
  23. def func_globals(f):
  24. return f.__globals__ if PY3k else f.func_globals
  25. def func_code(f):
  26. return f.__code__ if PY3k else f.func_code
  27. def with_camel_case_alias(func):
  28. """decorator for methods who required a camelcase alias"""
  29. _camel_case_aliases.add(func.__name__)
  30. return func
  31. _camel_case_aliases = set()
  32. def build_camel_case_aliases(PyQuery):
  33. """add camelcase aliases to PyQuery"""
  34. for alias in _camel_case_aliases:
  35. parts = list(alias.split('_'))
  36. name = parts[0] + ''.join([p.title() for p in parts[1:]])
  37. func = getattr(PyQuery, alias)
  38. f = types.FunctionType(func_code(func), func_globals(func),
  39. name, inspect.getargspec(func).defaults)
  40. f.__doc__ = (
  41. 'Alias for :func:`~pyquery.pyquery.PyQuery.%s`') % func.__name__
  42. setattr(PyQuery, name, f.__get__(None, PyQuery))
  43. def fromstring(context, parser=None, custom_parser=None):
  44. """use html parser if we don't have clean xml
  45. """
  46. if hasattr(context, 'read') and hasattr(context.read, '__call__'):
  47. meth = 'parse'
  48. else:
  49. meth = 'fromstring'
  50. if custom_parser is None:
  51. if parser is None:
  52. try:
  53. result = getattr(etree, meth)(context)
  54. except etree.XMLSyntaxError:
  55. if hasattr(context, 'seek'):
  56. context.seek(0)
  57. result = getattr(lxml.html, meth)(context)
  58. if isinstance(result, etree._ElementTree):
  59. return [result.getroot()]
  60. else:
  61. return [result]
  62. elif parser == 'xml':
  63. custom_parser = getattr(etree, meth)
  64. elif parser == 'html':
  65. custom_parser = getattr(lxml.html, meth)
  66. elif parser == 'html5':
  67. from lxml.html import html5parser
  68. custom_parser = getattr(html5parser, meth)
  69. elif parser == 'soup':
  70. from lxml.html import soupparser
  71. custom_parser = getattr(soupparser, meth)
  72. elif parser == 'html_fragments':
  73. custom_parser = lxml.html.fragments_fromstring
  74. else:
  75. raise ValueError('No such parser: "%s"' % parser)
  76. result = custom_parser(context)
  77. if type(result) is list:
  78. return result
  79. elif isinstance(result, etree._ElementTree):
  80. return [result.getroot()]
  81. elif result is not None:
  82. return [result]
  83. else:
  84. return []
  85. def callback(func, *args):
  86. return func(*args[:func_code(func).co_argcount])
  87. class NoDefault(object):
  88. def __repr__(self):
  89. """clean representation in Sphinx"""
  90. return '<NoDefault>'
  91. no_default = NoDefault()
  92. del NoDefault
  93. class FlexibleElement(object):
  94. """property to allow a flexible api"""
  95. def __init__(self, pget, pset=no_default, pdel=no_default):
  96. self.pget = pget
  97. self.pset = pset
  98. self.pdel = pdel
  99. def __get__(self, instance, klass):
  100. class _element(object):
  101. """real element to support set/get/del attr and item and js call
  102. style"""
  103. def __call__(prop, *args, **kwargs):
  104. return self.pget(instance, *args, **kwargs)
  105. __getattr__ = __getitem__ = __setattr__ = __setitem__ = __call__
  106. def __delitem__(prop, name):
  107. if self.pdel is not no_default:
  108. return self.pdel(instance, name)
  109. else:
  110. raise NotImplementedError()
  111. __delattr__ = __delitem__
  112. def __repr__(prop):
  113. return '<flexible_element %s>' % self.pget.__name__
  114. return _element()
  115. def __set__(self, instance, value):
  116. if self.pset is not no_default:
  117. self.pset(instance, value)
  118. else:
  119. raise NotImplementedError()
  120. class PyQuery(list):
  121. """The main class
  122. """
  123. _translator_class = JQueryTranslator
  124. def __init__(self, *args, **kwargs):
  125. html = None
  126. elements = []
  127. self._base_url = None
  128. self.parser = kwargs.pop('parser', None)
  129. if (len(args) >= 1 and
  130. (not PY3k and isinstance(args[0], basestring) or
  131. (PY3k and isinstance(args[0], str))) and
  132. args[0].split('://', 1)[0] in ('http', 'https')):
  133. kwargs['url'] = args[0]
  134. if len(args) >= 2:
  135. kwargs['data'] = args[1]
  136. args = []
  137. if 'parent' in kwargs:
  138. self._parent = kwargs.pop('parent')
  139. else:
  140. self._parent = no_default
  141. if 'css_translator' in kwargs:
  142. self._translator = kwargs.pop('css_translator')
  143. elif self.parser in ('xml',):
  144. self._translator = self._translator_class(xhtml=True)
  145. elif self._parent is not no_default:
  146. self._translator = self._parent._translator
  147. else:
  148. self._translator = self._translator_class(xhtml=False)
  149. self.namespaces = kwargs.pop('namespaces', None)
  150. if kwargs:
  151. # specific case to get the dom
  152. if 'filename' in kwargs:
  153. html = open(kwargs['filename'])
  154. elif 'url' in kwargs:
  155. url = kwargs.pop('url')
  156. if 'opener' in kwargs:
  157. opener = kwargs.pop('opener')
  158. html = opener(url, **kwargs)
  159. else:
  160. html = url_opener(url, kwargs)
  161. if not self.parser:
  162. self.parser = 'html'
  163. self._base_url = url
  164. else:
  165. raise ValueError('Invalid keyword arguments %s' % kwargs)
  166. elements = fromstring(html, self.parser)
  167. # close open descriptor if possible
  168. if hasattr(html, 'close'):
  169. try:
  170. html.close()
  171. except:
  172. pass
  173. else:
  174. # get nodes
  175. # determine context and selector if any
  176. selector = context = no_default
  177. length = len(args)
  178. if length == 1:
  179. context = args[0]
  180. elif length == 2:
  181. selector, context = args
  182. else:
  183. raise ValueError(
  184. "You can't do that. Please, provide arguments")
  185. # get context
  186. if isinstance(context, basestring):
  187. try:
  188. elements = fromstring(context, self.parser)
  189. except Exception:
  190. raise
  191. elif isinstance(context, self.__class__):
  192. # copy
  193. elements = context[:]
  194. elif isinstance(context, list):
  195. elements = context
  196. elif isinstance(context, etree._Element):
  197. elements = [context]
  198. # select nodes
  199. if elements and selector is not no_default:
  200. xpath = self._css_to_xpath(selector)
  201. results = []
  202. for tag in elements:
  203. results.extend(tag.xpath(xpath, namespaces=self.namespaces))
  204. elements = results
  205. list.__init__(self, elements)
  206. def _css_to_xpath(self, selector, prefix='descendant-or-self::'):
  207. selector = selector.replace('[@', '[')
  208. return self._translator.css_to_xpath(selector, prefix)
  209. def _copy(self, *args, **kwargs):
  210. kwargs.setdefault('namespaces', self.namespaces)
  211. return self.__class__(*args, **kwargs)
  212. def __call__(self, *args, **kwargs):
  213. """return a new PyQuery instance
  214. """
  215. length = len(args)
  216. if length == 0:
  217. raise ValueError('You must provide at least a selector')
  218. if args[0] == '':
  219. return self._copy([])
  220. if (len(args) == 1 and
  221. (not PY3k and isinstance(args[0], basestring) or
  222. (PY3k and isinstance(args[0], str))) and
  223. not args[0].startswith('<')):
  224. args += (self,)
  225. result = self._copy(*args, parent=self, **kwargs)
  226. return result
  227. # keep original list api prefixed with _
  228. _append = list.append
  229. _extend = list.extend
  230. # improve pythonic api
  231. def __add__(self, other):
  232. assert isinstance(other, self.__class__)
  233. return self._copy(self[:] + other[:])
  234. def extend(self, other):
  235. """Extend with anoter PyQuery object"""
  236. assert isinstance(other, self.__class__)
  237. self._extend(other[:])
  238. return self
  239. def items(self, selector=None):
  240. """Iter over elements. Return PyQuery objects:
  241. >>> d = PyQuery('<div><span>foo</span><span>bar</span></div>')
  242. >>> [i.text() for i in d.items('span')]
  243. ['foo', 'bar']
  244. >>> [i.text() for i in d('span').items()]
  245. ['foo', 'bar']
  246. >>> list(d.items('a')) == list(d('a').items())
  247. True
  248. """
  249. if selector:
  250. elems = self(selector) or []
  251. else:
  252. elems = self
  253. for elem in elems:
  254. yield self._copy(elem, parent=self)
  255. def xhtml_to_html(self):
  256. """Remove xhtml namespace:
  257. >>> doc = PyQuery(
  258. ... '<html xmlns="http://www.w3.org/1999/xhtml"></html>')
  259. >>> doc
  260. [<{http://www.w3.org/1999/xhtml}html>]
  261. >>> doc.xhtml_to_html()
  262. [<html>]
  263. """
  264. try:
  265. root = self[0].getroottree()
  266. except IndexError:
  267. pass
  268. else:
  269. lxml.html.xhtml_to_html(root)
  270. return self
  271. def remove_namespaces(self):
  272. """Remove all namespaces:
  273. >>> doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
  274. >>> doc
  275. [<{http://example.com/foo}foo>]
  276. >>> doc.remove_namespaces()
  277. [<foo>]
  278. """
  279. try:
  280. root = self[0].getroottree()
  281. except IndexError:
  282. pass
  283. else:
  284. for el in root.iter('{*}*'):
  285. if el.tag.startswith('{'):
  286. el.tag = el.tag.split('}', 1)[1]
  287. return self
  288. def __str__(self):
  289. """xml representation of current nodes::
  290. >>> xml = PyQuery(
  291. ... '<script><![[CDATA[ ]></script>', parser='html_fragments')
  292. >>> print(str(xml))
  293. <script>&lt;![[CDATA[ ]&gt;</script>
  294. """
  295. if PY3k:
  296. return ''.join([etree.tostring(e, encoding=str) for e in self])
  297. else:
  298. return ''.join([etree.tostring(e) for e in self])
  299. def __unicode__(self):
  300. """xml representation of current nodes"""
  301. return unicode('').join([etree.tostring(e, encoding=unicode)
  302. for e in self])
  303. def __html__(self):
  304. """html representation of current nodes::
  305. >>> html = PyQuery(
  306. ... '<script><![[CDATA[ ]></script>', parser='html_fragments')
  307. >>> print(html.__html__())
  308. <script><![[CDATA[ ]></script>
  309. """
  310. return unicode('').join([lxml.html.tostring(e, encoding=unicode)
  311. for e in self])
  312. def __repr__(self):
  313. r = []
  314. try:
  315. for el in self:
  316. c = el.get('class')
  317. c = c and '.' + '.'.join(c.split(' ')) or ''
  318. id = el.get('id')
  319. id = id and '#' + id or ''
  320. r.append('<%s%s%s>' % (el.tag, id, c))
  321. return '[' + (', '.join(r)) + ']'
  322. except AttributeError:
  323. if PY3k:
  324. return list.__repr__(self)
  325. else:
  326. for el in self:
  327. if isinstance(el, unicode):
  328. r.append(el.encode('utf-8'))
  329. else:
  330. r.append(el)
  331. return repr(r)
  332. @property
  333. def root(self):
  334. """return the xml root element
  335. """
  336. if self._parent is not no_default:
  337. return self._parent.getroottree()
  338. return self[0].getroottree()
  339. @property
  340. def encoding(self):
  341. """return the xml encoding of the root element
  342. """
  343. root = self.root
  344. if root is not None:
  345. return self.root.docinfo.encoding
  346. ##############
  347. # Traversing #
  348. ##############
  349. def _filter_only(self, selector, elements, reverse=False, unique=False):
  350. """Filters the selection set only, as opposed to also including
  351. descendants.
  352. """
  353. if selector is None:
  354. results = elements
  355. else:
  356. xpath = self._css_to_xpath(selector, 'self::')
  357. results = []
  358. for tag in elements:
  359. results.extend(tag.xpath(xpath, namespaces=self.namespaces))
  360. if reverse:
  361. results.reverse()
  362. if unique:
  363. result_list = results
  364. results = []
  365. for item in result_list:
  366. if not item in results:
  367. results.append(item)
  368. return self._copy(results, parent=self)
  369. def parent(self, selector=None):
  370. return self._filter_only(
  371. selector,
  372. [e.getparent() for e in self if e.getparent() is not None],
  373. unique=True)
  374. def prev(self, selector=None):
  375. return self._filter_only(
  376. selector,
  377. [e.getprevious() for e in self if e.getprevious() is not None])
  378. def next(self, selector=None):
  379. return self._filter_only(
  380. selector,
  381. [e.getnext() for e in self if e.getnext() is not None])
  382. def _traverse(self, method):
  383. for e in self:
  384. current = getattr(e, method)()
  385. while current is not None:
  386. yield current
  387. current = getattr(current, method)()
  388. def _traverse_parent_topdown(self):
  389. for e in self:
  390. this_list = []
  391. current = e.getparent()
  392. while current is not None:
  393. this_list.append(current)
  394. current = current.getparent()
  395. this_list.reverse()
  396. for j in this_list:
  397. yield j
  398. def _next_all(self):
  399. return [e for e in self._traverse('getnext')]
  400. @with_camel_case_alias
  401. def next_all(self, selector=None):
  402. """
  403. >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
  404. >>> d = PyQuery(h)
  405. >>> d('p:last').next_all()
  406. [<img>]
  407. >>> d('p:last').nextAll()
  408. [<img>]
  409. """
  410. return self._filter_only(selector, self._next_all())
  411. def _prev_all(self):
  412. return [e for e in self._traverse('getprevious')]
  413. @with_camel_case_alias
  414. def prev_all(self, selector=None):
  415. """
  416. >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
  417. >>> d = PyQuery(h)
  418. >>> d('p:last').prev_all()
  419. [<p.hello>]
  420. >>> d('p:last').prevAll()
  421. [<p.hello>]
  422. """
  423. return self._filter_only(selector, self._prev_all(), reverse=True)
  424. def siblings(self, selector=None):
  425. """
  426. >>> h = '<span><p class="hello">Hi</p><p>Bye</p><img scr=""/></span>'
  427. >>> d = PyQuery(h)
  428. >>> d('.hello').siblings()
  429. [<p>, <img>]
  430. >>> d('.hello').siblings('img')
  431. [<img>]
  432. """
  433. return self._filter_only(selector, self._prev_all() + self._next_all())
  434. def parents(self, selector=None):
  435. """
  436. >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
  437. >>> d('p').parents()
  438. [<span>]
  439. >>> d('.hello').parents('span')
  440. [<span>]
  441. >>> d('.hello').parents('p')
  442. []
  443. """
  444. return self._filter_only(
  445. selector,
  446. [e for e in self._traverse_parent_topdown()],
  447. unique=True
  448. )
  449. def children(self, selector=None):
  450. """Filter elements that are direct children of self using optional
  451. selector:
  452. >>> d = PyQuery('<span><p class="hello">Hi</p><p>Bye</p></span>')
  453. >>> d
  454. [<span>]
  455. >>> d.children()
  456. [<p.hello>, <p>]
  457. >>> d.children('.hello')
  458. [<p.hello>]
  459. """
  460. elements = [child for tag in self for child in tag.getchildren()]
  461. return self._filter_only(selector, elements)
  462. def closest(self, selector=None):
  463. """
  464. >>> d = PyQuery(
  465. ... '<div class="hello"><p>This is a '
  466. ... '<strong class="hello">test</strong></p></div>')
  467. >>> d('strong').closest('div')
  468. [<div.hello>]
  469. >>> d('strong').closest('.hello')
  470. [<strong.hello>]
  471. >>> d('strong').closest('form')
  472. []
  473. """
  474. result = []
  475. for current in self:
  476. while (current is not None and
  477. not self._copy(current).is_(selector)):
  478. current = current.getparent()
  479. if current is not None:
  480. result.append(current)
  481. return self._copy(result, parent=self)
  482. def contents(self):
  483. """
  484. Return contents (with text nodes):
  485. >>> d = PyQuery('hello <b>bold</b>')
  486. >>> d.contents() # doctest: +ELLIPSIS
  487. ['hello ', <Element b at ...>]
  488. """
  489. results = []
  490. for elem in self:
  491. results.extend(elem.xpath('child::text()|child::*', namespaces=self.namespaces))
  492. return self._copy(results, parent=self)
  493. def filter(self, selector):
  494. """Filter elements in self using selector (string or function):
  495. >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p>')
  496. >>> d('p')
  497. [<p.hello>, <p>]
  498. >>> d('p').filter('.hello')
  499. [<p.hello>]
  500. >>> d('p').filter(lambda i: i == 1)
  501. [<p>]
  502. >>> d('p').filter(lambda i: PyQuery(this).text() == 'Hi')
  503. [<p.hello>]
  504. >>> d('p').filter(lambda i, this: PyQuery(this).text() == 'Hi')
  505. [<p.hello>]
  506. """
  507. if not hasattr(selector, '__call__'):
  508. return self._filter_only(selector, self)
  509. else:
  510. elements = []
  511. args = inspect.getargspec(callback).args
  512. try:
  513. for i, this in enumerate(self):
  514. if len(args) == 1:
  515. func_globals(selector)['this'] = this
  516. if callback(selector, i, this):
  517. elements.append(this)
  518. finally:
  519. f_globals = func_globals(selector)
  520. if 'this' in f_globals:
  521. del f_globals['this']
  522. return self._copy(elements, parent=self)
  523. def not_(self, selector):
  524. """Return elements that don't match the given selector:
  525. >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
  526. >>> d('p').not_('.hello')
  527. [<p>]
  528. """
  529. exclude = set(self._copy(selector, self))
  530. return self._copy([e for e in self if e not in exclude],
  531. parent=self)
  532. def is_(self, selector):
  533. """Returns True if selector matches at least one current element, else
  534. False:
  535. >>> d = PyQuery('<p class="hello"><span>Hi</span></p><p>Bye</p>')
  536. >>> d('p').eq(0).is_('.hello')
  537. True
  538. >>> d('p').eq(0).is_('span')
  539. False
  540. >>> d('p').eq(1).is_('.hello')
  541. False
  542. ..
  543. """
  544. return bool(self._filter_only(selector, self))
  545. def find(self, selector):
  546. """Find elements using selector traversing down from self:
  547. >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
  548. >>> d = PyQuery(m)
  549. >>> d('p').find('em')
  550. [<em>, <em>]
  551. >>> d('p').eq(1).find('em')
  552. [<em>]
  553. """
  554. xpath = self._css_to_xpath(selector)
  555. results = [child.xpath(xpath, namespaces=self.namespaces) for tag in self
  556. for child in tag.getchildren()]
  557. # Flatten the results
  558. elements = []
  559. for r in results:
  560. elements.extend(r)
  561. return self._copy(elements, parent=self)
  562. def eq(self, index):
  563. """Return PyQuery of only the element with the provided index::
  564. >>> d = PyQuery('<p class="hello">Hi</p><p>Bye</p><div></div>')
  565. >>> d('p').eq(0)
  566. [<p.hello>]
  567. >>> d('p').eq(1)
  568. [<p>]
  569. >>> d('p').eq(2)
  570. []
  571. ..
  572. """
  573. # Slicing will return empty list when index=-1
  574. # we should handle out of bound by ourselves
  575. try:
  576. items = self[index]
  577. except IndexError:
  578. items = []
  579. return self._copy(items, parent=self)
  580. def each(self, func):
  581. """apply func on each nodes
  582. """
  583. try:
  584. for i, element in enumerate(self):
  585. func_globals(func)['this'] = element
  586. if callback(func, i, element) is False:
  587. break
  588. finally:
  589. f_globals = func_globals(func)
  590. if 'this' in f_globals:
  591. del f_globals['this']
  592. return self
  593. def map(self, func):
  594. """Returns a new PyQuery after transforming current items with func.
  595. func should take two arguments - 'index' and 'element'. Elements can
  596. also be referred to as 'this' inside of func::
  597. >>> d = PyQuery('<p class="hello">Hi there</p><p>Bye</p><br />')
  598. >>> d('p').map(lambda i, e: PyQuery(e).text())
  599. ['Hi there', 'Bye']
  600. >>> d('p').map(lambda i, e: len(PyQuery(this).text()))
  601. [8, 3]
  602. >>> d('p').map(lambda i, e: PyQuery(this).text().split())
  603. ['Hi', 'there', 'Bye']
  604. """
  605. items = []
  606. try:
  607. for i, element in enumerate(self):
  608. func_globals(func)['this'] = element
  609. result = callback(func, i, element)
  610. if result is not None:
  611. if not isinstance(result, list):
  612. items.append(result)
  613. else:
  614. items.extend(result)
  615. finally:
  616. f_globals = func_globals(func)
  617. if 'this' in f_globals:
  618. del f_globals['this']
  619. return self._copy(items, parent=self)
  620. @property
  621. def length(self):
  622. return len(self)
  623. def size(self):
  624. return len(self)
  625. def end(self):
  626. """Break out of a level of traversal and return to the parent level.
  627. >>> m = '<p><span><em>Whoah!</em></span></p><p><em> there</em></p>'
  628. >>> d = PyQuery(m)
  629. >>> d('p').eq(1).find('em').end().end()
  630. [<p>, <p>]
  631. """
  632. return self._parent
  633. ##############
  634. # Attributes #
  635. ##############
  636. def attr(self, *args, **kwargs):
  637. """Attributes manipulation
  638. """
  639. mapping = {'class_': 'class', 'for_': 'for'}
  640. attr = value = no_default
  641. length = len(args)
  642. if length == 1:
  643. attr = args[0]
  644. attr = mapping.get(attr, attr)
  645. elif length == 2:
  646. attr, value = args
  647. attr = mapping.get(attr, attr)
  648. elif kwargs:
  649. attr = {}
  650. for k, v in kwargs.items():
  651. attr[mapping.get(k, k)] = v
  652. else:
  653. raise ValueError('Invalid arguments %s %s' % (args, kwargs))
  654. if not self:
  655. return None
  656. elif isinstance(attr, dict):
  657. for tag in self:
  658. for key, value in attr.items():
  659. tag.set(key, value)
  660. elif value is no_default:
  661. return self[0].get(attr)
  662. elif value is None:
  663. return self.remove_attr(attr)
  664. else:
  665. for tag in self:
  666. tag.set(attr, value)
  667. return self
  668. @with_camel_case_alias
  669. def remove_attr(self, name):
  670. """Remove an attribute::
  671. >>> d = PyQuery('<div id="myid"></div>')
  672. >>> d.remove_attr('id')
  673. [<div>]
  674. >>> d.removeAttr('id')
  675. [<div>]
  676. ..
  677. """
  678. for tag in self:
  679. try:
  680. del tag.attrib[name]
  681. except KeyError:
  682. pass
  683. return self
  684. attr = FlexibleElement(pget=attr, pdel=remove_attr)
  685. #######
  686. # CSS #
  687. #######
  688. def height(self, value=no_default):
  689. """set/get height of element
  690. """
  691. return self.attr('height', value)
  692. def width(self, value=no_default):
  693. """set/get width of element
  694. """
  695. return self.attr('width', value)
  696. @with_camel_case_alias
  697. def has_class(self, name):
  698. """Return True if element has class::
  699. >>> d = PyQuery('<div class="myclass"></div>')
  700. >>> d.has_class('myclass')
  701. True
  702. >>> d.hasClass('myclass')
  703. True
  704. ..
  705. """
  706. return self.is_('.%s' % name)
  707. @with_camel_case_alias
  708. def add_class(self, value):
  709. """Add a css class to elements::
  710. >>> d = PyQuery('<div></div>')
  711. >>> d.add_class('myclass')
  712. [<div.myclass>]
  713. >>> d.addClass('myclass')
  714. [<div.myclass>]
  715. ..
  716. """
  717. for tag in self:
  718. values = value.split(' ')
  719. classes = (tag.get('class') or '').split()
  720. classes += [v for v in values if v not in classes]
  721. tag.set('class', ' '.join(classes))
  722. return self
  723. @with_camel_case_alias
  724. def remove_class(self, value):
  725. """Remove a css class to elements::
  726. >>> d = PyQuery('<div class="myclass"></div>')
  727. >>> d.remove_class('myclass')
  728. [<div>]
  729. >>> d.removeClass('myclass')
  730. [<div>]
  731. ..
  732. """
  733. for tag in self:
  734. values = value.split(' ')
  735. classes = set((tag.get('class') or '').split())
  736. classes.difference_update(values)
  737. classes.difference_update([''])
  738. classes = ' '.join(classes)
  739. if classes.strip():
  740. tag.set('class', classes)
  741. elif tag.get('class'):
  742. tag.set('class', classes)
  743. return self
  744. @with_camel_case_alias
  745. def toggle_class(self, value):
  746. """Toggle a css class to elements
  747. >>> d = PyQuery('<div></div>')
  748. >>> d.toggle_class('myclass')
  749. [<div.myclass>]
  750. >>> d.toggleClass('myclass')
  751. [<div>]
  752. """
  753. for tag in self:
  754. values = value.split(' ')
  755. classes = (tag.get('class') or '').split()
  756. values_to_add = [v for v in values if v not in classes]
  757. values_to_del = [v for v in values if v in classes]
  758. classes = [v for v in classes if v not in values_to_del]
  759. classes += values_to_add
  760. tag.set('class', ' '.join(classes))
  761. return self
  762. def css(self, *args, **kwargs):
  763. """css attributes manipulation
  764. """
  765. attr = value = no_default
  766. length = len(args)
  767. if length == 1:
  768. attr = args[0]
  769. elif length == 2:
  770. attr, value = args
  771. elif kwargs:
  772. attr = kwargs
  773. else:
  774. raise ValueError('Invalid arguments %s %s' % (args, kwargs))
  775. if isinstance(attr, dict):
  776. for tag in self:
  777. stripped_keys = [key.strip().replace('_', '-')
  778. for key in attr.keys()]
  779. current = [el.strip()
  780. for el in (tag.get('style') or '').split(';')
  781. if el.strip()
  782. and not el.split(':')[0].strip() in stripped_keys]
  783. for key, value in attr.items():
  784. key = key.replace('_', '-')
  785. current.append('%s: %s' % (key, value))
  786. tag.set('style', '; '.join(current))
  787. elif isinstance(value, basestring):
  788. attr = attr.replace('_', '-')
  789. for tag in self:
  790. current = [
  791. el.strip()
  792. for el in (tag.get('style') or '').split(';')
  793. if (el.strip() and
  794. not el.split(':')[0].strip() == attr.strip())]
  795. current.append('%s: %s' % (attr, value))
  796. tag.set('style', '; '.join(current))
  797. return self
  798. css = FlexibleElement(pget=css, pset=css)
  799. ###################
  800. # CORE UI EFFECTS #
  801. ###################
  802. def hide(self):
  803. """remove display:none to elements style
  804. >>> print(PyQuery('<div style="display:none;"/>').hide())
  805. <div style="display: none"/>
  806. """
  807. return self.css('display', 'none')
  808. def show(self):
  809. """add display:block to elements style
  810. >>> print(PyQuery('<div />').show())
  811. <div style="display: block"/>
  812. """
  813. return self.css('display', 'block')
  814. ########
  815. # HTML #
  816. ########
  def val(self, value=no_default):
      """Get or set the form value of the selected elements.

      Set the attribute value::

          >>> d = PyQuery('<input />')
          >>> d.val('Youhou')
          [<input>]

      Get the attribute value::

          >>> d.val()
          'Youhou'

      Without an argument the value of the first selected element is
      returned (``None`` when the selection is empty).  With an argument
      every selected element is updated and ``self`` is returned.
      """
      def _get_value(tag):
          node = self._copy(tag)
          # <textarea>: the value is the text content.
          if tag.tag == 'textarea':
              return node.text()
          # <select>: value of the last selected option, falling back to
          # the value of the first option.
          if tag.tag == 'select':
              selected_option = node('option[selected]:last')
              if selected_option:
                  return selected_option.attr('value')
              return node('option').attr('value')
          # <input type="checkbox"> or <input type="radio">.  The test is
          # made on the element being read; testing the whole selection
          # would report 'on' for a plain input whenever any other
          # selected element happens to be a checkbox or radio button.
          if node.is_(':checkbox,:radio'):
              checked_value = node.attr('value')
              if checked_value is None:
                  return 'on'
              return checked_value
          # <input> and everything else.
          return node.attr('value') or ''

      def _set_value(pq, value):
          for tag in pq:
              # <textarea>
              if tag.tag == 'textarea':
                  self._copy(tag).text(value)
                  continue
              # <select>: mark the matching option, unmark the others.
              if tag.tag == 'select':
                  def _make_option_selected(_, elem):
                      option = self._copy(elem)
                      if option.attr('value') == value:
                          option.attr('selected', 'selected')
                      else:
                          option.removeAttr('selected')
                  self._copy(tag)('option').each(_make_option_selected)
                  continue
              # <input> and everything else.
              self._copy(tag).attr('value', value)

      if value is no_default:
          if len(self):
              return _get_value(self[0])
      else:
          _set_value(self, value)
          return self
  def html(self, value=no_default, **kwargs):
      """Get or set the html representation of sub nodes.

      Get the text value::

          >>> d = PyQuery('<div><span>toto</span></div>')
          >>> print(d.html())
          <span>toto</span>

      Extra args are passed to ``lxml.etree.tostring``::

          >>> d = PyQuery('<div><span></span></div>')
          >>> print(d.html())
          <span/>
          >>> print(d.html(method='html'))
          <span></span>

      Set the text value::

          >>> d.html('<span>Youhou !</span>')
          [<div>]
          >>> print(d)
          <div><span>Youhou !</span></div>

      The getter works on the first selected element and returns ``None``
      for an empty selection.  The setter accepts a string, an instance of
      the same class or a falsy value (treated as an empty string), applies
      it to every selected element and returns ``self``; any other value
      raises ``ValueError``.
      """
      if value is no_default:
          if not self:
              return None
          tag = self[0]
          children = tag.getchildren()
          if not children:
              return tag.text
          kwargs.setdefault('encoding', unicode)
          serialized = [etree.tostring(child, **kwargs)
                        for child in children]
          return (tag.text or '') + unicode('').join(serialized)

      if isinstance(value, self.__class__):
          new_html = unicode(value)
      elif isinstance(value, basestring):
          new_html = value
      elif not value:
          new_html = ''
      else:
          raise ValueError(type(value))

      for tag in self:
          for child in tag.getchildren():
              tag.remove(child)
          root = fromstring(
              unicode('<root>') + new_html + unicode('</root>'),
              self.parser)[0]
          children = root.getchildren()
          if children:
              tag.extend(children)
          tag.text = root.text
          # tag.tail is deliberately left alone: it is the text that
          # follows the element in its parent, not part of its content,
          # and copying the temporary wrapper's tail over it would erase
          # that text.
      return self
  @with_camel_case_alias
  def outer_html(self):
      """Serialize the first selected element, its own tag included::

          >>> d = PyQuery('<div><span class="red">toto</span> rocks</div>')
          >>> print(d('span'))
          <span class="red">toto</span> rocks
          >>> print(d('span').outer_html())
          <span class="red">toto</span>
          >>> print(d('span').outerHtml())
          <span class="red">toto</span>

          >>> S = PyQuery('<p>Only <b>me</b> & myself</p>')
          >>> print(S('b').outer_html())
          <b>me</b>

      ..

      Returns ``None`` for an empty selection.
      """
      if not self:
          return None
      first = self[0]
      if first.tail:
          # Serialize a copy without the trailing text so the document
          # itself is left untouched.
          first = deepcopy(first)
          first.tail = ''
      return etree.tostring(first, encoding=unicode)
  def text(self, value=no_default):
      """Get or set the text representation of sub nodes.

      Get the text value::

          >>> doc = PyQuery('<div><span>toto</span><span>tata</span></div>')
          >>> print(doc.text())
          toto tata

      Set the text value::

          >>> doc.text('Youhou !')
          [<div>]
          >>> print(doc)
          <div>Youhou !</div>

      The getter joins the stripped, non-empty text fragments of all
      selected elements with single spaces and returns ``''`` for an empty
      selection.  The setter removes the children of every selected
      element, sets its text and returns ``self``.
      """
      if value is no_default:
          if not self:
              return ''
          pieces = []

          def collect(node, skip_tail=False):
              # The .text of a comment node is left out of the result.
              if node.text and not isinstance(node, lxml.etree._Comment):
                  pieces.append(node.text)
              for child in node.getchildren():
                  collect(child)
              # The tail of a selected element itself lies outside it.
              if node.tail and not skip_tail:
                  pieces.append(node.tail)

          for tag in self:
              collect(tag, skip_tail=True)
          stripped = [piece.strip() for piece in pieces]
          return ' '.join([piece for piece in stripped if piece])

      for tag in self:
          for child in tag.getchildren():
              tag.remove(child)
          tag.text = value
      return self
  975. ################
  976. # Manipulating #
  977. ################
  def _get_root(self, value):
      """Turn ``value`` into something the insertion methods can iterate.

      * a string is parsed inside a temporary ``<root>`` wrapper and that
        wrapper element is returned;
      * an lxml element is wrapped with ``self._copy``;
      * a ``PyQuery`` object is returned unchanged.

      Returns ``(root, root_text)`` where ``root_text`` is ``root.text``
      when that attribute is a string and ``''`` otherwise.  Raises
      ``TypeError`` for any other kind of value.
      """
      if isinstance(value, basestring):
          root = fromstring(unicode('<root>') + value + unicode('</root>'),
                            self.parser)[0]
      elif isinstance(value, etree._Element):
          root = self._copy(value)
      elif isinstance(value, PyQuery):
          root = value
      else:
          # Wrap in a 1-tuple so a tuple ``value`` is shown as-is instead
          # of being unpacked as the formatting arguments.
          raise TypeError(
              'Value must be string, PyQuery or Element. Got %r' % (value,))
      if hasattr(root, 'text') and isinstance(root.text, basestring):
          root_text = root.text
      else:
          root_text = ''
      return root, root_text
  def append(self, value):
      """Append ``value`` to the content of every selected element.

      ``value`` may be a string, an lxml element or a ``PyQuery`` object
      (see ``_get_root``).  The first selected element receives the nodes
      themselves; every following element receives deep copies.  Leading
      text of a string value is added after the element's current content.

      Returns ``self``.
      """
      root, root_text = self._get_root(value)
      # Take the nodes out of ``root`` up front.  Inserting them moves them
      # out of a parsed wrapper element, so reading ``root`` again after
      # the first insertion would find nothing left to copy.
      nodes = list(root)
      for i, tag in enumerate(self):
          if len(tag) > 0:  # if the tag has children
              last_child = tag[-1]
              last_child.tail = (last_child.tail or '') + root_text
          else:
              tag.text = (tag.text or '') + root_text
          if i > 0:
              nodes = deepcopy(nodes)
          tag.extend(nodes)
      return self
  @with_camel_case_alias
  def append_to(self, value):
      """Append the selected nodes to ``value`` by calling
      ``value.append(self)``; returns ``self``.
      """
      target = value
      target.append(self)
      return self
  def prepend(self, value):
      """Insert ``value`` at the start of every selected element.

      ``value`` may be a string, an lxml element or a ``PyQuery`` object
      (see ``_get_root``).  The first selected element receives the nodes
      themselves; every following element receives deep copies.  The
      element's existing leading text becomes the tail of the last
      inserted node, so it still follows the new content.

      Returns ``self``.
      """
      root, root_text = self._get_root(value)
      # Take the nodes out of ``root`` up front: inserting them moves them
      # out of a parsed wrapper element, which is empty afterwards.
      nodes = list(root)
      for i, tag in enumerate(self):
          # Copy *before* touching the tail; otherwise the tail would be
          # written onto the nodes already inserted in the previous
          # element and overwrite that element's original text.
          if i > 0:
              nodes = deepcopy(nodes)
          if not tag.text:
              tag.text = ''
          if nodes:
              nodes[-1].tail = tag.text
              tag.text = root_text
          else:
              tag.text = root_text + tag.text
          tag[:0] = nodes
      return self
  @with_camel_case_alias
  def prepend_to(self, value):
      """Prepend the selected nodes to ``value`` by calling
      ``value.prepend(self)``; returns ``self``.
      """
      target = value
      target.prepend(self)
      return self
  def after(self, value):
      """Insert ``value`` after every selected element.

      ``value`` may be a string, an lxml element or a ``PyQuery`` object
      (see ``_get_root``).  The nodes are inserted as following siblings;
      the first selected element receives the nodes themselves and every
      following element receives deep copies.  Leading text of a string
      value is added to the element's tail.

      Returns ``self``.
      """
      root, root_text = self._get_root(value)
      # Keep the nodes in a plain list.  Re-reading them from the parent
      # with ``parent[index:len(root)]`` used the wrong slice end and
      # yielded nothing, so later elements were silently skipped.
      nodes = list(root)
      for i, tag in enumerate(self):
          tag.tail = (tag.tail or '') + root_text
          if i > 0:
              nodes = deepcopy(nodes)
          parent = tag.getparent()
          index = parent.index(tag) + 1
          parent[index:index] = nodes
      return self
  @with_camel_case_alias
  def insert_after(self, value):
      """Insert the selected nodes after ``value`` by calling
      ``value.after(self)``; returns ``self``.
      """
      target = value
      target.after(self)
      return self
  def before(self, value):
      """Insert ``value`` before every selected element.

      ``value`` may be a string, an lxml element or a ``PyQuery`` object
      (see ``_get_root``).  The nodes are inserted as preceding siblings;
      the first selected element receives the nodes themselves and every
      following element receives deep copies.  Leading text of a string
      value is added to the tail of the previous sibling, or to the
      parent's text when there is no previous sibling.

      Returns ``self``.
      """
      root, root_text = self._get_root(value)
      # Keep the nodes in a plain list.  Re-reading them from the parent
      # with ``parent[index:len(root)]`` used the wrong slice end and
      # yielded nothing, so later elements were silently skipped.
      nodes = list(root)
      for i, tag in enumerate(self):
          previous = tag.getprevious()
          parent = tag.getparent()
          if previous is not None:
              previous.tail = (previous.tail or '') + root_text
          else:
              parent.text = (parent.text or '') + root_text
          if i > 0:
              nodes = deepcopy(nodes)
          index = parent.index(tag)
          parent[index:index] = nodes
      return self
  @with_camel_case_alias
  def insert_before(self, value):
      """Insert the selected nodes before ``value`` by calling
      ``value.before(self)``; returns ``self``.
      """
      target = value
      target.before(self)
      return self
  def wrap(self, value):
      """Wrap each selected element in its own copy of the markup given
      as the string ``value``:

          >>> d = PyQuery('<span>youhou</span>')
          >>> d.wrap('<div></div>')
          [<div>]
          >>> print(d)
          <div><span>youhou</span></div>

      Afterwards the selection holds the wrappers; ``self`` is returned.
      """
      assert isinstance(value, basestring)
      template = fromstring(value)[0]
      wrappers = []
      for tag in self:
          wrapper = deepcopy(template)
          # The copy of the element goes into the wrapper itself, or into
          # the wrapper's last child when it has children.
          holder = wrapper[-1] if len(wrapper) else wrapper
          holder.append(deepcopy(tag))
          wrappers.append(wrapper)

          parent = tag.getparent()
          if parent is not None:
              # Put the wrapper right after the element in the tree, then
              # drop the original element.
              tag.addnext(wrapper)
              parent.remove(tag)
      self[:] = wrappers
      return self
  @with_camel_case_alias
  def wrap_all(self, value):
      """Wrap all the elements in the matched set into a single wrapper
      element::

          >>> d = PyQuery('<div><span>Hey</span><span>you !</span></div>')
          >>> print(d('span').wrap_all('<div id="wrapper"></div>'))
          <div id="wrapper"><span>Hey</span><span>you !</span></div>

          >>> d = PyQuery('<div><span>Hey</span><span>you !</span></div>')
          >>> print(d('span').wrapAll('<div id="wrapper"></div>'))
          <div id="wrapper"><span>Hey</span><span>you !</span></div>

      ..

      An empty selection is returned unchanged; otherwise the selection
      becomes the single wrapper and ``self`` is returned.
      """
      if not self:
          return self

      assert isinstance(value, basestring)
      wrapper = deepcopy(fromstring(value)[0])
      # Copies are collected in the wrapper itself, or in its last child
      # when it has children.
      holder = wrapper[-1] if len(wrapper) else wrapper

      parent = self[0].getparent()
      if parent is None:
          parent = no_default

      # Fill the wrapper and check that every element shares the parent of
      # the first one.
      same_parent = True
      for tag in self:
          holder.append(deepcopy(tag))
          if tag.getparent() is not parent:
              same_parent = False

      # Swap the originals for the wrapper only when the selection is the
      # complete child list of that common parent.
      if parent is not no_default and same_parent:
          siblings = list(parent.iterchildren())
          if len(siblings) == len(self):
              for tag in self:
                  parent.remove(tag)
              parent.append(wrapper)

      self[:] = [wrapper]
      return self
  @with_camel_case_alias
  def replace_with(self, value):
      """Replace every selected node by ``value``::

          >>> doc = PyQuery("<html><div /></html>")
          >>> node = PyQuery("<span />")
          >>> child = doc.find('div')
          >>> child.replace_with(node)
          [<div>]
          >>> print(doc)
          <html><span/></html>

      A ``PyQuery`` value is converted with ``str``.  A callable value is
      called as ``value(index, element)`` for each node to obtain its
      replacement.  Returns ``self``.
      """
      # Convert first: only then is the callable test made on the value.
      if isinstance(value, PyQuery):
          value = str(value)
      per_element = hasattr(value, '__call__')
      for position, node in enumerate(self):
          content = value(position, node) if per_element else value
          # The node's trailing text is carried along with the replacement.
          self._copy(node).before(content + (node.tail or ''))
          node.getparent().remove(node)
      return self
  @with_camel_case_alias
  def replace_all(self, expr):
      """Replace the nodes that ``expr`` selects in the parent object by
      the selected nodes; returns ``self``.

      Raises ``ValueError`` when this object has no parent.
      """
      owner = self._parent
      if owner is no_default:
          raise ValueError(
              'replaceAll can only be used with an object with parent')
      targets = owner(expr)
      targets.replace_with(self)
      return self
  def clone(self):
      """Return a new ``PyQuery`` holding deep copies of the selected nodes.
      """
      copies = []
      for tag in self:
          copies.append(deepcopy(tag))
      return PyQuery(copies)
  def empty(self):
      """Remove the text and all children of every selected node; returns
      ``self``.
      """
      for node in self:
          node[:] = []
          node.text = None
      return self
  def remove(self, expr=no_default):
      """Remove nodes:

          >>> h = '<div>Maybe <em>she</em> does <strong>NOT</strong> know</div>'
          >>> d = PyQuery(h)
          >>> d('strong').remove()
          [<strong>]
          >>> print(d)
          <div>Maybe <em>she</em> does know</div>

      Without ``expr`` the selected nodes are removed from their parents.
      With ``expr`` the nodes it selects within this selection are removed
      instead.  Returns ``self`` in both cases.
      """
      if expr is not no_default:
          matched = self._copy(expr, self)
          matched.remove()
          return self

      for tag in self:
          parent = tag.getparent()
          if parent is None:
              continue
          trailing = tag.tail
          if trailing:
              # Keep the text that followed the node: hand it to the
              # previous sibling, or to the parent when there is none.
              prev = tag.getprevious()
              if prev is None:
                  parent.text = (parent.text or '') + ' ' + trailing
              else:
                  prev.tail = (prev.tail or '') + ' ' + trailing
          parent.remove(tag)
      return self
  class Fn(object):
      """Hook for defining custom function (like the jQuery.fn):

      .. sourcecode:: python

          >>> fn = lambda: this.map(lambda i, el: PyQuery(this).outerHtml())
          >>> PyQuery.fn.listOuterHtml = fn
          >>> S = PyQuery(
          ...   '<ol>   <li>Coffee</li>   <li>Tea</li>   <li>Milk</li>   </ol>')
          >>> S('li').listOuterHtml()
          ['<li>Coffee</li>', '<li>Tea</li>', '<li>Milk</li>']

      Assigning a function to an attribute of the hook installs a method
      of that name on ``PyQuery``.
      """
      def __setattr__(self, name, func):
          def fn(self, *args, **kwargs):
              # Publish the object the method was called on as ``this`` in
              # the function's globals, then call it without that object.
              namespace = func_globals(func)
              namespace['this'] = self
              return func(*args, **kwargs)
          fn.__name__ = name
          setattr(PyQuery, name, fn)
  fn = Fn()
  1251. #####################################################
  1252. # Additional methods that are not in the jQuery API #
  1253. #####################################################
  @property
  def base_url(self):
      """Return the url of current html document or None if not available.

      The object's own ``_base_url`` wins; otherwise the parent object,
      when there is one, is asked for its ``base_url``.
      """
      own_url = self._base_url
      if own_url is not None:
          return own_url
      if self._parent is no_default:
          return None
      return self._parent.base_url
  def make_links_absolute(self, base_url=None):
      """Make all links absolute.

      Rewrites ``href`` on ``a`` and ``link``, ``src`` on ``script``,
      ``img`` and ``iframe``, and ``action`` on ``form`` by joining the
      stripped attribute value with ``base_url``.  Elements that lack the
      attribute are left alone.

      ``base_url`` defaults to ``self.base_url``.  Raises ``ValueError``
      when neither is available.  Returns ``self``.
      """
      if base_url is None:
          base_url = self.base_url
          if base_url is None:
              raise ValueError((
                  'You need a base URL to make your links '
                  'absolute. It can be provided by the base_url parameter.'))

      def repl(attr):
          def rep(i, e):
              node = self(e)
              attr_value = node.attr(attr)
              # when label hasn't such attr, pass
              if attr_value is None:
                  return None
              return node.attr(attr,
                               urljoin(base_url, attr_value.strip()))
          return rep

      # (selector, attribute holding the URL), processed in this order.
      link_attributes = (
          ('a', 'href'),
          ('link', 'href'),
          ('script', 'src'),
          ('img', 'src'),
          ('iframe', 'src'),
          ('form', 'action'),
      )
      for selector, attr in link_attributes:
          self(selector).each(repl(attr))
      return self
  # Module-level call made with the PyQuery class as its argument.
  # NOTE(review): build_camel_case_aliases is defined outside this chunk;
  # presumably it installs the camelCase aliases (e.g. outerHtml) of the
  # methods decorated with @with_camel_case_alias -- confirm.
  build_camel_case_aliases(PyQuery)