pytables.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604
  1. """ manage PyTables query interface via Expressions """
  2. import ast
  3. from functools import partial
  4. import numpy as np
  5. from pandas.compat import DeepChainMap, string_types, u
  6. from pandas.core.dtypes.common import is_list_like
  7. import pandas as pd
  8. from pandas.core.base import StringMixin
  9. import pandas.core.common as com
  10. from pandas.core.computation import expr, ops
  11. from pandas.core.computation.common import _ensure_decoded
  12. from pandas.core.computation.expr import BaseExprVisitor
  13. from pandas.core.computation.ops import UndefinedVariableError, is_term
  14. from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded
  15. class Scope(expr.Scope):
  16. __slots__ = 'queryables',
  17. def __init__(self, level, global_dict=None, local_dict=None,
  18. queryables=None):
  19. super(Scope, self).__init__(level + 1, global_dict=global_dict,
  20. local_dict=local_dict)
  21. self.queryables = queryables or dict()
  22. class Term(ops.Term):
  23. def __new__(cls, name, env, side=None, encoding=None):
  24. klass = Constant if not isinstance(name, string_types) else cls
  25. supr_new = StringMixin.__new__
  26. return supr_new(klass)
  27. def __init__(self, name, env, side=None, encoding=None):
  28. super(Term, self).__init__(name, env, side=side, encoding=encoding)
  29. def _resolve_name(self):
  30. # must be a queryables
  31. if self.side == 'left':
  32. if self.name not in self.env.queryables:
  33. raise NameError('name {name!r} is not defined'
  34. .format(name=self.name))
  35. return self.name
  36. # resolve the rhs (and allow it to be None)
  37. try:
  38. return self.env.resolve(self.name, is_local=False)
  39. except UndefinedVariableError:
  40. return self.name
  41. @property
  42. def value(self):
  43. return self._value
  44. class Constant(Term):
  45. def __init__(self, value, env, side=None, encoding=None):
  46. super(Constant, self).__init__(value, env, side=side,
  47. encoding=encoding)
  48. def _resolve_name(self):
  49. return self._name
  50. class BinOp(ops.BinOp):
  51. _max_selectors = 31
  52. def __init__(self, op, lhs, rhs, queryables, encoding):
  53. super(BinOp, self).__init__(op, lhs, rhs)
  54. self.queryables = queryables
  55. self.encoding = encoding
  56. self.filter = None
  57. self.condition = None
  58. def _disallow_scalar_only_bool_ops(self):
  59. pass
  60. def prune(self, klass):
  61. def pr(left, right):
  62. """ create and return a new specialized BinOp from myself """
  63. if left is None:
  64. return right
  65. elif right is None:
  66. return left
  67. k = klass
  68. if isinstance(left, ConditionBinOp):
  69. if (isinstance(left, ConditionBinOp) and
  70. isinstance(right, ConditionBinOp)):
  71. k = JointConditionBinOp
  72. elif isinstance(left, k):
  73. return left
  74. elif isinstance(right, k):
  75. return right
  76. elif isinstance(left, FilterBinOp):
  77. if (isinstance(left, FilterBinOp) and
  78. isinstance(right, FilterBinOp)):
  79. k = JointFilterBinOp
  80. elif isinstance(left, k):
  81. return left
  82. elif isinstance(right, k):
  83. return right
  84. return k(self.op, left, right, queryables=self.queryables,
  85. encoding=self.encoding).evaluate()
  86. left, right = self.lhs, self.rhs
  87. if is_term(left) and is_term(right):
  88. res = pr(left.value, right.value)
  89. elif not is_term(left) and is_term(right):
  90. res = pr(left.prune(klass), right.value)
  91. elif is_term(left) and not is_term(right):
  92. res = pr(left.value, right.prune(klass))
  93. elif not (is_term(left) or is_term(right)):
  94. res = pr(left.prune(klass), right.prune(klass))
  95. return res
  96. def conform(self, rhs):
  97. """ inplace conform rhs """
  98. if not is_list_like(rhs):
  99. rhs = [rhs]
  100. if isinstance(rhs, np.ndarray):
  101. rhs = rhs.ravel()
  102. return rhs
  103. @property
  104. def is_valid(self):
  105. """ return True if this is a valid field """
  106. return self.lhs in self.queryables
  107. @property
  108. def is_in_table(self):
  109. """ return True if this is a valid column name for generation (e.g. an
  110. actual column in the table) """
  111. return self.queryables.get(self.lhs) is not None
  112. @property
  113. def kind(self):
  114. """ the kind of my field """
  115. return getattr(self.queryables.get(self.lhs), 'kind', None)
  116. @property
  117. def meta(self):
  118. """ the meta of my field """
  119. return getattr(self.queryables.get(self.lhs), 'meta', None)
  120. @property
  121. def metadata(self):
  122. """ the metadata of my field """
  123. return getattr(self.queryables.get(self.lhs), 'metadata', None)
  124. def generate(self, v):
  125. """ create and return the op string for this TermValue """
  126. val = v.tostring(self.encoding)
  127. return "({lhs} {op} {val})".format(lhs=self.lhs, op=self.op, val=val)
  128. def convert_value(self, v):
  129. """ convert the expression that is in the term to something that is
  130. accepted by pytables """
  131. def stringify(value):
  132. if self.encoding is not None:
  133. encoder = partial(pprint_thing_encoded,
  134. encoding=self.encoding)
  135. else:
  136. encoder = pprint_thing
  137. return encoder(value)
  138. kind = _ensure_decoded(self.kind)
  139. meta = _ensure_decoded(self.meta)
  140. if kind == u('datetime64') or kind == u('datetime'):
  141. if isinstance(v, (int, float)):
  142. v = stringify(v)
  143. v = _ensure_decoded(v)
  144. v = pd.Timestamp(v)
  145. if v.tz is not None:
  146. v = v.tz_convert('UTC')
  147. return TermValue(v, v.value, kind)
  148. elif kind == u('timedelta64') or kind == u('timedelta'):
  149. v = pd.Timedelta(v, unit='s').value
  150. return TermValue(int(v), v, kind)
  151. elif meta == u('category'):
  152. metadata = com.values_from_object(self.metadata)
  153. result = metadata.searchsorted(v, side='left')
  154. # result returns 0 if v is first element or if v is not in metadata
  155. # check that metadata contains v
  156. if not result and v not in metadata:
  157. result = -1
  158. return TermValue(result, result, u('integer'))
  159. elif kind == u('integer'):
  160. v = int(float(v))
  161. return TermValue(v, v, kind)
  162. elif kind == u('float'):
  163. v = float(v)
  164. return TermValue(v, v, kind)
  165. elif kind == u('bool'):
  166. if isinstance(v, string_types):
  167. v = not v.strip().lower() in [u('false'), u('f'), u('no'),
  168. u('n'), u('none'), u('0'),
  169. u('[]'), u('{}'), u('')]
  170. else:
  171. v = bool(v)
  172. return TermValue(v, v, kind)
  173. elif isinstance(v, string_types):
  174. # string quoting
  175. return TermValue(v, stringify(v), u('string'))
  176. else:
  177. raise TypeError("Cannot compare {v} of type {typ} to {kind} column"
  178. .format(v=v, typ=type(v), kind=kind))
  179. def convert_values(self):
  180. pass
  181. class FilterBinOp(BinOp):
  182. def __unicode__(self):
  183. return pprint_thing("[Filter : [{lhs}] -> [{op}]"
  184. .format(lhs=self.filter[0], op=self.filter[1]))
  185. def invert(self):
  186. """ invert the filter """
  187. if self.filter is not None:
  188. f = list(self.filter)
  189. f[1] = self.generate_filter_op(invert=True)
  190. self.filter = tuple(f)
  191. return self
  192. def format(self):
  193. """ return the actual filter format """
  194. return [self.filter]
  195. def evaluate(self):
  196. if not self.is_valid:
  197. raise ValueError("query term is not valid [{slf}]"
  198. .format(slf=self))
  199. rhs = self.conform(self.rhs)
  200. values = [TermValue(v, v, self.kind) for v in rhs]
  201. if self.is_in_table:
  202. # if too many values to create the expression, use a filter instead
  203. if self.op in ['==', '!='] and len(values) > self._max_selectors:
  204. filter_op = self.generate_filter_op()
  205. self.filter = (
  206. self.lhs,
  207. filter_op,
  208. pd.Index([v.value for v in values]))
  209. return self
  210. return None
  211. # equality conditions
  212. if self.op in ['==', '!=']:
  213. filter_op = self.generate_filter_op()
  214. self.filter = (
  215. self.lhs,
  216. filter_op,
  217. pd.Index([v.value for v in values]))
  218. else:
  219. raise TypeError("passing a filterable condition to a non-table "
  220. "indexer [{slf}]".format(slf=self))
  221. return self
  222. def generate_filter_op(self, invert=False):
  223. if (self.op == '!=' and not invert) or (self.op == '==' and invert):
  224. return lambda axis, vals: ~axis.isin(vals)
  225. else:
  226. return lambda axis, vals: axis.isin(vals)
  227. class JointFilterBinOp(FilterBinOp):
  228. def format(self):
  229. raise NotImplementedError("unable to collapse Joint Filters")
  230. def evaluate(self):
  231. return self
  232. class ConditionBinOp(BinOp):
  233. def __unicode__(self):
  234. return pprint_thing("[Condition : [{cond}]]"
  235. .format(cond=self.condition))
  236. def invert(self):
  237. """ invert the condition """
  238. # if self.condition is not None:
  239. # self.condition = "~(%s)" % self.condition
  240. # return self
  241. raise NotImplementedError("cannot use an invert condition when "
  242. "passing to numexpr")
  243. def format(self):
  244. """ return the actual ne format """
  245. return self.condition
  246. def evaluate(self):
  247. if not self.is_valid:
  248. raise ValueError("query term is not valid [{slf}]"
  249. .format(slf=self))
  250. # convert values if we are in the table
  251. if not self.is_in_table:
  252. return None
  253. rhs = self.conform(self.rhs)
  254. values = [self.convert_value(v) for v in rhs]
  255. # equality conditions
  256. if self.op in ['==', '!=']:
  257. # too many values to create the expression?
  258. if len(values) <= self._max_selectors:
  259. vs = [self.generate(v) for v in values]
  260. self.condition = "({cond})".format(cond=' | '.join(vs))
  261. # use a filter after reading
  262. else:
  263. return None
  264. else:
  265. self.condition = self.generate(values[0])
  266. return self
  267. class JointConditionBinOp(ConditionBinOp):
  268. def evaluate(self):
  269. self.condition = "({lhs} {op} {rhs})".format(lhs=self.lhs.condition,
  270. op=self.op,
  271. rhs=self.rhs.condition)
  272. return self
  273. class UnaryOp(ops.UnaryOp):
  274. def prune(self, klass):
  275. if self.op != '~':
  276. raise NotImplementedError("UnaryOp only support invert type ops")
  277. operand = self.operand
  278. operand = operand.prune(klass)
  279. if operand is not None:
  280. if issubclass(klass, ConditionBinOp):
  281. if operand.condition is not None:
  282. return operand.invert()
  283. elif issubclass(klass, FilterBinOp):
  284. if operand.filter is not None:
  285. return operand.invert()
  286. return None
  287. _op_classes = {'unary': UnaryOp}
  288. class ExprVisitor(BaseExprVisitor):
  289. const_type = Constant
  290. term_type = Term
  291. def __init__(self, env, engine, parser, **kwargs):
  292. super(ExprVisitor, self).__init__(env, engine, parser)
  293. for bin_op in self.binary_ops:
  294. bin_node = self.binary_op_nodes_map[bin_op]
  295. setattr(self, 'visit_{node}'.format(node=bin_node),
  296. lambda node, bin_op=bin_op: partial(BinOp, bin_op,
  297. **kwargs))
  298. def visit_UnaryOp(self, node, **kwargs):
  299. if isinstance(node.op, (ast.Not, ast.Invert)):
  300. return UnaryOp('~', self.visit(node.operand))
  301. elif isinstance(node.op, ast.USub):
  302. return self.const_type(-self.visit(node.operand).value, self.env)
  303. elif isinstance(node.op, ast.UAdd):
  304. raise NotImplementedError('Unary addition not supported')
  305. def visit_Index(self, node, **kwargs):
  306. return self.visit(node.value).value
  307. def visit_Assign(self, node, **kwargs):
  308. cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0],
  309. comparators=[node.value])
  310. return self.visit(cmpr)
  311. def visit_Subscript(self, node, **kwargs):
  312. # only allow simple suscripts
  313. value = self.visit(node.value)
  314. slobj = self.visit(node.slice)
  315. try:
  316. value = value.value
  317. except AttributeError:
  318. pass
  319. try:
  320. return self.const_type(value[slobj], self.env)
  321. except TypeError:
  322. raise ValueError("cannot subscript {value!r} with "
  323. "{slobj!r}".format(value=value, slobj=slobj))
  324. def visit_Attribute(self, node, **kwargs):
  325. attr = node.attr
  326. value = node.value
  327. ctx = node.ctx.__class__
  328. if ctx == ast.Load:
  329. # resolve the value
  330. resolved = self.visit(value)
  331. # try to get the value to see if we are another expression
  332. try:
  333. resolved = resolved.value
  334. except (AttributeError):
  335. pass
  336. try:
  337. return self.term_type(getattr(resolved, attr), self.env)
  338. except AttributeError:
  339. # something like datetime.datetime where scope is overridden
  340. if isinstance(value, ast.Name) and value.id == attr:
  341. return resolved
  342. raise ValueError("Invalid Attribute context {name}"
  343. .format(name=ctx.__name__))
  344. def translate_In(self, op):
  345. return ast.Eq() if isinstance(op, ast.In) else op
  346. def _rewrite_membership_op(self, node, left, right):
  347. return self.visit(node.op), node.op, left, right
  348. def _validate_where(w):
  349. """
  350. Validate that the where statement is of the right type.
  351. The type may either be String, Expr, or list-like of Exprs.
  352. Parameters
  353. ----------
  354. w : String term expression, Expr, or list-like of Exprs.
  355. Returns
  356. -------
  357. where : The original where clause if the check was successful.
  358. Raises
  359. ------
  360. TypeError : An invalid data type was passed in for w (e.g. dict).
  361. """
  362. if not (isinstance(w, (Expr, string_types)) or is_list_like(w)):
  363. raise TypeError("where must be passed as a string, Expr, "
  364. "or list-like of Exprs")
  365. return w
  366. class Expr(expr.Expr):
  367. """ hold a pytables like expression, comprised of possibly multiple 'terms'
  368. Parameters
  369. ----------
  370. where : string term expression, Expr, or list-like of Exprs
  371. queryables : a "kinds" map (dict of column name -> kind), or None if column
  372. is non-indexable
  373. encoding : an encoding that will encode the query terms
  374. Returns
  375. -------
  376. an Expr object
  377. Examples
  378. --------
  379. 'index>=date'
  380. "columns=['A', 'D']"
  381. 'columns=A'
  382. 'columns==A'
  383. "~(columns=['A','B'])"
  384. 'index>df.index[3] & string="bar"'
  385. '(index>df.index[3] & index<=df.index[6]) | string="bar"'
  386. "ts>=Timestamp('2012-02-01')"
  387. "major_axis>=20130101"
  388. """
  389. def __init__(self, where, queryables=None, encoding=None, scope_level=0):
  390. where = _validate_where(where)
  391. self.encoding = encoding
  392. self.condition = None
  393. self.filter = None
  394. self.terms = None
  395. self._visitor = None
  396. # capture the environment if needed
  397. local_dict = DeepChainMap()
  398. if isinstance(where, Expr):
  399. local_dict = where.env.scope
  400. where = where.expr
  401. elif isinstance(where, (list, tuple)):
  402. for idx, w in enumerate(where):
  403. if isinstance(w, Expr):
  404. local_dict = w.env.scope
  405. else:
  406. w = _validate_where(w)
  407. where[idx] = w
  408. where = ' & '.join(map('({})'.format, com.flatten(where))) # noqa
  409. self.expr = where
  410. self.env = Scope(scope_level + 1, local_dict=local_dict)
  411. if queryables is not None and isinstance(self.expr, string_types):
  412. self.env.queryables.update(queryables)
  413. self._visitor = ExprVisitor(self.env, queryables=queryables,
  414. parser='pytables', engine='pytables',
  415. encoding=encoding)
  416. self.terms = self.parse()
  417. def __unicode__(self):
  418. if self.terms is not None:
  419. return pprint_thing(self.terms)
  420. return pprint_thing(self.expr)
  421. def evaluate(self):
  422. """ create and return the numexpr condition and filter """
  423. try:
  424. self.condition = self.terms.prune(ConditionBinOp)
  425. except AttributeError:
  426. raise ValueError("cannot process expression [{expr}], [{slf}] "
  427. "is not a valid condition".format(expr=self.expr,
  428. slf=self))
  429. try:
  430. self.filter = self.terms.prune(FilterBinOp)
  431. except AttributeError:
  432. raise ValueError("cannot process expression [{expr}], [{slf}] "
  433. "is not a valid filter".format(expr=self.expr,
  434. slf=self))
  435. return self.condition, self.filter
  436. class TermValue(object):
  437. """ hold a term value the we use to construct a condition/filter """
  438. def __init__(self, value, converted, kind):
  439. self.value = value
  440. self.converted = converted
  441. self.kind = kind
  442. def tostring(self, encoding):
  443. """ quote the string if not encoded
  444. else encode and return """
  445. if self.kind == u'string':
  446. if encoding is not None:
  447. return self.converted
  448. return '"{converted}"'.format(converted=self.converted)
  449. elif self.kind == u'float':
  450. # python 2 str(float) is not always
  451. # round-trippable so use repr()
  452. return repr(self.converted)
  453. return self.converted
  454. def maybe_expression(s):
  455. """ loose checking if s is a pytables-acceptable expression """
  456. if not isinstance(s, string_types):
  457. return False
  458. ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',)
  459. # make sure we have an op at least
  460. return any(op in s for op in ops)