""":func:`~pandas.eval` parsers """ import ast from functools import partial import tokenize import numpy as np from pandas.compat import StringIO, lmap, reduce, string_types, zip import pandas as pd from pandas import compat from pandas.core import common as com from pandas.core.base import StringMixin from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, _mathops, _reductions, _unary_ops_syms, is_term) from pandas.core.computation.scope import Scope import pandas.io.formats.printing as printing def tokenize_string(source): """Tokenize a Python source code string. Parameters ---------- source : str A Python source code string """ line_reader = StringIO(source).readline for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader): yield toknum, tokval def _rewrite_assign(tok): """Rewrite the assignment operator for PyTables expressions that use ``=`` as a substitute for ``==``. Parameters ---------- tok : tuple of int, str ints correspond to the all caps constants in the tokenize module Returns ------- t : tuple of int, str Either the input or token or the replacement values """ toknum, tokval = tok return toknum, '==' if tokval == '=' else tokval def _replace_booleans(tok): """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise precedence is changed to boolean precedence. Parameters ---------- tok : tuple of int, str ints correspond to the all caps constants in the tokenize module Returns ------- t : tuple of int, str Either the input or token or the replacement values """ toknum, tokval = tok if toknum == tokenize.OP: if tokval == '&': return tokenize.NAME, 'and' elif tokval == '|': return tokenize.NAME, 'or' return toknum, tokval return toknum, tokval def _replace_locals(tok): """Replace local variables with a syntactically valid name. Parameters ---------- tok : tuple of int, str ints correspond to the all caps constants in the tokenize module Returns ------- t : tuple of int, str Either the input or token or the replacement values Notes ----- This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_`` is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it. """ toknum, tokval = tok if toknum == tokenize.OP and tokval == '@': return tokenize.OP, _LOCAL_TAG return toknum, tokval def _compose2(f, g): """Compose 2 callables""" return lambda *args, **kwargs: f(g(*args, **kwargs)) def _compose(*funcs): """Compose 2 or more callables""" assert len(funcs) > 1, 'At least 2 callables must be passed to compose' return reduce(_compose2, funcs) def _preparse(source, f=_compose(_replace_locals, _replace_booleans, _rewrite_assign)): """Compose a collection of tokenization functions Parameters ---------- source : str A Python source code string f : callable This takes a tuple of (toknum, tokval) as its argument and returns a tuple with the same structure but possibly different elements. Defaults to the composition of ``_rewrite_assign``, ``_replace_booleans``, and ``_replace_locals``. Returns ------- s : str Valid Python source code Notes ----- The `f` parameter can be any callable that takes *and* returns input of the form ``(toknum, tokval)``, where ``toknum`` is one of the constants from the ``tokenize`` module and ``tokval`` is a string. """ assert callable(f), 'f must be callable' return tokenize.untokenize(lmap(f, tokenize_string(source))) def _is_type(t): """Factory for a type checking function of type ``t`` or tuple of types.""" return lambda x: isinstance(x.value, t) _is_list = _is_type(list) _is_str = _is_type(string_types) # partition all AST nodes _all_nodes = frozenset(filter(lambda x: isinstance(x, type) and issubclass(x, ast.AST), (getattr(ast, node) for node in dir(ast)))) def _filter_nodes(superclass, all_nodes=_all_nodes): """Filter out AST nodes that are subclasses of ``superclass``.""" node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) return frozenset(node_names) _all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes)) _mod_nodes = _filter_nodes(ast.mod) _stmt_nodes = _filter_nodes(ast.stmt) _expr_nodes = _filter_nodes(ast.expr) _expr_context_nodes = _filter_nodes(ast.expr_context) _slice_nodes = _filter_nodes(ast.slice) _boolop_nodes = _filter_nodes(ast.boolop) _operator_nodes = _filter_nodes(ast.operator) _unary_op_nodes = _filter_nodes(ast.unaryop) _cmp_op_nodes = _filter_nodes(ast.cmpop) _comprehension_nodes = _filter_nodes(ast.comprehension) _handler_nodes = _filter_nodes(ast.excepthandler) _arguments_nodes = _filter_nodes(ast.arguments) _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(['Assign', 'Module', 'Expr']) _unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp', 'DictComp', 'SetComp', 'Repr', 'Lambda', 'Set', 'AST', 'Is', 'IsNot']) # these nodes are low priority or won't ever be supported (e.g., AST) _unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | _arguments_nodes | _keyword_nodes | _alias_nodes | _expr_context_nodes | _unsupported_expr_nodes) - _hacked_nodes) # we're adding a different assignment in some cases to be equality comparison # and we don't want `stmt` and friends in their so get only the class whose # names are capitalized _base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes _msg = 'cannot both support and not support {intersection}'.format( intersection=_unsupported_nodes & _base_supported_nodes) assert not _unsupported_nodes & _base_supported_nodes, _msg def _node_not_implemented(node_name, cls): """Return a function that raises a NotImplementedError with a passed node name. """ def f(self, *args, **kwargs): raise NotImplementedError("{name!r} nodes are not " "implemented".format(name=node_name)) return f def disallow(nodes): """Decorator to disallow certain nodes from parsing. Raises a NotImplementedError instead. Returns ------- disallowed : callable """ def disallowed(cls): cls.unsupported_nodes = () for node in nodes: new_method = _node_not_implemented(node, cls) name = 'visit_{node}'.format(node=node) cls.unsupported_nodes += (name,) setattr(cls, name, new_method) return cls return disallowed def _op_maker(op_class, op_symbol): """Return a function to create an op class with its symbol already passed. Returns ------- f : callable """ def f(self, node, *args, **kwargs): """Return a partial function with an Op subclass with an operator already passed. Returns ------- f : callable """ return partial(op_class, op_symbol, *args, **kwargs) return f _op_classes = {'binary': BinOp, 'unary': UnaryOp} def add_ops(op_classes): """Decorator to add default implementation of ops.""" def f(cls): for op_attr_name, op_class in compat.iteritems(op_classes): ops = getattr(cls, '{name}_ops'.format(name=op_attr_name)) ops_map = getattr(cls, '{name}_op_nodes_map'.format( name=op_attr_name)) for op in ops: op_node = ops_map[op] if op_node is not None: made_op = _op_maker(op_class, op) setattr(cls, 'visit_{node}'.format(node=op_node), made_op) return cls return f @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): """Custom ast walker. Parsers of other engines should subclass this class if necessary. Parameters ---------- env : Scope engine : str parser : str preparser : callable """ const_type = Constant term_type = Term binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn', 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', None, 'Pow', 'FloorDiv', 'Mod') binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) unary_ops = _unary_ops_syms unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) rewrite_map = { ast.Eq: ast.In, ast.NotEq: ast.NotIn, ast.In: ast.In, ast.NotIn: ast.NotIn } def __init__(self, env, engine, parser, preparser=_preparse): self.env = env self.engine = engine self.parser = parser self.preparser = preparser self.assigner = None def visit(self, node, **kwargs): if isinstance(node, string_types): clean = self.preparser(node) try: node = ast.fix_missing_locations(ast.parse(clean)) except SyntaxError as e: from keyword import iskeyword if any(iskeyword(x) for x in clean.split()): e.msg = ("Python keyword not valid identifier" " in numexpr query") raise e method = 'visit_' + node.__class__.__name__ visitor = getattr(self, method) return visitor(node, **kwargs) def visit_Module(self, node, **kwargs): if len(node.body) != 1: raise SyntaxError('only a single expression is allowed') expr = node.body[0] return self.visit(expr, **kwargs) def visit_Expr(self, node, **kwargs): return self.visit(node.value, **kwargs) def _rewrite_membership_op(self, node, left, right): # the kind of the operator (is actually an instance) op_instance = node.op op_type = type(op_instance) # must be two terms and the comparison operator must be ==/!=/in/not in if is_term(left) and is_term(right) and op_type in self.rewrite_map: left_list, right_list = map(_is_list, (left, right)) left_str, right_str = map(_is_str, (left, right)) # if there are any strings or lists in the expression if left_list or right_list or left_str or right_str: op_instance = self.rewrite_map[op_type]() # pop the string variable out of locals and replace it with a list # of one string, kind of a hack if right_str: name = self.env.add_tmp([right.value]) right = self.term_type(name, self.env) if left_str: name = self.env.add_tmp([left.value]) left = self.term_type(name, self.env) op = self.visit(op_instance) return op, op_instance, left, right def _maybe_transform_eq_ne(self, node, left=None, right=None): if left is None: left = self.visit(node.left, side='left') if right is None: right = self.visit(node.right, side='right') op, op_class, left, right = self._rewrite_membership_op(node, left, right) return op, op_class, left, right def _maybe_downcast_constants(self, left, right): f32 = np.dtype(np.float32) if left.is_scalar and not right.is_scalar and right.return_type == f32: # right is a float32 array, left is a scalar name = self.env.add_tmp(np.float32(left.value)) left = self.term_type(name, self.env) if right.is_scalar and not left.is_scalar and left.return_type == f32: # left is a float32 array, right is a scalar name = self.env.add_tmp(np.float32(right.value)) right = self.term_type(name, self.env) return left, right def _maybe_eval(self, binop, eval_in_python): # eval `in` and `not in` (for now) in "partial" python space # things that can be evaluated in "eval" space will be turned into # temporary variables. for example, # [1,2] in a + 2 * b # in that case a + 2 * b will be evaluated using numexpr, and the "in" # call will be evaluated using isin (in python space) return binop.evaluate(self.env, self.engine, self.parser, self.term_type, eval_in_python) def _maybe_evaluate_binop(self, op, op_class, lhs, rhs, eval_in_python=('in', 'not in'), maybe_eval_in_python=('==', '!=', '<', '>', '<=', '>=')): res = op(lhs, rhs) if res.has_invalid_return_type: raise TypeError("unsupported operand type(s) for {op}:" " '{lhs}' and '{rhs}'".format(op=res.op, lhs=lhs.type, rhs=rhs.type)) if self.engine != 'pytables': if (res.op in _cmp_ops_syms and getattr(lhs, 'is_datetime', False) or getattr(rhs, 'is_datetime', False)): # all date ops must be done in python bc numexpr doesn't work # well with NaT return self._maybe_eval(res, self.binary_ops) if res.op in eval_in_python: # "in"/"not in" ops are always evaluated in python return self._maybe_eval(res, eval_in_python) elif self.engine != 'pytables': if (getattr(lhs, 'return_type', None) == object or getattr(rhs, 'return_type', None) == object): # evaluate "==" and "!=" in python if either of our operands # has an object return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res def visit_BinOp(self, node, **kwargs): op, op_class, left, right = self._maybe_transform_eq_ne(node) left, right = self._maybe_downcast_constants(left, right) return self._maybe_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): truediv = self.env.scope['truediv'] return lambda lhs, rhs: Div(lhs, rhs, truediv) def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) operand = self.visit(node.operand) return op(operand) def visit_Name(self, node, **kwargs): return self.term_type(node.id, self.env, **kwargs) def visit_NameConstant(self, node, **kwargs): return self.const_type(node.value, self.env) def visit_Num(self, node, **kwargs): return self.const_type(node.n, self.env) def visit_Str(self, node, **kwargs): name = self.env.add_tmp(node.s) return self.term_type(name, self.env) def visit_List(self, node, **kwargs): name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts]) return self.term_type(name, self.env) visit_Tuple = visit_List def visit_Index(self, node, **kwargs): """ df.index[4] """ return self.visit(node.value) def visit_Subscript(self, node, **kwargs): value = self.visit(node.value) slobj = self.visit(node.slice) result = pd.eval(slobj, local_dict=self.env, engine=self.engine, parser=self.parser) try: # a Term instance v = value.value[result] except AttributeError: # an Op instance lhs = pd.eval(value, local_dict=self.env, engine=self.engine, parser=self.parser) v = lhs[result] name = self.env.add_tmp(v) return self.term_type(name, env=self.env) def visit_Slice(self, node, **kwargs): """ df.index[slice(4,6)] """ lower = node.lower if lower is not None: lower = self.visit(lower).value upper = node.upper if upper is not None: upper = self.visit(upper).value step = node.step if step is not None: step = self.visit(step).value return slice(lower, upper, step) def visit_Assign(self, node, **kwargs): """ support a single assignment node, like c = a + b set the assigner at the top level, must be a Name node which might or might not exist in the resolvers """ if len(node.targets) != 1: raise SyntaxError('can only assign a single expression') if not isinstance(node.targets[0], ast.Name): raise SyntaxError('left hand side of an assignment must be a ' 'single name') if self.env.target is None: raise ValueError('cannot assign without a target object') try: assigner = self.visit(node.targets[0], **kwargs) except UndefinedVariableError: assigner = node.targets[0].id self.assigner = getattr(assigner, 'name', assigner) if self.assigner is None: raise SyntaxError('left hand side of an assignment must be a ' 'single resolvable name') return self.visit(node.value, **kwargs) def visit_Attribute(self, node, **kwargs): attr = node.attr value = node.value ctx = node.ctx if isinstance(ctx, ast.Load): # resolve the value resolved = self.visit(value).value try: v = getattr(resolved, attr) name = self.env.add_tmp(v) return self.term_type(name, self.env) except AttributeError: # something like datetime.datetime where scope is overridden if isinstance(value, ast.Name) and value.id == attr: return resolved raise ValueError("Invalid Attribute context {name}" .format(name=ctx.__name__)) def visit_Call_35(self, node, side=None, **kwargs): """ in 3.5 the starargs attribute was changed to be more flexible, #11097 """ if isinstance(node.func, ast.Attribute): res = self.visit_Attribute(node.func) elif not isinstance(node.func, ast.Name): raise TypeError("Only named functions are supported") else: try: res = self.visit(node.func) except UndefinedVariableError: # Check if this is a supported function name try: res = FuncNode(node.func.id) except ValueError: # Raise original error raise if res is None: raise ValueError("Invalid function call {func}" .format(func=node.func.id)) if hasattr(res, 'value'): res = res.value if isinstance(res, FuncNode): new_args = [self.visit(arg) for arg in node.args] if node.keywords: raise TypeError("Function \"{name}\" does not support keyword " "arguments".format(name=res.name)) return res(*new_args, **kwargs) else: new_args = [self.visit(arg).value for arg in node.args] for key in node.keywords: if not isinstance(key, ast.keyword): raise ValueError("keyword error in function call " "'{func}'".format(func=node.func.id)) if key.arg: # TODO: bug? kwargs.append(ast.keyword( keyword.arg, self.visit(keyword.value))) # noqa return self.const_type(res(*new_args, **kwargs), self.env) def visit_Call_legacy(self, node, side=None, **kwargs): # this can happen with: datetime.datetime if isinstance(node.func, ast.Attribute): res = self.visit_Attribute(node.func) elif not isinstance(node.func, ast.Name): raise TypeError("Only named functions are supported") else: try: res = self.visit(node.func) except UndefinedVariableError: # Check if this is a supported function name try: res = FuncNode(node.func.id) except ValueError: # Raise original error raise if res is None: raise ValueError("Invalid function call {func}" .format(func=node.func.id)) if hasattr(res, 'value'): res = res.value if isinstance(res, FuncNode): args = [self.visit(targ) for targ in node.args] if node.starargs is not None: args += self.visit(node.starargs) if node.keywords or node.kwargs: raise TypeError("Function \"{name}\" does not support keyword " "arguments".format(name=res.name)) return res(*args, **kwargs) else: args = [self.visit(targ).value for targ in node.args] if node.starargs is not None: args += self.visit(node.starargs).value keywords = {} for key in node.keywords: if not isinstance(key, ast.keyword): raise ValueError("keyword error in function call " "'{func}'".format(func=node.func.id)) keywords[key.arg] = self.visit(key.value).value if node.kwargs is not None: keywords.update(self.visit(node.kwargs).value) return self.const_type(res(*args, **keywords), self.env) def translate_In(self, op): return op def visit_Compare(self, node, **kwargs): ops = node.ops comps = node.comparators # base case: we have something like a CMP b if len(comps) == 1: op = self.translate_In(ops[0]) binop = ast.BinOp(op=op, left=node.left, right=comps[0]) return self.visit(binop) # recursive case: we have a chained comparison, a CMP b CMP c, etc. left = node.left values = [] for op, comp in zip(ops, comps): new_node = self.visit(ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])) left = comp values.append(new_node) return self.visit(ast.BoolOp(op=ast.And(), values=values)) def _try_visit_binop(self, bop): if isinstance(bop, (Op, Term)): return bop return self.visit(bop) def visit_BoolOp(self, node, **kwargs): def visitor(x, y): lhs = self._try_visit_binop(x) rhs = self._try_visit_binop(y) op, op_class, lhs, rhs = self._maybe_transform_eq_ne( node, lhs, rhs) return self._maybe_evaluate_binop(op, node.op, lhs, rhs) operands = node.values return reduce(visitor, operands) # ast.Call signature changed on 3.5, # conditionally change which methods is named # visit_Call depending on Python version, #11097 if compat.PY35: BaseExprVisitor.visit_Call = BaseExprVisitor.visit_Call_35 else: BaseExprVisitor.visit_Call = BaseExprVisitor.visit_Call_legacy _python_not_supported = frozenset(['Dict', 'BoolOp', 'In', 'NotIn']) _numexpr_supported_calls = frozenset(_reductions + _mathops) @disallow((_unsupported_nodes | _python_not_supported) - (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn', 'Tuple']))) class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, preparser=partial(_preparse, f=_compose(_replace_locals, _replace_booleans))): super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) @disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) class PythonExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, preparser=lambda x: x): super(PythonExprVisitor, self).__init__(env, engine, parser, preparser=preparser) class Expr(StringMixin): """Object encapsulating an expression. Parameters ---------- expr : str engine : str, optional, default 'numexpr' parser : str, optional, default 'pandas' env : Scope, optional, default None truediv : bool, optional, default True level : int, optional, default 2 """ def __init__(self, expr, engine='numexpr', parser='pandas', env=None, truediv=True, level=0): self.expr = expr self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser self.env.scope['truediv'] = truediv self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() @property def assigner(self): return getattr(self._visitor, 'assigner', None) def __call__(self): return self.terms(self.env) def __unicode__(self): return printing.pprint_thing(self.terms) def __len__(self): return len(self.expr) def parse(self): """Parse an expression""" return self._visitor.visit(self.expr) @property def names(self): """Get the names in an expression""" if is_term(self.terms): return frozenset([self.terms.name]) return frozenset(term.name for term in com.flatten(self.terms)) _parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor}