12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040 |
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import operator
- import numpy as np
- import pytest
- from pandas.compat import StringIO, lrange, range, zip
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import DataFrame, Index, MultiIndex, Series, date_range
- from pandas.core.computation.check import _NUMEXPR_INSTALLED
- from pandas.tests.frame.common import TestData
- import pandas.util.testing as tm
- from pandas.util.testing import (
- assert_frame_equal, assert_series_equal, makeCustomDataframe as mkdf)
# Expression parsers exercised by the parametrized ``parser`` fixture.
PARSERS = ('python', 'pandas')
# Evaluation engines for the ``engine`` fixture; the numexpr entry carries
# the ``td.skip_if_no_ne`` mark.
ENGINES = ('python', pytest.param('numexpr', marks=td.skip_if_no_ne))
@pytest.fixture(params=PARSERS, ids=lambda value: value)
def parser(request):
    """Fixture parametrized over every entry of ``PARSERS``."""
    chosen = request.param
    return chosen
@pytest.fixture(params=ENGINES, ids=lambda value: value)
def engine(request):
    """Fixture parametrized over every entry of ``ENGINES``."""
    chosen = request.param
    return chosen
def skip_if_no_pandas_parser(parser):
    """Skip the calling test unless ``parser`` is ``'pandas'``."""
    if parser == 'pandas':
        return
    pytest.skip("cannot evaluate with parser {0!r}".format(parser))
- class TestCompat(object):
- def setup_method(self, method):
- self.df = DataFrame({'A': [1, 2, 3]})
- self.expected1 = self.df[self.df.A > 0]
- self.expected2 = self.df.A + 1
- def test_query_default(self):
- # GH 12749
- # this should always work, whether _NUMEXPR_INSTALLED or not
- df = self.df
- result = df.query('A>0')
- assert_frame_equal(result, self.expected1)
- result = df.eval('A+1')
- assert_series_equal(result, self.expected2, check_names=False)
- def test_query_None(self):
- df = self.df
- result = df.query('A>0', engine=None)
- assert_frame_equal(result, self.expected1)
- result = df.eval('A+1', engine=None)
- assert_series_equal(result, self.expected2, check_names=False)
- def test_query_python(self):
- df = self.df
- result = df.query('A>0', engine='python')
- assert_frame_equal(result, self.expected1)
- result = df.eval('A+1', engine='python')
- assert_series_equal(result, self.expected2, check_names=False)
- def test_query_numexpr(self):
- df = self.df
- if _NUMEXPR_INSTALLED:
- result = df.query('A>0', engine='numexpr')
- assert_frame_equal(result, self.expected1)
- result = df.eval('A+1', engine='numexpr')
- assert_series_equal(result, self.expected2, check_names=False)
- else:
- pytest.raises(ImportError,
- lambda: df.query('A>0', engine='numexpr'))
- pytest.raises(ImportError,
- lambda: df.eval('A+1', engine='numexpr'))
class TestDataFrameEval(TestData):
    """Arithmetic evaluation plus basic ``query``/``eval`` argument checks."""

    def test_ops(self):
        """Series-with-DataFrame arithmetic agrees across all spellings.

        The operator expression, the dunder method and the reversed dunder
        method must all produce the same frame.
        """
        # tst ops and reversed ops in evaluation
        # GH7198

        # smaller hits python, larger hits numexpr
        for n in [4, 4000]:

            df = DataFrame(1, index=range(n), columns=list('abcd'))
            df.iloc[0] = 2
            m = df.mean()

            for op_str, op, rop in [('+', '__add__', '__radd__'),
                                    ('-', '__sub__', '__rsub__'),
                                    ('*', '__mul__', '__rmul__'),
                                    ('/', '__truediv__', '__rtruediv__')]:

                # ``base``, ``m`` and ``df`` are referenced by name from the
                # strings passed to ``eval`` below.
                base = (DataFrame(np.tile(m.values, n)  # noqa
                                  .reshape(n, -1),
                                  columns=list('abcd')))

                expected = eval("base{op}df".format(op=op_str))

                # ops as strings
                result = eval("m{op}df".format(op=op_str))
                assert_frame_equal(result, expected)

                # The branches must key on the operator symbol (``op_str``);
                # ``op`` holds the dunder name and never matches a symbol.

                # these are commutative
                if op_str in ['+', '*']:
                    result = getattr(df, op)(m)
                    assert_frame_equal(result, expected)

                # these are not
                elif op_str in ['-', '/']:
                    result = getattr(df, rop)(m)
                    assert_frame_equal(result, expected)

        # GH7192
        df = DataFrame(dict(A=np.random.randn(25000)))
        df.iloc[0:5] = np.nan
        expected = (1 - np.isnan(df.iloc[0:25]))
        result = (1 - np.isnan(df)).iloc[0:25]
        assert_frame_equal(result, expected)

    def test_query_non_str(self):
        """A non-string ``expr`` raises ValueError."""
        # GH 11485
        df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'b']})

        msg = "expr must be a string to be evaluated"
        with pytest.raises(ValueError, match=msg):
            df.query(lambda x: x.B == "b")

        with pytest.raises(ValueError, match=msg):
            df.query(111)

    def test_query_empty_string(self):
        """An empty ``expr`` raises ValueError."""
        # GH 13139
        df = pd.DataFrame({'A': [1, 2, 3]})

        msg = "expr cannot be an empty string"
        with pytest.raises(ValueError, match=msg):
            df.query('')

    def test_eval_resolvers_as_list(self):
        """``resolvers`` may be given as a list of dicts."""
        # GH 14095
        df = DataFrame(np.random.randn(10, 2), columns=list('ab'))
        dict1 = {'a': 1}
        dict2 = {'b': 2}
        assert (df.eval('a + b', resolvers=[dict1, dict2]) ==
                dict1['a'] + dict2['b'])
        assert (pd.eval('a + b', resolvers=[dict1, dict2]) ==
                dict1['a'] + dict2['b'])
class TestDataFrameQueryWithMultiIndex(object):
    """``DataFrame.query`` on frames indexed by a MultiIndex."""

    def test_query_with_named_multiindex(self, parser, engine):
        """Named index levels are addressable by name in the query."""
        skip_if_no_pandas_parser(parser)
        colors = np.random.choice(['red', 'green'], size=10)
        foods = np.random.choice(['eggs', 'ham'], size=10)
        index = MultiIndex.from_arrays([colors, foods],
                                       names=['color', 'food'])
        df = DataFrame(np.random.randn(10, 2), index=index)
        ind = Series(df.index.get_level_values('color').values, index=index,
                     name='color')

        # (query, mirrored query, boolean mask selecting the expected rows)
        cases = [
            # equality
            ('color == "red"', '"red" == color', ind == 'red'),
            # inequality
            ('color != "red"', '"red" != color', ind != 'red'),
            # list equality (really just set membership)
            ('color == ["red"]', '["red"] == color', ind.isin(['red'])),
            ('color != ["red"]', '["red"] != color', ~ind.isin(['red'])),
            # in/not in ops
            ('["red"] in color', '"red" in color', ind.isin(['red'])),
            ('["red"] not in color', '"red" not in color',
             ~ind.isin(['red'])),
        ]
        for first, second, mask in cases:
            res1 = df.query(first, parser=parser, engine=engine)
            res2 = df.query(second, parser=parser, engine=engine)
            exp = df[mask]
            assert_frame_equal(res1, exp)
            assert_frame_equal(res2, exp)

    def test_query_with_unnamed_multiindex(self, parser, engine):
        """Unnamed levels are addressable as ``ilevel_<n>``."""
        skip_if_no_pandas_parser(parser)
        colors = np.random.choice(['red', 'green'], size=10)
        foods = np.random.choice(['eggs', 'ham'], size=10)
        index = MultiIndex.from_arrays([colors, foods])
        df = DataFrame(np.random.randn(10, 2), index=index)
        ind0 = Series(df.index.get_level_values(0).values, index=index)
        ind1 = Series(df.index.get_level_values(1).values, index=index)

        cases = [
            # ## LEVEL 0
            ('ilevel_0 == "red"', '"red" == ilevel_0', ind0 == 'red'),
            # inequality
            ('ilevel_0 != "red"', '"red" != ilevel_0', ind0 != 'red'),
            # list equality (really just set membership)
            ('ilevel_0 == ["red"]', '["red"] == ilevel_0',
             ind0.isin(['red'])),
            ('ilevel_0 != ["red"]', '["red"] != ilevel_0',
             ~ind0.isin(['red'])),
            # in/not in ops
            ('["red"] in ilevel_0', '"red" in ilevel_0', ind0.isin(['red'])),
            ('["red"] not in ilevel_0', '"red" not in ilevel_0',
             ~ind0.isin(['red'])),
            # ## LEVEL 1
            ('ilevel_1 == "eggs"', '"eggs" == ilevel_1', ind1 == 'eggs'),
            # inequality
            ('ilevel_1 != "eggs"', '"eggs" != ilevel_1', ind1 != 'eggs'),
            # list equality (really just set membership)
            ('ilevel_1 == ["eggs"]', '["eggs"] == ilevel_1',
             ind1.isin(['eggs'])),
            ('ilevel_1 != ["eggs"]', '["eggs"] != ilevel_1',
             ~ind1.isin(['eggs'])),
            # in/not in ops
            ('["eggs"] in ilevel_1', '"eggs" in ilevel_1',
             ind1.isin(['eggs'])),
            ('["eggs"] not in ilevel_1', '"eggs" not in ilevel_1',
             ~ind1.isin(['eggs'])),
        ]
        for first, second, mask in cases:
            res1 = df.query(first, parser=parser, engine=engine)
            res2 = df.query(second, parser=parser, engine=engine)
            exp = df[mask]
            assert_frame_equal(res1, exp)
            assert_frame_equal(res2, exp)

    def test_query_with_partially_named_multiindex(self, parser, engine):
        """Named and unnamed levels can be mixed in one MultiIndex."""
        skip_if_no_pandas_parser(parser)
        colors = np.random.choice(['red', 'green'], size=10)
        ratings = np.arange(10)
        index = MultiIndex.from_arrays([colors, ratings])
        index.names = [None, 'rating']
        df = DataFrame(np.random.randn(10, 2), index=index)

        rating = Series(df.index.get_level_values('rating').values,
                        index=index, name='rating')
        level0 = Series(df.index.get_level_values(0).values, index=index)

        cases = [
            ('rating == 1', rating == 1),
            ('rating != 1', rating != 1),
            ('ilevel_0 == "red"', level0 == "red"),
            ('ilevel_0 != "red"', level0 != "red"),
        ]
        for expr, mask in cases:
            res = df.query(expr, parser=parser, engine=engine)
            assert_frame_equal(res, df[mask])

    def test_query_multiindex_get_index_resolvers(self):
        """``_get_index_resolvers`` exposes index, columns and each level."""
        df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs'])
        resolvers = df._get_index_resolvers()

        def to_series(mi, level):
            # Level values as a Series re-indexed by the full MultiIndex.
            ser = mi.get_level_values(level).to_series()
            ser.index = mi
            return ser

        col_series = df.columns.to_series()
        expected = {
            'index': df.index,
            'columns': col_series,
            'spam': to_series(df.index, 'spam'),
            'eggs': to_series(df.index, 'eggs'),
            'C0': col_series,
        }
        for key, value in resolvers.items():
            if isinstance(value, Index):
                assert value.is_(expected[key])
                continue
            if not isinstance(value, Series):
                raise AssertionError("object must be a Series or Index")
            assert_series_equal(value, expected[key])

    @pytest.mark.filterwarnings("ignore::FutureWarning")
    def test_raise_on_panel_with_multiindex(self, parser, engine):
        """Evaluating on a Panel with MultiIndex items is not implemented."""
        # ``p`` must keep this name: the expression string refers to it.
        p = tm.makePanel(7)
        p.items = tm.makeCustomIndex(len(p.items), nlevels=2)
        with pytest.raises(NotImplementedError):
            pd.eval('p + 1', parser=parser, engine=engine)
@td.skip_if_no_ne
class TestDataFrameQueryNumExprPandas(object):
    """``DataFrame.query`` tests with engine='numexpr' and parser='pandas'.

    The engine and parser are stored on the class by ``setup_class`` and read
    by each test through ``self.engine`` / ``self.parser``.
    """

    @classmethod
    def setup_class(cls):
        cls.engine = 'numexpr'
        cls.parser = 'pandas'

    @classmethod
    def teardown_class(cls):
        del cls.engine, cls.parser

    def test_date_query_with_attribute_access(self):
        """Chained date comparison using ``@df.<column>`` references."""
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)
        # ``df`` is referenced by name from the query string.
        df = DataFrame(np.random.randn(5, 3))
        df['dates1'] = date_range('1/1/2012', periods=5)
        df['dates2'] = date_range('1/1/2013', periods=5)
        df['dates3'] = date_range('1/1/2014', periods=5)
        res = df.query('@df.dates1 < 20130101 < @df.dates3', engine=engine,
                       parser=parser)
        expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(res, expec)

    def test_date_query_no_attribute_access(self):
        """Chained date comparison using bare column names."""
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randn(5, 3))
        df['dates1'] = date_range('1/1/2012', periods=5)
        df['dates2'] = date_range('1/1/2013', periods=5)
        df['dates3'] = date_range('1/1/2014', periods=5)
        res = df.query('dates1 < 20130101 < dates3', engine=engine,
                       parser=parser)
        expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(res, expec)

    def test_date_query_with_NaT(self):
        """Chained date comparison when both date columns contain NaT."""
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates2'] = date_range('1/1/2013', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
        df.loc[np.random.rand(n) > 0.5, 'dates3'] = pd.NaT
        res = df.query('dates1 < 20130101 < dates3', engine=engine,
                       parser=parser)
        expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(res, expec)

    def test_date_index_query(self):
        """``index`` in the query refers to a datetime index."""
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        df.set_index('dates1', inplace=True, drop=True)
        res = df.query('index < 20130101 < dates3', engine=engine,
                       parser=parser)
        expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(res, expec)

    def test_date_index_query_with_NaT(self):
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        # NOTE(review): iloc[0, 0] addresses the first random-valued column,
        # not 'dates1', so the index built below holds no NaT -- confirm the
        # intended target.
        df.iloc[0, 0] = pd.NaT
        df.set_index('dates1', inplace=True, drop=True)
        res = df.query('index < 20130101 < dates3', engine=engine,
                       parser=parser)
        expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(res, expec)

    def test_date_index_query_with_NaT_duplicates(self):
        """The index name resolves even when the index holds repeated NaT."""
        engine, parser = self.engine, self.parser
        n = 10
        d = {}
        d['dates1'] = date_range('1/1/2012', periods=n)
        d['dates3'] = date_range('1/1/2014', periods=n)
        df = DataFrame(d)
        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
        df.set_index('dates1', inplace=True, drop=True)
        res = df.query('dates1 < 20130101 < dates3', engine=engine,
                       parser=parser)
        expec = df[(df.index.to_series() < '20130101') &
                   ('20130101' < df.dates3)]
        assert_frame_equal(res, expec)

    def test_date_query_with_non_date(self):
        """Equality with a non-date column works; ordering raises TypeError."""
        engine, parser = self.engine, self.parser

        n = 10
        df = DataFrame({'dates': date_range('1/1/2012', periods=n),
                        'nondate': np.arange(n)})

        result = df.query('dates == nondate', parser=parser, engine=engine)
        assert len(result) == 0

        result = df.query('dates != nondate', parser=parser, engine=engine)
        assert_frame_equal(result, df)

        for op in ['<', '>', '<=', '>=']:
            with pytest.raises(TypeError):
                df.query('dates %s nondate' % op, parser=parser, engine=engine)

    def test_query_syntax_error(self):
        """An expression that is not valid syntax raises SyntaxError."""
        engine, parser = self.engine, self.parser
        df = DataFrame({"i": lrange(10), "+": lrange(3, 13),
                        "r": lrange(4, 14)})
        with pytest.raises(SyntaxError):
            df.query('i - +', engine=engine, parser=parser)

    def test_query_scope(self):
        """Bare names resolve to columns; ``@name`` resolves to locals."""
        from pandas.core.computation.ops import UndefinedVariableError
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)

        df = DataFrame(np.random.randn(20, 2), columns=list('ab'))

        # Referenced as ``@a`` below; no local ``c`` may exist in this frame.
        a, b = 1, 2  # noqa
        res = df.query('a > b', engine=engine, parser=parser)
        expected = df[df.a > df.b]
        assert_frame_equal(res, expected)

        res = df.query('@a > b', engine=engine, parser=parser)
        expected = df[a > df.b]
        assert_frame_equal(res, expected)

        # no local variable c
        with pytest.raises(UndefinedVariableError):
            df.query('@a > b > @c', engine=engine, parser=parser)

        # no column named 'c'
        with pytest.raises(UndefinedVariableError):
            df.query('@a > b > c', engine=engine, parser=parser)

    def test_query_doesnt_pickup_local(self):
        from pandas.core.computation.ops import UndefinedVariableError

        engine, parser = self.engine, self.parser
        n = m = 10
        df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))

        # we don't pick up the local 'sin'
        with pytest.raises(UndefinedVariableError):
            df.query('sin > 5', engine=engine, parser=parser)

    def test_query_builtin(self):
        """An index named ``sin`` raises NumExprClobberingError."""
        from pandas.core.computation.engines import NumExprClobberingError
        engine, parser = self.engine, self.parser

        n = m = 10
        df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc'))

        df.index.name = 'sin'
        msg = 'Variables in expression.+'
        with pytest.raises(NumExprClobberingError, match=msg):
            df.query('sin > 5', engine=engine, parser=parser)

    def test_query(self):
        """Simple comparisons and arithmetic between columns."""
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c'])

        assert_frame_equal(df.query('a < b', engine=engine, parser=parser),
                           df[df.a < df.b])
        assert_frame_equal(df.query('a + b > b * c', engine=engine,
                                    parser=parser),
                           df[df.a + df.b > df.b * df.c])

    def test_query_index_with_name(self):
        """A named index is addressable by its name."""
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randint(10, size=(10, 3)),
                       index=Index(range(10), name='blob'),
                       columns=['a', 'b', 'c'])
        res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser)
        expec = df[(df.index < 5) & (df.a < df.b)]
        assert_frame_equal(res, expec)

        res = df.query('blob < b', engine=engine, parser=parser)
        expec = df[df.index < df.b]

        assert_frame_equal(res, expec)

    def test_query_index_without_name(self):
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randint(10, size=(10, 3)),
                       index=range(10), columns=['a', 'b', 'c'])

        # "index" should refer to the index
        res = df.query('index < b', engine=engine, parser=parser)
        expec = df[df.index < df.b]
        assert_frame_equal(res, expec)

        # test against a scalar
        res = df.query('index < 5', engine=engine, parser=parser)
        expec = df[df.index < 5]
        assert_frame_equal(res, expec)

    def test_nested_scope(self):
        """Local frames can be used in ``query`` (via ``@``) and ``pd.eval``."""
        engine = self.engine
        parser = self.parser

        skip_if_no_pandas_parser(parser)

        # ``df`` and ``df2`` are referenced by name from the expressions.
        df = DataFrame(np.random.randn(5, 3))
        df2 = DataFrame(np.random.randn(5, 3))
        expected = df[(df > 0) & (df2 > 0)]

        result = df.query('(@df > 0) & (@df2 > 0)', engine=engine,
                          parser=parser)
        assert_frame_equal(result, expected)

        result = pd.eval('df[df > 0 and df2 > 0]', engine=engine,
                         parser=parser)
        assert_frame_equal(result, expected)

        result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]',
                         engine=engine, parser=parser)
        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
        assert_frame_equal(result, expected)

        result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser)
        expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)
        assert_frame_equal(result, expected)

    def test_nested_raises_on_local_self_reference(self):
        from pandas.core.computation.ops import UndefinedVariableError

        df = DataFrame(np.random.randn(5, 3))

        # can't reference ourself b/c we're a local so @ is necessary
        with pytest.raises(UndefinedVariableError):
            df.query('df > 0', engine=self.engine, parser=self.parser)

    def test_local_syntax(self):
        """``@b`` is the local ``b``; bare ``b`` is the column."""
        skip_if_no_pandas_parser(self.parser)

        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randn(100, 10), columns=list('abcdefghij'))
        b = 1
        expect = df[df.a < b]
        result = df.query('a < @b', engine=engine, parser=parser)
        assert_frame_equal(result, expect)

        expect = df[df.a < df.b]
        result = df.query('a < b', engine=engine, parser=parser)
        assert_frame_equal(result, expect)

    def test_chained_cmp_and_in(self):
        """Chained comparison combined with chained ``not in``."""
        skip_if_no_pandas_parser(self.parser)
        engine, parser = self.engine, self.parser
        cols = list('abc')
        df = DataFrame(np.random.randn(100, len(cols)), columns=cols)
        res = df.query('a < b < c and a not in b not in c', engine=engine,
                       parser=parser)
        ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b)  # noqa
        expec = df[ind]
        assert_frame_equal(res, expec)

    def test_local_variable_with_in(self):
        """``in`` with a column expression and with an ``@`` local."""
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)
        a = Series(np.random.randint(3, size=15), name='a')
        b = Series(np.random.randint(10, size=15), name='b')
        df = DataFrame({'a': a, 'b': b})

        expected = df.loc[(df.b - 1).isin(a)]
        result = df.query('b - 1 in a', engine=engine, parser=parser)
        assert_frame_equal(expected, result)

        # Rebind the local so ``@b`` differs from the column ``b``.
        b = Series(np.random.randint(10, size=15), name='b')
        expected = df.loc[(b - 1).isin(a)]
        result = df.query('@b - 1 in a', engine=engine, parser=parser)
        assert_frame_equal(expected, result)

    def test_at_inside_string(self):
        """An ``@`` inside a string literal is not a local reference."""
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)
        c = 1  # noqa
        df = DataFrame({'a': ['a', 'a', 'b', 'b', '@c', '@c']})
        result = df.query('a == "@c"', engine=engine, parser=parser)
        expected = df[df.a == "@c"]
        assert_frame_equal(result, expected)

    def test_query_undefined_local(self):
        """Referencing a missing local with ``@`` raises with a clear message."""
        from pandas.core.computation.ops import UndefinedVariableError
        engine, parser = self.engine, self.parser
        skip_if_no_pandas_parser(parser)

        df = DataFrame(np.random.rand(10, 2), columns=list('ab'))
        msg = "local variable 'c' is not defined"

        with pytest.raises(UndefinedVariableError, match=msg):
            df.query('a == @c', engine=engine, parser=parser)

    def test_index_resolvers_come_after_columns_with_the_same_name(self):
        """A column wins over the index when both share a name."""
        a = np.r_[20:101:20]

        df = DataFrame({'index': a, 'b': np.random.randn(a.size)})
        df.index.name = 'index'
        result = df.query('index > 5', engine=self.engine, parser=self.parser)
        expected = df[df['index'] > 5]
        assert_frame_equal(result, expected)

        df = DataFrame({'index': a,
                        'b': np.random.randn(a.size)})
        result = df.query('ilevel_0 > 5', engine=self.engine,
                          parser=self.parser)
        expected = df.loc[df.index[df.index > 5]]
        assert_frame_equal(result, expected)

        df = DataFrame({'a': a, 'b': np.random.randn(a.size)})
        df.index.name = 'a'
        result = df.query('a > 5', engine=self.engine, parser=self.parser)
        expected = df[df.a > 5]
        assert_frame_equal(result, expected)

        result = df.query('index > 5', engine=self.engine, parser=self.parser)
        expected = df.loc[df.index[df.index > 5]]
        assert_frame_equal(result, expected)

    def test_inf(self):
        """``inf`` in a query compares against infinite column values."""
        n = 10
        df = DataFrame({'a': np.random.rand(n), 'b': np.random.rand(n)})
        # Put the infinities into column 'a', the column the queries compare;
        # indexing with label 0 would add a new column and leave 'a' finite.
        df.loc[::2, 'a'] = np.inf
        for op, f in [('==', operator.eq), ('!=', operator.ne)]:
            q = 'a %s inf' % op
            expected = df[f(df.a, np.inf)]
            result = df.query(q, engine=self.engine, parser=self.parser)
            assert_frame_equal(result, expected)
@td.skip_if_no_ne
class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):
    """Parent tests re-run with engine='numexpr' and parser='python'.

    Tests whose parent version uses a chained comparison in the query are
    overridden here.
    """

    @classmethod
    def setup_class(cls):
        super(TestDataFrameQueryNumExprPython, cls).setup_class()
        cls.engine, cls.parser = 'numexpr', 'python'
        cls.frame = TestData().frame

    def test_date_query_no_attribute_access(self):
        engine, parser = self.engine, self.parser
        df = DataFrame(np.random.randn(5, 3))
        for column, start in [('dates1', '1/1/2012'),
                              ('dates2', '1/1/2013'),
                              ('dates3', '1/1/2014')]:
            df[column] = date_range(start, periods=5)
        result = df.query('(dates1 < 20130101) & (20130101 < dates3)',
                          engine=engine, parser=parser)
        expected = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(result, expected)

    def test_date_query_with_NaT(self):
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        for column, start in [('dates1', '1/1/2012'),
                              ('dates2', '1/1/2013'),
                              ('dates3', '1/1/2014')]:
            df[column] = date_range(start, periods=n)
        # Blank out a random subset of each outer date column.
        for column in ['dates1', 'dates3']:
            df.loc[np.random.rand(n) > 0.5, column] = pd.NaT
        result = df.query('(dates1 < 20130101) & (20130101 < dates3)',
                          engine=engine, parser=parser)
        expected = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(result, expected)

    def test_date_index_query(self):
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        df.set_index('dates1', inplace=True, drop=True)
        result = df.query('(index < 20130101) & (20130101 < dates3)',
                          engine=engine, parser=parser)
        expected = df[(df.index < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(result, expected)

    def test_date_index_query_with_NaT(self):
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        df.iloc[0, 0] = pd.NaT
        df.set_index('dates1', inplace=True, drop=True)
        result = df.query('(index < 20130101) & (20130101 < dates3)',
                          engine=engine, parser=parser)
        expected = df[(df.index < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(result, expected)

    def test_date_index_query_with_NaT_duplicates(self):
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
        df.set_index('dates1', inplace=True, drop=True)
        # The chained comparison is expected to be rejected by this parser.
        with pytest.raises(NotImplementedError):
            df.query('index < 20130101 < dates3', engine=engine, parser=parser)

    def test_nested_scope(self):
        from pandas.core.computation.ops import UndefinedVariableError
        engine, parser = self.engine, self.parser

        # smoke test
        # ``x``, ``df`` and ``df2`` are referenced by name from the
        # expression strings, so they must keep these names.
        x = 1  # noqa
        smoke = pd.eval('x + 1', engine=engine, parser=parser)
        assert smoke == 2

        df = DataFrame(np.random.randn(5, 3))
        df2 = DataFrame(np.random.randn(5, 3))

        # don't have the pandas parser
        with pytest.raises(SyntaxError):
            df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser)

        with pytest.raises(UndefinedVariableError):
            df.query('(df>0) & (df2>0)', engine=engine, parser=parser)

        expected = df[(df > 0) & (df2 > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine,
                         parser=parser)
        assert_frame_equal(expected, result)

        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
        result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]',
                         engine=engine, parser=parser)
        assert_frame_equal(expected, result)
class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas):
    """Parent tests re-run with engine='python' and parser='pandas'."""

    @classmethod
    def setup_class(cls):
        super(TestDataFrameQueryPythonPandas, cls).setup_class()
        cls.engine, cls.parser = 'python', 'pandas'
        cls.frame = TestData().frame

    def test_query_builtin(self):
        """An index named ``sin`` is queried like any other index name."""
        engine, parser = self.engine, self.parser

        size = 10
        values = np.random.randint(size, size=(size, 3))
        df = DataFrame(values, columns=list('abc'))
        df.index.name = 'sin'

        expected = df[df.index > 5]
        result = df.query('sin > 5', engine=engine, parser=parser)
        assert_frame_equal(expected, result)
class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython):
    """Parent tests re-run with engine='python' and parser='python'."""

    @classmethod
    def setup_class(cls):
        super(TestDataFrameQueryPythonPython, cls).setup_class()
        cls.engine = 'python'
        cls.parser = 'python'
        cls.frame = TestData().frame

    def test_query_builtin(self):
        """An index named ``sin`` is queried like any other index name."""
        engine, parser = self.engine, self.parser

        size = 10
        values = np.random.randint(size, size=(size, 3))
        df = DataFrame(values, columns=list('abc'))
        df.index.name = 'sin'

        expected = df[df.index > 5]
        result = df.query('sin > 5', engine=engine, parser=parser)
        assert_frame_equal(expected, result)
class TestDataFrameQueryStrings(object):
    """Checks for ``DataFrame.query`` on frames that hold string data.

    Every test takes ``parser`` and ``engine`` arguments and forwards them
    to ``DataFrame.query``; several tests branch on whether the parser is
    ``'pandas'``.
    """

    def test_str_query_method(self, parser, engine):
        """Compare a string column with a scalar string literal.

        With the ``'pandas'`` parser the query result must match boolean
        indexing. With any other parser each ``==``/``!=`` expression is
        expected to raise ``NotImplementedError``.
        """
        df = DataFrame(np.random.randn(10, 1), columns=['b'])
        df['strings'] = Series(list('aabbccddee'))
        expect = df[df.strings == 'a']

        if parser != 'pandas':
            col = 'strings'
            lit = '"a"'
            # Put the literal on either side of both comparison operators.
            for left, right in ((col, lit), (lit, col)):
                for op in ('==', '!='):
                    ex = '{lhs} {op} {rhs}'.format(lhs=left, op=op, rhs=right)
                    with pytest.raises(NotImplementedError):
                        df.query(ex, engine=engine, parser=parser,
                                 local_dict={'strings': df.strings})
        else:
            res = df.query('"a" == strings', engine=engine, parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('strings == "a"', engine=engine, parser=parser)
            assert_frame_equal(res, expect)
            assert_frame_equal(res, df[df.strings.isin(['a'])])

            expect = df[df.strings != 'a']
            res = df.query('strings != "a"', engine=engine, parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('"a" != strings', engine=engine, parser=parser)
            assert_frame_equal(res, expect)
            assert_frame_equal(res, df[~df.strings.isin(['a'])])

    def test_str_list_query_method(self, parser, engine):
        """Compare a string column with a list literal of strings.

        With the ``'pandas'`` parser, ``==`` against a list must match
        ``isin`` and ``!=`` must match its negation. With any other parser
        each expression is expected to raise ``NotImplementedError``.
        """
        df = DataFrame(np.random.randn(10, 1), columns=['b'])
        df['strings'] = Series(list('aabbccddee'))
        expect = df[df.strings.isin(['a', 'b'])]

        if parser != 'pandas':
            col = 'strings'
            lit = '["a", "b"]'
            # Put the literal on either side of both comparison operators.
            for left, right in ((col, lit), (lit, col)):
                for op in ('==', '!='):
                    ex = '{lhs} {op} {rhs}'.format(lhs=left, op=op, rhs=right)
                    with pytest.raises(NotImplementedError):
                        df.query(ex, engine=engine, parser=parser)
        else:
            res = df.query('strings == ["a", "b"]', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('["a", "b"] == strings', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)

            expect = df[~df.strings.isin(['a', 'b'])]

            res = df.query('strings != ["a", "b"]', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)

            res = df.query('["a", "b"] != strings', engine=engine,
                           parser=parser)
            assert_frame_equal(res, expect)

    def test_query_with_string_columns(self, parser, engine):
        """Use ``in`` between two string columns.

        With the ``'pandas'`` parser ``a in b`` must match
        ``df.a.isin(df.b)``, alone and combined with a numeric comparison.
        Other parsers are expected to raise ``NotImplementedError``.
        """
        df = DataFrame({'a': list('aaaabbbbcccc'),
                        'b': list('aabbccddeeff'),
                        'c': np.random.randint(5, size=12),
                        'd': np.random.randint(9, size=12)})
        if parser == 'pandas':
            res = df.query('a in b', parser=parser, engine=engine)
            expec = df[df.a.isin(df.b)]
            assert_frame_equal(res, expec)

            res = df.query('a in b and c < d', parser=parser, engine=engine)
            expec = df[df.a.isin(df.b) & (df.c < df.d)]
            assert_frame_equal(res, expec)
        else:
            with pytest.raises(NotImplementedError):
                df.query('a in b', parser=parser, engine=engine)

            with pytest.raises(NotImplementedError):
                df.query('a in b and c < d', parser=parser, engine=engine)

    def test_object_array_eq_ne(self, parser, engine):
        """``==`` and ``!=`` between two string columns match boolean indexing."""
        df = DataFrame({'a': list('aaaabbbbcccc'),
                        'b': list('aabbccddeeff'),
                        'c': np.random.randint(5, size=12),
                        'd': np.random.randint(9, size=12)})
        res = df.query('a == b', parser=parser, engine=engine)
        exp = df[df.a == df.b]
        assert_frame_equal(res, exp)

        res = df.query('a != b', parser=parser, engine=engine)
        exp = df[df.a != df.b]
        assert_frame_equal(res, exp)

    def test_query_with_nested_strings(self, parser, engine):
        """Query for a string value that itself contains double quotes."""
        skip_if_no_pandas_parser(parser)
        # Columns are separated by two spaces. The ``\s{2,}`` separator
        # passed to read_csv needs that gap so the single spaces inside
        # the event and timestamp fields stay within one column.
        raw = (
            'id  event  timestamp\n'
            '1  "page 1 load"  1/1/2014 0:00:01\n'
            '1  "page 1 exit"  1/1/2014 0:00:31\n'
            '2  "page 2 load"  1/1/2014 0:01:01\n'
            '2  "page 2 exit"  1/1/2014 0:01:31\n'
            '3  "page 3 load"  1/1/2014 0:02:01\n'
            '3  "page 3 exit"  1/1/2014 0:02:31\n'
            '4  "page 1 load"  2/1/2014 1:00:01\n'
            '4  "page 1 exit"  2/1/2014 1:00:31\n'
            '5  "page 2 load"  2/1/2014 1:01:01\n'
            '5  "page 2 exit"  2/1/2014 1:01:31\n'
            '6  "page 3 load"  2/1/2014 1:02:01\n'
            '6  "page 3 exit"  2/1/2014 1:02:31\n'
        )
        df = pd.read_csv(StringIO(raw), sep=r'\s{2,}', engine='python',
                         parse_dates=['timestamp'])
        expected = df[df.event == '"page 1 load"']
        res = df.query("""'"page 1 load"' in event""", parser=parser,
                       engine=engine)
        assert_frame_equal(expected, res)

    def test_query_with_nested_special_character(self, parser, engine):
        """A string literal containing ``&`` is compared as plain text."""
        skip_if_no_pandas_parser(parser)
        df = DataFrame({'a': ['a', 'b', 'test & test'],
                        'b': [1, 2, 3]})
        res = df.query('a == "test & test"', parser=parser, engine=engine)
        expec = df[df.a == 'test & test']
        assert_frame_equal(res, expec)

    def test_query_lex_compare_strings(self, parser, engine):
        """Ordering comparisons of a string column against ``"d"``.

        Each of ``<``, ``>``, ``<=`` and ``>=`` in a query must give the
        same rows as the matching ``operator`` function applied to the
        column.
        """
        import operator as opr

        a = Series(np.random.choice(list('abcde'), 20))
        b = Series(np.arange(a.size))
        df = DataFrame({'X': a, 'Y': b})

        ops = {'<': opr.lt, '>': opr.gt, '<=': opr.le, '>=': opr.ge}

        for op, func in ops.items():
            res = df.query('X %s "d"' % op, engine=engine, parser=parser)
            expected = df[func(df.X, 'd')]
            assert_frame_equal(res, expected)

    def test_query_single_element_booleans(self, parser, engine):
        """``&`` between boolean columns works on a single-row frame."""
        columns = 'bid', 'bidsize', 'ask', 'asksize'
        data = np.random.randint(2, size=(1, len(columns))).astype(bool)
        df = DataFrame(data, columns=columns)
        res = df.query('bid & ask', engine=engine, parser=parser)
        expected = df[df.bid & df.ask]
        assert_frame_equal(res, expected)

    def test_query_string_scalar_variable(self, parser, engine):
        """Compare a string column with a local variable via ``@name``."""
        skip_if_no_pandas_parser(parser)
        df = pd.DataFrame({'Symbol': ['BUD US', 'BUD US', 'IBM US', 'IBM US'],
                           'Price': [109.70, 109.72, 183.30, 183.35]})
        e = df[df.Symbol == 'BUD US']
        # Only referenced from inside the query string as ``@symb``, so
        # linters see it as unused.
        symb = 'BUD US'  # noqa
        r = df.query('Symbol == @symb', parser=parser, engine=engine)
        assert_frame_equal(e, r)
class TestDataFrameEvalWithFrame(object):
    """Checks for ``DataFrame.eval`` run against a per-test random frame."""

    def setup_method(self, method):
        """Create ``self.frame``: 10 rows of random values in columns a, b, c."""
        values = np.random.randn(10, 3)
        self.frame = DataFrame(values, columns=list('abc'))

    def teardown_method(self, method):
        """Remove the frame created in ``setup_method``."""
        del self.frame

    def test_simple_expr(self, parser, engine):
        """``'a + b'`` evaluates to the sum of the two columns."""
        frame = self.frame
        res = frame.eval('a + b', engine=engine, parser=parser)
        expect = frame.a + frame.b
        assert_series_equal(res, expect)

    def test_bool_arith_expr(self, parser, engine):
        """A boolean-masked column can be used inside an arithmetic expression."""
        frame = self.frame
        res = frame.eval('a[a < 1] + b', engine=engine, parser=parser)
        expect = frame.a[frame.a < 1] + frame.b
        assert_series_equal(res, expect)

    @pytest.mark.parametrize('op', ['+', '-', '*', '/'])
    def test_invalid_type_for_operator_raises(self, parser, engine, op):
        """Arithmetic between an integer and a string column raises TypeError."""
        df = DataFrame({'a': [1, 2], 'b': ['c', 'd']})
        expr = 'a {0} b'.format(op)
        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"

        with pytest.raises(TypeError, match=msg):
            df.eval(expr, engine=engine, parser=parser)
|