# -*- coding: utf-8 -*-
# pylint: disable-msg=W0612,E1101
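"""Round-trip tests for pandas' JSON IO: ``DataFrame``/``Series``
``to_json`` and ``read_json`` across orients, dtypes, dates, and
assorted edge cases."""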
from datetime import timedelta
import json
import os

import numpy as np
import pytest

from pandas.compat import (
    OrderedDict, StringIO, is_platform_32bit, lrange, range)
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame, DatetimeIndex, Series, Timestamp, compat, read_json)
import pandas.util.testing as tm
from pandas.util.testing import (
    assert_almost_equal, assert_frame_equal, assert_index_equal,
    assert_series_equal, ensure_clean, network)

_seriesd = tm.getSeriesData()
_tsd = tm.getTimeSeriesData()

_frame = DataFrame(_seriesd)
_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])
_intframe = DataFrame({k: v.astype(np.int64)
                       for k, v in compat.iteritems(_seriesd)})

_tsframe = DataFrame(_tsd)

_cat_frame = _frame.copy()
cat = ['bah'] * 5 + ['bar'] * 5 + ['baz'] * \
    5 + ['foo'] * (len(_cat_frame) - 15)
_cat_frame.index = pd.CategoricalIndex(cat, name='E')
_cat_frame['E'] = list(reversed(cat))
_cat_frame['sort'] = np.arange(len(_cat_frame), dtype='int64')

_mixed_frame = _frame.copy()


class TestPandasContainer(object):

    @pytest.fixture(scope="function", autouse=True)
    def setup(self, datapath):
        self.dirpath = datapath("io", "json", "data")

        self.ts = tm.makeTimeSeries()
        self.ts.name = 'ts'

        self.series = tm.makeStringSeries()
        self.series.name = 'series'

        self.objSeries = tm.makeObjectSeries()
        self.objSeries.name = 'objects'

        self.empty_series = Series([], index=[])
        self.empty_frame = DataFrame({})

        self.frame = _frame.copy()
        self.frame2 = _frame2.copy()
        self.intframe = _intframe.copy()
        self.tsframe = _tsframe.copy()
        self.mixed_frame = _mixed_frame.copy()
        self.categorical = _cat_frame.copy()

        yield

        del self.dirpath
        del self.ts
        del self.series
        del self.objSeries
        del self.empty_series
        del self.empty_frame
        del self.frame
        del self.frame2
        del self.intframe
        del self.tsframe
        del self.mixed_frame

    def test_frame_double_encoded_labels(self):
        df = DataFrame([['a', 'b'], ['c', 'd']],
                       index=['index " 1', 'index / 2'],
                       columns=['a \\ b', 'y / z'])

        assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                         orient='split'))
        assert_frame_equal(df, read_json(df.to_json(orient='columns'),
                                         orient='columns'))
        assert_frame_equal(df, read_json(df.to_json(orient='index'),
                                         orient='index'))
        df_unser = read_json(df.to_json(orient='records'), orient='records')
        assert_index_equal(df.columns, df_unser.columns)
        tm.assert_numpy_array_equal(df.values, df_unser.values)

    def test_frame_non_unique_index(self):
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1],
                       columns=['x', 'y'])

        msg = "DataFrame index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient='index')
        msg = "DataFrame index must be unique for orient='columns'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient='columns')

        assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                         orient='split'))
        unser = read_json(df.to_json(orient='records'), orient='records')
        tm.assert_index_equal(df.columns, unser.columns)
        tm.assert_almost_equal(df.values, unser.values)
        unser = read_json(df.to_json(orient='values'), orient='values')
        tm.assert_numpy_array_equal(df.values, unser.values)

    def test_frame_non_unique_columns(self):
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2],
                       columns=['x', 'x'])

        msg = "DataFrame columns must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient='index')
        msg = "DataFrame columns must be unique for orient='columns'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient='columns')
        msg = "DataFrame columns must be unique for orient='records'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient='records')

        assert_frame_equal(df, read_json(df.to_json(orient='split'),
                                         orient='split', dtype=False))
        unser = read_json(df.to_json(orient='values'), orient='values')
        tm.assert_numpy_array_equal(df.values, unser.values)

        # GH4377; duplicate columns not processing correctly
        df = DataFrame([['a', 'b'], ['c', 'd']], index=[
            1, 2], columns=['x', 'y'])
        result = read_json(df.to_json(orient='split'), orient='split')
        assert_frame_equal(result, df)

        def _check(df):
            result = read_json(df.to_json(orient='split'), orient='split',
                               convert_dates=['x'])
            assert_frame_equal(result, df)

        for o in [[['a', 'b'], ['c', 'd']],
                  [[1.5, 2.5], [3.5, 4.5]],
                  [[1, 2.5], [3, 4.5]],
                  [[Timestamp('20130101'), 3.5],
                   [Timestamp('20130102'), 4.5]]]:
            _check(DataFrame(o, index=[1, 2], columns=['x', 'x']))

    def test_frame_from_json_to_json(self):
        # round-trip ``df`` through to_json/read_json for a single orient
        # and compare against the original
        def _check_orient(df, orient, dtype=None, numpy=False,
                          convert_axes=True, check_dtype=True, raise_ok=None,
                          sort=None, check_index_type=True,
                          check_column_type=True, check_numpy_dtype=False):
            if sort is not None:
                df = df.sort_values(sort)
            else:
                df = df.sort_index()

            # if we are not unique, then check that we are raising ValueError
            # for the appropriate orients
            if not df.index.is_unique and orient in ['index', 'columns']:
                msg = ("DataFrame index must be unique for orient='{}'"
                       .format(orient))
                with pytest.raises(ValueError, match=msg):
                    df.to_json(orient=orient)
                return
            if (not df.columns.is_unique and
                    orient in ['index', 'columns', 'records']):
                # TODO: not executed. fix this.
                with pytest.raises(ValueError, match='ksjkajksfjksjfkjs'):
                    df.to_json(orient=orient)
                return

            dfjson = df.to_json(orient=orient)

            try:
                unser = read_json(dfjson, orient=orient, dtype=dtype,
                                  numpy=numpy, convert_axes=convert_axes)
            except Exception as detail:
                if raise_ok is not None:
                    if isinstance(detail, raise_ok):
                        return
                raise

            if sort is not None and sort in unser.columns:
                unser = unser.sort_values(sort)
            else:
                unser = unser.sort_index()

            if dtype is False:
                check_dtype = False

            if not convert_axes and df.index.dtype.type == np.datetime64:
                unser.index = DatetimeIndex(
                    unser.index.values.astype('i8') * 1e6)

            if orient == "records":
                # index is not captured in this orientation
                tm.assert_almost_equal(df.values, unser.values,
                                       check_dtype=check_numpy_dtype)
                tm.assert_index_equal(df.columns, unser.columns,
                                      exact=check_column_type)
            elif orient == "values":
                # index and cols are not captured in this orientation
                if numpy is True and df.shape == (0, 0):
                    assert unser.shape[0] == 0
                else:
                    tm.assert_almost_equal(df.values, unser.values,
                                           check_dtype=check_numpy_dtype)
            elif orient == "split":
                # index and col labels might not be strings
                unser.index = [str(i) for i in unser.index]
                unser.columns = [str(i) for i in unser.columns]

                if sort is None:
                    unser = unser.sort_index()
                tm.assert_almost_equal(df.values, unser.values,
                                       check_dtype=check_numpy_dtype)
            else:
                if convert_axes:
                    tm.assert_frame_equal(df, unser, check_dtype=check_dtype,
                                          check_index_type=check_index_type,
                                          check_column_type=check_column_type)
                else:
                    tm.assert_frame_equal(df, unser,
                                          check_less_precise=False,
                                          check_dtype=check_dtype)

        def _check_all_orients(df, dtype=None, convert_axes=True,
                               raise_ok=None, sort=None,
                               check_index_type=True,
                               check_column_type=True):

            # numpy=False
            if convert_axes:
                _check_orient(df, "columns", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "records", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "split", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "index", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "values", dtype=dtype, sort=sort,
                              check_index_type=False, check_column_type=False)

            _check_orient(df, "columns", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "records", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "split", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "index", dtype=dtype,
                          convert_axes=False, sort=sort)
            _check_orient(df, "values", dtype=dtype,
                          convert_axes=False, sort=sort)

            # numpy=True and raise_ok might be not None, so ignore the error
            if convert_axes:
                _check_orient(df, "columns", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "records", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "split", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "index", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)
                _check_orient(df, "values", dtype=dtype, numpy=True,
                              raise_ok=raise_ok, sort=sort,
                              check_index_type=False, check_column_type=False)

            _check_orient(df, "columns", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "records", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "split", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "index", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)
            _check_orient(df, "values", dtype=dtype, numpy=True,
                          convert_axes=False, raise_ok=raise_ok, sort=sort)

        # basic
        _check_all_orients(self.frame)
        assert self.frame.to_json() == self.frame.to_json(orient="columns")

        _check_all_orients(self.intframe, dtype=self.intframe.values.dtype)
        _check_all_orients(self.intframe, dtype=False)

        # big one
        # index and columns are strings as all unserialised JSON object keys
        # are assumed to be strings
        biggie = DataFrame(np.zeros((200, 4)),
                           columns=[str(i) for i in range(4)],
                           index=[str(i) for i in range(200)])
        _check_all_orients(biggie, dtype=False, convert_axes=False)

        # dtypes
        _check_all_orients(DataFrame(biggie, dtype=np.float64),
                           dtype=np.float64, convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int,
                           convert_axes=False)
        _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3',
                           convert_axes=False, raise_ok=ValueError)

        # categorical
        _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError)

        # empty
        _check_all_orients(self.empty_frame, check_index_type=False,
                           check_column_type=False)

        # time series data
        _check_all_orients(self.tsframe)

        # mixed data
        index = pd.Index(['a', 'b', 'c', 'd', 'e'])
        data = {'A': [0., 1., 2., 3., 4.],
                'B': [0., 1., 0., 1., 0.],
                'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
                'D': [True, False, True, False, True]}
        df = DataFrame(data=data, index=index)
        _check_orient(df, "split", check_dtype=False)
        _check_orient(df, "records", check_dtype=False)
        _check_orient(df, "values", check_dtype=False)
        _check_orient(df, "columns", check_dtype=False)
        # index oriented is problematic as it is read back in, in a
        # transposed state, so the columns are interpreted as having mixed
        # data and given object dtypes.
        # force everything to have object dtype beforehand
        _check_orient(df.transpose().transpose(), "index", dtype=False)

    def test_frame_from_json_bad_data(self):
        with pytest.raises(ValueError, match='Expected object or value'):
            read_json(StringIO('{"key":b:a:d}'))

        # too few indices
        json = StringIO('{"columns":["A","B"],'
                        '"index":["2","3"],'
                        '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
        msg = r"Shape of passed values is \(3, 2\), indices imply \(2, 2\)"
        with pytest.raises(ValueError, match=msg):
            read_json(json, orient="split")

        # too many columns
        json = StringIO('{"columns":["A","B","C"],'
                        '"index":["1","2","3"],'
                        '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
        msg = "3 columns passed, passed data had 2 columns"
        with pytest.raises(AssertionError, match=msg):
            read_json(json, orient="split")

        # bad key
        json = StringIO('{"badkey":["A","B"],'
                        '"index":["2","3"],'
                        '"data":[[1.0,"1"],[2.0,"2"],[null,"3"]]}')
        with pytest.raises(ValueError, match=r"unexpected key\(s\): badkey"):
            read_json(json, orient="split")

    def test_frame_from_json_nones(self):
        df = DataFrame([[1, 2], [4, 5, 6]])
        unser = read_json(df.to_json())
        assert np.isnan(unser[2][0])

        df = DataFrame([['1', '2'], ['4', '5', '6']])
        unser = read_json(df.to_json())
        assert np.isnan(unser[2][0])

        unser = read_json(df.to_json(), dtype=False)
        assert unser[2][0] is None

        unser = read_json(df.to_json(), convert_axes=False, dtype=False)
        assert unser['2']['0'] is None

        unser = read_json(df.to_json(), numpy=False)
        assert np.isnan(unser[2][0])

        unser = read_json(df.to_json(), numpy=False, dtype=False)
        assert unser[2][0] is None

        unser = read_json(df.to_json(), numpy=False,
                          convert_axes=False, dtype=False)
        assert unser['2']['0'] is None

        # infinities get mapped to nulls which get mapped to NaNs during
        # deserialisation
        df = DataFrame([[1, 2], [4, 5, 6]])
        df.loc[0, 2] = np.inf
        unser = read_json(df.to_json())
        assert np.isnan(unser[2][0])

        unser = read_json(df.to_json(), dtype=False)
        assert np.isnan(unser[2][0])

        df.loc[0, 2] = np.NINF
        unser = read_json(df.to_json())
        assert np.isnan(unser[2][0])

        unser = read_json(df.to_json(), dtype=False)
        assert np.isnan(unser[2][0])

    @pytest.mark.skipif(is_platform_32bit(),
                        reason="not compliant on 32-bit, xref #15865")
    def test_frame_to_json_float_precision(self):
        df = pd.DataFrame([dict(a_float=0.95)])
        encoded = df.to_json(double_precision=1)
        assert encoded == '{"a_float":{"0":1.0}}'

        df = pd.DataFrame([dict(a_float=1.95)])
        encoded = df.to_json(double_precision=1)
        assert encoded == '{"a_float":{"0":2.0}}'

        df = pd.DataFrame([dict(a_float=-1.95)])
        encoded = df.to_json(double_precision=1)
        assert encoded == '{"a_float":{"0":-2.0}}'

        df = pd.DataFrame([dict(a_float=0.995)])
        encoded = df.to_json(double_precision=2)
        assert encoded == '{"a_float":{"0":1.0}}'

        df = pd.DataFrame([dict(a_float=0.9995)])
        encoded = df.to_json(double_precision=3)
        assert encoded == '{"a_float":{"0":1.0}}'

        df = pd.DataFrame([dict(a_float=0.99999999999999944)])
        encoded = df.to_json(double_precision=15)
        assert encoded == '{"a_float":{"0":1.0}}'

    def test_frame_to_json_except(self):
        df = DataFrame([1, 2, 3])
        msg = "Invalid value 'garbage' for option 'orient'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient="garbage")

    def test_frame_empty(self):
        df = DataFrame(columns=['jim', 'joe'])
        assert not df._is_mixed_type
        assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
                           check_index_type=False)
        # GH 7445
        result = pd.DataFrame({'test': []}, index=[]).to_json(
            orient='columns')
        expected = '{"test":{}}'
        assert result == expected

    def test_frame_empty_mixedtype(self):
        # mixed type
        df = DataFrame(columns=['jim', 'joe'])
        df['joe'] = df['joe'].astype('i8')
        assert df._is_mixed_type
        assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df,
                           check_index_type=False)

    def test_frame_mixedtype_orient(self):  # GH10289
        vals = [[10, 1, 'foo', .1, .01],
                [20, 2, 'bar', .2, .02],
                [30, 3, 'baz', .3, .03],
                [40, 4, 'qux', .4, .04]]

        df = DataFrame(vals, index=list('abcd'),
                       columns=['1st', '2nd', '3rd', '4th', '5th'])

        assert df._is_mixed_type
        right = df.copy()

        for orient in ['split', 'index', 'columns']:
            inp = df.to_json(orient=orient)
            left = read_json(inp, orient=orient, convert_axes=False)
            assert_frame_equal(left, right)

        right.index = np.arange(len(df))
        inp = df.to_json(orient='records')
        left = read_json(inp, orient='records', convert_axes=False)
        assert_frame_equal(left, right)

        right.columns = np.arange(df.shape[1])
        inp = df.to_json(orient='values')
        left = read_json(inp, orient='values', convert_axes=False)
        assert_frame_equal(left, right)

    def test_v12_compat(self):
        df = DataFrame(
            [[1.56808523, 0.65727391, 1.81021139, -0.17251653],
             [-0.2550111, -0.08072427, -0.03202878, -0.17581665],
             [1.51493992, 0.11805825, 1.629455, -1.31506612],
             [-0.02765498, 0.44679743, 0.33192641, -0.27885413],
             [0.05951614, -2.69652057, 1.28163262, 0.34703478]],
            columns=['A', 'B', 'C', 'D'],
            index=pd.date_range('2000-01-03', '2000-01-07'))
        df['date'] = pd.Timestamp('19920106 18:21:32.12')
        df.iloc[3, df.columns.get_loc('date')] = pd.Timestamp('20130101')
        df['modified'] = df['date']
        df.iloc[1, df.columns.get_loc('modified')] = pd.NaT

        v12_json = os.path.join(self.dirpath, 'tsframe_v012.json')
        df_unser = pd.read_json(v12_json)
        assert_frame_equal(df, df_unser)

        df_iso = df.drop(['modified'], axis=1)
        v12_iso_json = os.path.join(self.dirpath, 'tsframe_iso_v012.json')
        df_unser_iso = pd.read_json(v12_iso_json)
        assert_frame_equal(df_iso, df_unser_iso)

    def test_blocks_compat_GH9037(self):
        index = pd.date_range('20000101', periods=10, freq='H')
        df_mixed = DataFrame(OrderedDict(
            float_1=[-0.92077639, 0.77434435, 1.25234727, 0.61485564,
                     -0.60316077, 0.24653374, 0.28668979, -2.51969012,
                     0.95748401, -1.02970536],
            int_1=[19680418, 75337055, 99973684, 65103179, 79373900,
                   40314334, 21290235, 4991321, 41903419, 16008365],
            str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2',
                   '97236474', 'bde7e214', '1a6bde47', 'b1190be5',
                   '7a669144', '8d64d068'],
            float_2=[-0.0428278, -1.80872357, 3.36042349, -0.7573685,
                     -0.48217572, 0.86229683, 1.08935819, 0.93898739,
                     -0.03030452, 1.43366348],
            str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf',
                   '2ffef4a9', '08e2f5c4', '07e1af03', 'addbd4a7',
                   '1f6a09ba', '4bfc4d87'],
            int_2=[86967717, 98098830, 51927505, 20372254, 12601730,
                   20884027, 34193846, 10561746, 24867120, 76131025]
        ), index=index)

        # JSON deserialisation always creates unicode strings
        df_mixed.columns = df_mixed.columns.astype('unicode')

        df_roundtrip = pd.read_json(df_mixed.to_json(orient='split'),
                                    orient='split')
        assert_frame_equal(df_mixed, df_roundtrip,
                           check_index_type=True,
                           check_column_type=True,
                           check_frame_type=True,
                           by_blocks=True,
                           check_exact=True)

    def test_frame_nonprintable_bytes(self):
        # GH14256: failing column caused segfaults, if it is not the last one
        class BinaryThing(object):
            def __init__(self, hexed):
                self.hexed = hexed
                if compat.PY2:
                    self.binary = hexed.decode('hex')
                else:
                    self.binary = bytes.fromhex(hexed)

            def __str__(self):
                return self.hexed

        hexed = '574b4454ba8c5eb4f98a8f45'
        binthing = BinaryThing(hexed)

        # verify the proper conversion of printable content
        df_printable = DataFrame({'A': [binthing.hexed]})
        assert df_printable.to_json() == \
            '{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed)

        # check if non-printable content throws appropriate Exception
        df_nonprintable = DataFrame({'A': [binthing]})
        msg = "Unsupported UTF-8 sequence length when encoding string"
        with pytest.raises(OverflowError, match=msg):
            df_nonprintable.to_json()

        # the same with multiple columns threw segfaults
        df_mixed = DataFrame({'A': [binthing], 'B': [1]},
                             columns=['A', 'B'])
        with pytest.raises(OverflowError):
            df_mixed.to_json()

        # default_handler should resolve exceptions for non-string types
        assert df_nonprintable.to_json(default_handler=str) == \
            '{{"A":{{"0":"{hex}"}}}}'.format(hex=hexed)
        assert df_mixed.to_json(default_handler=str) == \
            '{{"A":{{"0":"{hex}"}},"B":{{"0":1}}}}'.format(hex=hexed)

    def test_label_overflow(self):
        # GH14256: buffer length not checked when writing label
        df = pd.DataFrame({'bar' * 100000: [1], 'foo': [1337]})
        assert df.to_json() == \
            '{{"{bar}":{{"0":1}},"foo":{{"0":1337}}}}'.format(
                bar=('bar' * 100000))

    def test_series_non_unique_index(self):
        s = Series(['a', 'b'], index=[1, 1])

        msg = "Series index must be unique for orient='index'"
        with pytest.raises(ValueError, match=msg):
            s.to_json(orient='index')

        assert_series_equal(s, read_json(s.to_json(orient='split'),
                                         orient='split', typ='series'))
        unser = read_json(s.to_json(orient='records'),
                          orient='records', typ='series')
        tm.assert_numpy_array_equal(s.values, unser.values)

    def test_series_from_json_to_json(self):

        def _check_orient(series, orient, dtype=None, numpy=False,
                          check_index_type=True):
            series = series.sort_index()
            unser = read_json(series.to_json(orient=orient),
                              typ='series', orient=orient, numpy=numpy,
                              dtype=dtype)
            unser = unser.sort_index()
            if orient == "records" or orient == "values":
                assert_almost_equal(series.values, unser.values)
            else:
                if orient == "split":
                    assert_series_equal(series, unser,
                                        check_index_type=check_index_type)
                else:
                    assert_series_equal(series, unser, check_names=False,
                                        check_index_type=check_index_type)

        def _check_all_orients(series, dtype=None, check_index_type=True):
            _check_orient(series, "columns", dtype=dtype,
                          check_index_type=check_index_type)
            _check_orient(series, "records", dtype=dtype,
                          check_index_type=check_index_type)
            _check_orient(series, "split", dtype=dtype,
                          check_index_type=check_index_type)
            _check_orient(series, "index", dtype=dtype,
                          check_index_type=check_index_type)
            _check_orient(series, "values", dtype=dtype)

            _check_orient(series, "columns", dtype=dtype, numpy=True,
                          check_index_type=check_index_type)
            _check_orient(series, "records", dtype=dtype, numpy=True,
                          check_index_type=check_index_type)
            _check_orient(series, "split", dtype=dtype, numpy=True,
                          check_index_type=check_index_type)
            _check_orient(series, "index", dtype=dtype, numpy=True,
                          check_index_type=check_index_type)
            _check_orient(series, "values", dtype=dtype, numpy=True,
                          check_index_type=check_index_type)

        # basic
        _check_all_orients(self.series)
        assert self.series.to_json() == self.series.to_json(orient="index")

        objSeries = Series([str(d) for d in self.objSeries],
                           index=self.objSeries.index,
                           name=self.objSeries.name)
        _check_all_orients(objSeries, dtype=False)

        # empty_series has an empty index with object dtype, which cannot
        # be reverted on round-trip
        assert self.empty_series.index.dtype == np.object_
        _check_all_orients(self.empty_series, check_index_type=False)

        _check_all_orients(self.ts)

        # dtype
        s = Series(lrange(6), index=['a', 'b', 'c', 'd', 'e', 'f'])
        _check_all_orients(Series(s, dtype=np.float64), dtype=np.float64)
        _check_all_orients(Series(s, dtype=np.int), dtype=np.int)

    def test_series_to_json_except(self):
        s = Series([1, 2, 3])
        msg = "Invalid value 'garbage' for option 'orient'"
        with pytest.raises(ValueError, match=msg):
            s.to_json(orient="garbage")

    def test_series_from_json_precise_float(self):
        s = Series([4.56, 4.56, 4.56])
        result = read_json(s.to_json(), typ='series', precise_float=True)
        assert_series_equal(result, s, check_index_type=False)

    def test_series_with_dtype(self):
        # GH 21986
        s = Series([4.56, 4.56, 4.56])
        result = read_json(s.to_json(), typ='series', dtype=np.int64)
        expected = Series([4] * 3)
        assert_series_equal(result, expected)

    def test_frame_from_json_precise_float(self):
        df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]])
        result = read_json(df.to_json(), precise_float=True)
        assert_frame_equal(result, df, check_index_type=False,
                           check_column_type=False)

    def test_typ(self):
        s = Series(lrange(6), index=['a', 'b', 'c',
                                     'd', 'e', 'f'], dtype='int64')
        result = read_json(s.to_json(), typ=None)
        assert_series_equal(result, s)

    def test_reconstruction_index(self):
        df = DataFrame([[1, 2, 3], [4, 5, 6]])
        result = read_json(df.to_json())
        assert_frame_equal(result, df)

        df = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                       index=['A', 'B', 'C'])
        result = read_json(df.to_json())
        assert_frame_equal(result, df)

    def test_path(self):
        with ensure_clean('test.json') as path:
            for df in [self.frame, self.frame2, self.intframe,
                       self.tsframe, self.mixed_frame]:
                df.to_json(path)
                read_json(path)

    def test_axis_dates(self):
        # frame
        json = self.tsframe.to_json()
        result = read_json(json)
        assert_frame_equal(result, self.tsframe)

        # series
        json = self.ts.to_json()
        result = read_json(json, typ='series')
        assert_series_equal(result, self.ts, check_names=False)
        assert result.name is None

    def test_convert_dates(self):
        # frame
        df = self.tsframe.copy()
        df['date'] = Timestamp('20130101')

        json = df.to_json()
        result = read_json(json)
        assert_frame_equal(result, df)

        df['foo'] = 1.
        json = df.to_json(date_unit='ns')

        result = read_json(json, convert_dates=False)
        expected = df.copy()
        expected['date'] = expected['date'].values.view('i8')
        expected['foo'] = expected['foo'].astype('int64')
        assert_frame_equal(result, expected)

        # series
        ts = Series(Timestamp('20130101'), index=self.ts.index)
        json = ts.to_json()
        result = read_json(json, typ='series')
        assert_series_equal(result, ts)

    def test_convert_dates_infer(self):
        # GH10747
        from pandas.io.json import dumps
        infer_words = ['trade_time', 'date', 'datetime', 'sold_at',
                       'modified', 'timestamp', 'timestamps']
        for infer_word in infer_words:
            data = [{'id': 1, infer_word: 1036713600000}, {'id': 2}]
            expected = DataFrame([[1, Timestamp('2002-11-08')], [2, pd.NaT]],
                                 columns=['id', infer_word])

            result = read_json(dumps(data))[['id', infer_word]]
            assert_frame_equal(result, expected)

    def test_date_format_frame(self):
        df = self.tsframe.copy()

        def test_w_date(date, date_unit=None):
            df['date'] = Timestamp(date)
            df.iloc[1, df.columns.get_loc('date')] = pd.NaT
            df.iloc[5, df.columns.get_loc('date')] = pd.NaT
            if date_unit:
                json = df.to_json(date_format='iso', date_unit=date_unit)
            else:
                json = df.to_json(date_format='iso')
            result = read_json(json)
            assert_frame_equal(result, df)

        test_w_date('20130101 20:43:42.123')
        test_w_date('20130101 20:43:42', date_unit='s')
        test_w_date('20130101 20:43:42.123', date_unit='ms')
        test_w_date('20130101 20:43:42.123456', date_unit='us')
        test_w_date('20130101 20:43:42.123456789', date_unit='ns')

        msg = "Invalid value 'foo' for option 'date_unit'"
        with pytest.raises(ValueError, match=msg):
            df.to_json(date_format='iso', date_unit='foo')

    def test_date_format_series(self):
        def test_w_date(date, date_unit=None):
            ts = Series(Timestamp(date), index=self.ts.index)
            ts.iloc[1] = pd.NaT
            ts.iloc[5] = pd.NaT
            if date_unit:
                json = ts.to_json(date_format='iso', date_unit=date_unit)
            else:
                json = ts.to_json(date_format='iso')
            result = read_json(json, typ='series')
            assert_series_equal(result, ts)

        test_w_date('20130101 20:43:42.123')
        test_w_date('20130101 20:43:42', date_unit='s')
        test_w_date('20130101 20:43:42.123', date_unit='ms')
        test_w_date('20130101 20:43:42.123456', date_unit='us')
        test_w_date('20130101 20:43:42.123456789', date_unit='ns')

        ts = Series(Timestamp('20130101 20:43:42.123'), index=self.ts.index)
        msg = "Invalid value 'foo' for option 'date_unit'"
        with pytest.raises(ValueError, match=msg):
            ts.to_json(date_format='iso', date_unit='foo')

    def test_date_unit(self):
        df = self.tsframe.copy()
        df['date'] = Timestamp('20130101 20:43:42')
        dl = df.columns.get_loc('date')
        df.iloc[1, dl] = Timestamp('19710101 20:43:42')
        df.iloc[2, dl] = Timestamp('21460101 20:43:42')
        df.iloc[4, dl] = pd.NaT

        for unit in ('s', 'ms', 'us', 'ns'):
            json = df.to_json(date_format='epoch', date_unit=unit)

            # force date unit
            result = read_json(json, date_unit=unit)
            assert_frame_equal(result, df)

            # detect date unit
            result = read_json(json, date_unit=None)
            assert_frame_equal(result, df)

    def test_weird_nested_json(self):
        # this used to core dump the parser
        s = r'''{
        "status": "success",
        "data": {
        "posts": [
            {
            "id": 1,
            "title": "A blog post",
            "body": "Some useful content"
            },
            {
            "id": 2,
            "title": "Another blog post",
            "body": "More content"
            }
           ]
          }
        }'''
        read_json(s)

    def test_doc_example(self):
        dfj2 = DataFrame(np.random.randn(5, 2), columns=list('AB'))
        dfj2['date'] = Timestamp('20130101')
        dfj2['ints'] = lrange(5)
        dfj2['bools'] = True
        dfj2.index = pd.date_range('20130101', periods=5)

        json = dfj2.to_json()
        result = read_json(json, dtype={'ints': np.int64, 'bools': np.bool_})
        assert_frame_equal(result, result)

    def test_misc_example(self):

        # parsing unordered input fails
        result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True)
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])

        error_msg = """DataFrame\\.index are different

DataFrame\\.index values are different \\(100\\.0 %\\)
\\[left\\]:  Index\\(\\[u?'a', u?'b'\\], dtype='object'\\)
\\[right\\]: RangeIndex\\(start=0, stop=2, step=1\\)"""
        with pytest.raises(AssertionError, match=error_msg):
            assert_frame_equal(result, expected, check_index_type=False)

        result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]')
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)

    @network
    @pytest.mark.single
    def test_round_trip_exception_(self):
        # GH 3867
        csv = 'https://raw.github.com/hayd/lahman2012/master/csvs/Teams.csv'
        df = pd.read_csv(csv)
        s = df.to_json()
        result = pd.read_json(s)
        assert_frame_equal(result.reindex(
            index=df.index, columns=df.columns), df)

    @network
    @pytest.mark.single
    def test_url(self):
        url = 'https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5'  # noqa
        result = read_json(url, convert_dates=True)
        for c in ['created_at', 'closed_at', 'updated_at']:
            assert result[c].dtype == 'datetime64[ns]'

    def test_timedelta(self):
        converter = lambda x: pd.to_timedelta(x, unit='ms')

        s = Series([timedelta(23), timedelta(seconds=5)])
        assert s.dtype == 'timedelta64[ns]'

        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        s = Series([timedelta(23), timedelta(seconds=5)],
                   index=pd.Index([0, 1]))
        assert s.dtype == 'timedelta64[ns]'
        result = pd.read_json(s.to_json(), typ='series').apply(converter)
        assert_series_equal(result, s)

        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
        assert frame[0].dtype == 'timedelta64[ns]'
        assert_frame_equal(frame, pd.read_json(frame.to_json())
                           .apply(converter))

        frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)],
                           'b': [1, 2],
                           'c': pd.date_range(start='20130101', periods=2)})

        result = pd.read_json(frame.to_json(date_unit='ns'))
        result['a'] = pd.to_timedelta(result.a, unit='ns')
        result['c'] = pd.to_datetime(result.c)
        assert_frame_equal(frame, result)

    def test_mixed_timedelta_datetime(self):
        frame = DataFrame({'a': [timedelta(23), pd.Timestamp('20130101')]},
                          dtype=object)

        expected = DataFrame({'a': [pd.Timedelta(frame.a[0]).value,
                                    pd.Timestamp(frame.a[1]).value]})
        result = pd.read_json(frame.to_json(date_unit='ns'),
                              dtype={'a': 'int64'})
        assert_frame_equal(result, expected, check_index_type=False)

    def test_default_handler(self):
        value = object()
        frame = DataFrame({'a': [7, value]})
        expected = DataFrame({'a': [7, str(value)]})
        result = pd.read_json(frame.to_json(default_handler=str))
        assert_frame_equal(expected, result, check_index_type=False)

    def test_default_handler_indirect(self):
        from pandas.io.json import dumps

        def default(obj):
            if isinstance(obj, complex):
                return [('mathjs', 'Complex'),
                        ('re', obj.real),
                        ('im', obj.imag)]
            return str(obj)
        df_list = [9, DataFrame({'a': [1, 'STR', complex(4, -5)],
                                 'b': [float('nan'), None, 'N/A']},
                                columns=['a', 'b'])]
        expected = ('[9,[[1,null],["STR",null],[[["mathjs","Complex"],'
                    '["re",4.0],["im",-5.0]],"N\\/A"]]]')
        assert dumps(df_list, default_handler=default,
                     orient="values") == expected

    def test_default_handler_numpy_unsupported_dtype(self):
        # GH12554 to_json raises 'Unhandled numpy dtype 15'
        df = DataFrame({'a': [1, 2.3, complex(4, -5)],
                        'b': [float('nan'), None, complex(1.2, 0)]},
                       columns=['a', 'b'])
        expected = ('[["(1+0j)","(nan+0j)"],'
                    '["(2.3+0j)","(nan+0j)"],'
                    '["(4-5j)","(1.2+0j)"]]')
        assert df.to_json(default_handler=str, orient="values") == expected

    def test_default_handler_raises(self):
        msg = "raisin"

        def my_handler_raises(obj):
            raise TypeError(msg)
        with pytest.raises(TypeError, match=msg):
            DataFrame({'a': [1, 2, object()]}).to_json(
                default_handler=my_handler_raises)
        with pytest.raises(TypeError, match=msg):
            DataFrame({'a': [1, 2, complex(4, -5)]}).to_json(
                default_handler=my_handler_raises)

    def test_categorical(self):
        # GH4377 df.to_json segfaults with non-ndarray blocks
        df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]})
        df["B"] = df["A"]
        expected = df.to_json()

        df["B"] = df["A"].astype('category')
        assert expected == df.to_json()

        s = df["A"]
        sc = df["B"]
        assert s.to_json() == sc.to_json()

    def test_datetime_tz(self):
        # GH4377 df.to_json segfaults with non-ndarray blocks
        tz_range = pd.date_range('20130101', periods=3, tz='US/Eastern')
        tz_naive = tz_range.tz_convert('utc').tz_localize(None)

        df = DataFrame({
            'A': tz_range,
            'B': pd.date_range('20130101', periods=3)})

        df_naive = df.copy()
        df_naive['A'] = tz_naive
        expected = df_naive.to_json()
        assert expected == df.to_json()

        stz = Series(tz_range)
        s_naive = Series(tz_naive)
        assert stz.to_json() == s_naive.to_json()

    def test_sparse(self):
        # GH4377 df.to_json segfaults with non-ndarray blocks
        df = pd.DataFrame(np.random.randn(10, 4))
        df.loc[:8] = np.nan
        sdf = df.to_sparse()

        expected = df.to_json()
        assert expected == sdf.to_json()

        s = pd.Series(np.random.randn(10))
        s.loc[:8] = np.nan
        ss = s.to_sparse()

        expected = s.to_json()
        assert expected == ss.to_json()

    def test_tz_is_utc(self):
        from pandas.io.json import dumps

        exp = '"2013-01-10T05:00:00.000Z"'

        ts = Timestamp('2013-01-10 05:00:00Z')
        assert dumps(ts, iso_dates=True) == exp
        dt = ts.to_pydatetime()
        assert dumps(dt, iso_dates=True) == exp

        ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern')
        assert dumps(ts, iso_dates=True) == exp
        dt = ts.to_pydatetime()
        assert dumps(dt, iso_dates=True) == exp

        ts = Timestamp('2013-01-10 00:00:00-0500')
        assert dumps(ts, iso_dates=True) == exp
        dt = ts.to_pydatetime()
        assert dumps(dt, iso_dates=True) == exp

    def test_tz_range_is_utc(self):
        from pandas.io.json import dumps

        exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]'
        dfexp = ('{"DT":{'
                 '"0":"2013-01-01T05:00:00.000Z",'
                 '"1":"2013-01-02T05:00:00.000Z"}}')

        tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2)
        assert dumps(tz_range, iso_dates=True) == exp
        dti = pd.DatetimeIndex(tz_range)
        assert dumps(dti, iso_dates=True) == exp
        df = DataFrame({'DT': dti})
        result = dumps(df, iso_dates=True)
        assert result == dfexp

        tz_range = pd.date_range('2013-01-01 00:00:00', periods=2,
                                 tz='US/Eastern')
        assert dumps(tz_range, iso_dates=True) == exp
        dti = pd.DatetimeIndex(tz_range)
        assert dumps(dti, iso_dates=True) == exp
        df = DataFrame({'DT': dti})
        assert dumps(df, iso_dates=True) == dfexp

        tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2)
        assert dumps(tz_range, iso_dates=True) == exp
        dti = pd.DatetimeIndex(tz_range)
        assert dumps(dti, iso_dates=True) == exp
        df = DataFrame({'DT': dti})
        assert dumps(df, iso_dates=True) == dfexp

    def test_read_inline_jsonl(self):
        # GH9180
        result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)

    @td.skip_if_not_us_locale
    def test_read_s3_jsonl(self, s3_resource):
        # GH17200
        result = read_json('s3n://pandas-test/items.jsonl', lines=True)
        expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        assert_frame_equal(result, expected)

    def test_read_local_jsonl(self):
        # GH17200
        with ensure_clean('tmp_items.json') as path:
            with open(path, 'w') as infile:
                infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n')
            result = read_json(path, lines=True)
            expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
            assert_frame_equal(result, expected)

    def test_read_jsonl_unicode_chars(self):
        # GH15132: non-ascii unicode characters
        # \u201d == RIGHT DOUBLE QUOTATION MARK

        # simulate file handle
        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
        json = StringIO(json)
        result = read_json(json, lines=True)
        expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                             columns=['a', 'b'])
        assert_frame_equal(result, expected)

        # simulate string
        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
        result = read_json(json, lines=True)
        expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
                             columns=['a', 'b'])
        assert_frame_equal(result, expected)

    def test_read_json_large_numbers(self):
        # GH18842
        json = '{"articleId": "1404366058080022500245"}'
        json = StringIO(json)
        result = read_json(json, typ="series")
        expected = Series(1.404366e+21, index=['articleId'])
        assert_series_equal(result, expected)

        json = '{"0": {"articleId": "1404366058080022500245"}}'
        json = StringIO(json)
        result = read_json(json)
        expected = DataFrame(1.404366e+21, index=['articleId'], columns=[0])
        assert_frame_equal(result, expected)

    def test_to_jsonl(self):
        # GH9180
        df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
        result = df.to_json(orient="records", lines=True)
        expected = '{"a":1,"b":2}\n{"a":1,"b":2}'
        assert result == expected

        df = DataFrame([["foo}", "bar"], ['foo"', "bar"]],
                       columns=['a', 'b'])
        result = df.to_json(orient="records", lines=True)
        expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}'
        assert result == expected
        assert_frame_equal(pd.read_json(result, lines=True), df)

        # GH15096: escaped characters in columns and data
        df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]],
                       columns=["a\\", 'b'])
        result = df.to_json(orient="records", lines=True)
        expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n'
                    '{"a\\\\":"foo\\"","b":"bar"}')
        assert result == expected
        assert_frame_equal(pd.read_json(result, lines=True), df)

    def test_latin_encoding(self):
        if compat.PY2:
            pytest.skip("[unicode] is not implemented as a table column")

        # GH 13774
        pytest.skip("encoding not implemented in .to_json(), "
                    "xref #13774")

        values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'a', b'b', b'c'],
                  [b'EE, 17', b'', b'a', b'b', b'c'],
                  [b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'', b'a', b'b', b'c'],
                  [b'\xf8\xfc', b'a', b'b', b'c'],
                  [b'A\xf8\xfc', b'', b'a', b'b', b'c'],
                  [np.nan, b'', b'b', b'c'],
                  [b'A\xf8\xfc', np.nan, b'', b'b', b'c']]

        def _try_decode(x, encoding='latin-1'):
            try:
                return x.decode(encoding)
            except AttributeError:
                return x

        # not sure how to remove latin-1 from code in python 2 and 3
        values = [[_try_decode(x) for x in y] for y in values]

        examples = []
        for dtype in ['category', object]:
            for val in values:
                examples.append(Series(val, dtype=dtype))

        def roundtrip(s, encoding='latin-1'):
            with ensure_clean('test.json') as path:
                s.to_json(path, encoding=encoding)
                retr = read_json(path, encoding=encoding)
                assert_series_equal(s, retr, check_categorical=False)

        for s in examples:
            roundtrip(s)

    def test_data_frame_size_after_to_json(self):
        # GH15344
        df = DataFrame({'a': [str(1)]})

        size_before = df.memory_usage(index=True, deep=True).sum()
        df.to_json()
        size_after = df.memory_usage(index=True, deep=True).sum()

        assert size_before == size_after

    @pytest.mark.parametrize('data, expected', [
        (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']),
         {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
        (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']).rename_axis('foo'),
         {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
        (DataFrame([[1, 2], [4, 5]], columns=['a', 'b'],
                   index=[['a', 'b'], ['c', 'd']]),
         {'columns': ['a', 'b'], 'data': [[1, 2], [4, 5]]}),
        (Series([1, 2, 3], name='A'),
         {'name': 'A', 'data': [1, 2, 3]}),
        (Series([1, 2, 3], name='A').rename_axis('foo'),
         {'name': 'A', 'data': [1, 2, 3]}),
        (Series([1, 2], name='A', index=[['a', 'b'], ['c', 'd']]),
         {'name': 'A', 'data': [1, 2]}),
    ])
    def test_index_false_to_json_split(self, data, expected):
        # GH 17394
        # Testing index=False in to_json with orient='split'
        result = data.to_json(orient='split', index=False)
        result = json.loads(result)
        assert result == expected

    @pytest.mark.parametrize('data', [
        (DataFrame([[1, 2], [4, 5]], columns=['a', 'b'])),
        (DataFrame([[1, 2], [4, 5]], columns=['a', 'b']).rename_axis('foo')),
        (DataFrame([[1, 2], [4, 5]], columns=['a', 'b'],
                   index=[['a', 'b'], ['c', 'd']])),
        (Series([1, 2, 3], name='A')),
        (Series([1, 2, 3], name='A').rename_axis('foo')),
        (Series([1, 2], name='A', index=[['a', 'b'], ['c', 'd']])),
    ])
    def test_index_false_to_json_table(self, data):
        # GH 17394
        # Testing index=False in to_json with orient='table'
        result = data.to_json(orient='table', index=False)
        result = json.loads(result)
        expected = {
            'schema': pd.io.json.build_table_schema(data, index=False),
            'data': DataFrame(data).to_dict(orient='records')
        }
        assert result == expected

    @pytest.mark.parametrize('orient', [
        'records', 'index', 'columns', 'values'
    ])
    def test_index_false_error_to_json(self, orient):
        # GH 17394
        # Testing error message from to_json with index=False
        df = pd.DataFrame([[1, 2], [4, 5]], columns=['a', 'b'])
        msg = ("'index=False' is only valid when "
               "'orient' is 'split' or 'table'")
        with pytest.raises(ValueError, match=msg):
            df.to_json(orient=orient, index=False)

    @pytest.mark.parametrize('orient', ['split', 'table'])
    @pytest.mark.parametrize('index', [True, False])
    def test_index_false_from_json_to_json(self, orient, index):
        # GH25170
        # Test index=False in from_json to_json
        expected = DataFrame({'a': [1, 2], 'b': [3, 4]})
        dfjson = expected.to_json(orient=orient, index=index)
        result = read_json(dfjson, orient=orient)
        assert_frame_equal(result, expected)