json.py

# pylint: disable-msg=E1101,W0613,W0603
from itertools import islice
import os

import numpy as np

import pandas._libs.json as json
from pandas._libs.tslibs import iNaT
from pandas.compat import StringIO, long, to_str, u
from pandas.errors import AbstractMethodError

from pandas.core.dtypes.common import is_period_dtype

from pandas import DataFrame, MultiIndex, Series, compat, isna, to_datetime
from pandas.core.reshape.concat import concat

from pandas.io.common import (
    BaseIterator, _get_handle, _infer_compression, _stringify_path,
    get_filepath_or_buffer)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import _validate_integer

from .normalize import _convert_to_line_delimits
from .table_schema import build_table_schema, parse_table_schema

loads = json.loads
dumps = json.dumps

TABLE_SCHEMA_VERSION = '0.20.0'


# interface to/from
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False, compression='infer',
            index=True):

    if not index and orient not in ['split', 'table']:
        raise ValueError("'index=False' is only valid when 'orient' is "
                         "'split' or 'table'")

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != 'records':
        raise ValueError(
            "'lines' keyword only valid when 'orient' is records")

    if orient == 'table' and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or 'values')
    if orient == 'table' and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj, orient=orient, date_format=date_format,
        double_precision=double_precision, ensure_ascii=force_ascii,
        date_unit=date_unit, default_handler=default_handler,
        index=index).write()

    if lines:
        s = _convert_to_line_delimits(s)

    if isinstance(path_or_buf, compat.string_types):
        fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
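
# A minimal usage sketch (illustrative): passing ``path_or_buf=None``
# returns the serialized string instead of writing it out.
#
#   >>> to_json(None, DataFrame({'a': [1, 2]}), orient='records')
#   '[{"a":1},{"a":2}]'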


class Writer(object):

    def __init__(self, obj, orient, date_format, double_precision,
                 ensure_ascii, date_unit, index, default_handler=None):
        self.obj = obj

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.date_format = date_format
        self.double_precision = double_precision
        self.ensure_ascii = ensure_ascii
        self.date_unit = date_unit
        self.default_handler = default_handler
        self.index = index

        self.is_copy = None
        self._format_axes()

    def _format_axes(self):
        raise AbstractMethodError(self)

    def write(self):
        return self._write(self.obj, self.orient, self.double_precision,
                           self.ensure_ascii, self.date_unit,
                           self.date_format == 'iso', self.default_handler)

    def _write(self, obj, orient, double_precision, ensure_ascii,
               date_unit, iso_dates, default_handler):
        return dumps(
            obj,
            orient=orient,
            double_precision=double_precision,
            ensure_ascii=ensure_ascii,
            date_unit=date_unit,
            iso_dates=iso_dates,
            default_handler=default_handler
        )


class SeriesWriter(Writer):
    _default_orient = 'index'

    def _format_axes(self):
        if not self.obj.index.is_unique and self.orient == 'index':
            raise ValueError("Series index must be unique for orient="
                             "'{orient}'".format(orient=self.orient))

    def _write(self, obj, orient, double_precision, ensure_ascii,
               date_unit, iso_dates, default_handler):
        if not self.index and orient == 'split':
            obj = {"name": obj.name, "data": obj.values}
        return super(SeriesWriter, self)._write(obj, orient,
                                                double_precision,
                                                ensure_ascii, date_unit,
                                                iso_dates, default_handler)


class FrameWriter(Writer):
    _default_orient = 'columns'

    def _format_axes(self):
        """
        Try to format axes if they are datelike.
        """
        if not self.obj.index.is_unique and self.orient in (
                'index', 'columns'):
            raise ValueError("DataFrame index must be unique for orient="
                             "'{orient}'.".format(orient=self.orient))
        if not self.obj.columns.is_unique and self.orient in (
                'index', 'columns', 'records'):
            raise ValueError("DataFrame columns must be unique for orient="
                             "'{orient}'.".format(orient=self.orient))

    def _write(self, obj, orient, double_precision, ensure_ascii,
               date_unit, iso_dates, default_handler):
        if not self.index and orient == 'split':
            obj = obj.to_dict(orient='split')
            del obj["index"]
        return super(FrameWriter, self)._write(obj, orient,
                                               double_precision,
                                               ensure_ascii, date_unit,
                                               iso_dates, default_handler)


class JSONTableWriter(FrameWriter):
    _default_orient = 'records'

    def __init__(self, obj, orient, date_format, double_precision,
                 ensure_ascii, date_unit, index, default_handler=None):
        """
        Adds a `schema` attribute with the Table Schema, resets
        the index (can't do in caller, because the schema inference needs
        to know what the index is), forces orient to records, and forces
        date_format to 'iso'.
        """
        super(JSONTableWriter, self).__init__(
            obj, orient, date_format, double_precision, ensure_ascii,
            date_unit, index, default_handler=default_handler)
        if date_format != 'iso':
            msg = ("Trying to write with `orient='table'` and "
                   "`date_format='{fmt}'`. Table Schema requires dates "
                   "to be formatted with `date_format='iso'`"
                   .format(fmt=date_format))
            raise ValueError(msg)

        self.schema = build_table_schema(obj, index=self.index)
        # Not implemented on a column MultiIndex
        if obj.ndim == 2 and isinstance(obj.columns, MultiIndex):
            raise NotImplementedError(
                "orient='table' is not supported for MultiIndex")
        # TODO: Do this timedelta properly in objToJSON.c. See GH #15137
        if ((obj.ndim == 1) and (obj.name in set(obj.index.names)) or
                len(obj.columns & obj.index.names)):
            msg = "Overlapping names between the index and columns"
            raise ValueError(msg)

        obj = obj.copy()
        timedeltas = obj.select_dtypes(include=['timedelta']).columns
        if len(timedeltas):
            obj[timedeltas] = obj[timedeltas].applymap(
                lambda x: x.isoformat())
        # Convert PeriodIndex to datetimes before serializing
        if is_period_dtype(obj.index):
            obj.index = obj.index.to_timestamp()

        # exclude index from obj if index=False
        if not self.index:
            self.obj = obj.reset_index(drop=True)
        else:
            self.obj = obj.reset_index(drop=False)
        self.date_format = 'iso'
        self.orient = 'records'
        self.index = index
    def _write(self, obj, orient, double_precision, ensure_ascii,
               date_unit, iso_dates, default_handler):
        data = super(JSONTableWriter, self)._write(obj, orient,
                                                   double_precision,
                                                   ensure_ascii, date_unit,
                                                   iso_dates,
                                                   default_handler)
        serialized = '{{"schema": {schema}, "data": {data}}}'.format(
            schema=dumps(self.schema), data=data)
        return serialized
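
# A sketch of the payload _write produces above (illustrative): the Table
# Schema envelope wraps the records-oriented data, e.g.
#
#   '{"schema": {"fields": [...], "primaryKey": [...], ...}, "data": [...]}'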


def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
              convert_axes=True, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False, chunksize=None, compression='infer'):
    """
    Convert a JSON string to pandas object.

    Parameters
    ----------
    path_or_buf : a valid JSON string or file-like, default: None
        The string could be a URL. Valid URL schemes include http, ftp, s3,
        gcs, and file. For file URLs, a host is expected. For instance, a local
        file could be ``file://localhost/path/to/table.json``
    orient : string
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{index -> [index], columns -> [columns], data -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values', 'table'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.

        .. versionadded:: 0.23.0
           'table' as an allowed value for the ``orient`` argument
    typ : type of object to recover (series or frame), default 'frame'
    dtype : boolean or dict, default True
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all. Applies only to the data.
    convert_axes : boolean, default True
        Try to convert the axes to the proper dtypes.
    convert_dates : boolean or list of strings, default True
        List of columns to parse for dates. If True, attempt to parse the
        default datelike columns; a column label is datelike if

        * it ends with ``'_at'``,
        * it ends with ``'_time'``,
        * it begins with ``'timestamp'``,
        * it is ``'modified'``,
        * it is ``'date'``, or
        * it is ``'datetime'``
    keep_default_dates : boolean, default True
        If parsing dates, then parse the default datelike columns
    numpy : boolean, default False
        Direct decoding to numpy arrays. Supports numeric data only, but
        non-numeric column and index labels are supported. Note also that the
        JSON ordering MUST be the same for each term if numpy=True.
    precise_float : boolean, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality
    date_unit : string, default None
        The timestamp unit to detect if converting dates. The default
        behaviour is to try and detect the correct precision, but if this is
        not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing
        only seconds, milliseconds, microseconds or nanoseconds respectively.
    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.

        .. versionadded:: 0.19.0

    lines : boolean, default False
        Read the file as a json object per line.

        .. versionadded:: 0.19.0

    chunksize : integer, default None
        Return JsonReader object for iteration.
        See the `line-delimited json docs
        <http://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.

        .. versionadded:: 0.21.0

    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, zip or xz if path_or_buf is a string ending in
        '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression
        otherwise. If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.

        .. versionadded:: 0.21.0
    Returns
    -------
    result : Series or DataFrame, depending on the value of `typ`.

    See Also
    --------
    DataFrame.to_json

    Notes
    -----
    Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
    :class:`Index` name of `index` gets written with :func:`to_json`, the
    subsequent read operation will incorrectly set the :class:`Index` name to
    ``None``. This is because `index` is also used by :func:`DataFrame.to_json`
    to denote a missing :class:`Index` name, and the subsequent
    :func:`read_json` operation cannot distinguish between the two. The same
    limitation is encountered with a :class:`MultiIndex` and any names
    beginning with ``'level_'``.
    Examples
    --------
    >>> df = pd.DataFrame([['a', 'b'], ['c', 'd']],
    ...                   index=['row 1', 'row 2'],
    ...                   columns=['col 1', 'col 2'])

    Encoding/decoding a DataFrame using ``'split'`` formatted JSON:

    >>> df.to_json(orient='split')
    '{"columns":["col 1","col 2"],
      "index":["row 1","row 2"],
      "data":[["a","b"],["c","d"]]}'
    >>> pd.read_json(_, orient='split')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'index'`` formatted JSON:

    >>> df.to_json(orient='index')
    '{"row 1":{"col 1":"a","col 2":"b"},"row 2":{"col 1":"c","col 2":"d"}}'
    >>> pd.read_json(_, orient='index')
          col 1 col 2
    row 1     a     b
    row 2     c     d

    Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
    Note that index labels are not preserved with this encoding.

    >>> df.to_json(orient='records')
    '[{"col 1":"a","col 2":"b"},{"col 1":"c","col 2":"d"}]'
    >>> pd.read_json(_, orient='records')
      col 1 col 2
    0     a     b
    1     c     d

    Encoding with Table Schema

    >>> df.to_json(orient='table')
    '{"schema": {"fields": [{"name": "index", "type": "string"},
                            {"name": "col 1", "type": "string"},
                            {"name": "col 2", "type": "string"}],
                 "primaryKey": "index",
                 "pandas_version": "0.20.0"},
      "data": [{"index": "row 1", "col 1": "a", "col 2": "b"},
               {"index": "row 2", "col 1": "c", "col 2": "d"}]}'
  329. """
  330. compression = _infer_compression(path_or_buf, compression)
  331. filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
  332. path_or_buf, encoding=encoding, compression=compression,
  333. )
  334. json_reader = JsonReader(
  335. filepath_or_buffer, orient=orient, typ=typ, dtype=dtype,
  336. convert_axes=convert_axes, convert_dates=convert_dates,
  337. keep_default_dates=keep_default_dates, numpy=numpy,
  338. precise_float=precise_float, date_unit=date_unit, encoding=encoding,
  339. lines=lines, chunksize=chunksize, compression=compression,
  340. )
  341. if chunksize:
  342. return json_reader
  343. result = json_reader.read()
  344. if should_close:
  345. try:
  346. filepath_or_buffer.close()
  347. except: # noqa: flake8
  348. pass
  349. return result


class JsonReader(BaseIterator):
    """
    JsonReader provides an interface for reading in a JSON file.

    If initialized with ``lines=True`` and ``chunksize``, can be iterated over
    ``chunksize`` lines at a time. Otherwise, calling ``read`` reads in the
    whole document.
    """
    def __init__(self, filepath_or_buffer, orient, typ, dtype, convert_axes,
                 convert_dates, keep_default_dates, numpy, precise_float,
                 date_unit, encoding, lines, chunksize, compression):

        self.path_or_buf = filepath_or_buffer
        self.orient = orient
        self.typ = typ
        self.dtype = dtype
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.keep_default_dates = keep_default_dates
        self.numpy = numpy
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
        self.compression = compression
        self.lines = lines
        self.chunksize = chunksize
        self.nrows_seen = 0
        self.should_close = False

        if self.chunksize is not None:
            self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")

        data = self._get_data_from_filepath(filepath_or_buffer)
        self.data = self._preprocess_data(data)
    def _preprocess_data(self, data):
        """
        At this point, the data either has a `read` attribute (e.g. a file
        object or a StringIO) or is a string that is a JSON document.

        If self.chunksize, we prepare the data for the `__next__` method.
        Otherwise, we read it into memory for the `read` method.
        """
        if hasattr(data, 'read') and not self.chunksize:
            data = data.read()
        if not hasattr(data, 'read') and self.chunksize:
            data = StringIO(data)

        return data
    def _get_data_from_filepath(self, filepath_or_buffer):
        """
        The function read_json accepts three input types:
            1. filepath (string-like)
            2. file-like object (e.g. open file object, StringIO)
            3. JSON string

        This method turns (1) into (2) to simplify the rest of the processing.
        It returns input types (2) and (3) unchanged.
        """
        data = filepath_or_buffer

        exists = False
        if isinstance(data, compat.string_types):
            try:
                exists = os.path.exists(filepath_or_buffer)
            # gh-5874: if the filepath is too long will raise here
            except (TypeError, ValueError):
                pass

        if exists or self.compression is not None:
            data, _ = _get_handle(filepath_or_buffer, 'r',
                                  encoding=self.encoding,
                                  compression=self.compression)
            self.should_close = True
            self.open_stream = data

        return data
    def _combine_lines(self, lines):
        """
        Combines a list of JSON objects into one JSON object.
        """
        lines = filter(None, map(lambda x: x.strip(), lines))
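        # For example (illustrative): ['{"a": 1}', '', '{"a": 2}\n'] yields
        # '[{"a": 1},{"a": 2}]' (blanks dropped, whitespace stripped), which
        # the ordinary array-oriented parsers can consume.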
        return '[' + ','.join(lines) + ']'
    def read(self):
        """
        Read the whole JSON input into a pandas object.
        """
        if self.lines and self.chunksize:
            obj = concat(self)
        elif self.lines:
            data = to_str(self.data)
            obj = self._get_object_parser(
                self._combine_lines(data.split('\n'))
            )
        else:
            obj = self._get_object_parser(self.data)

        self.close()
        return obj
    def _get_object_parser(self, json):
        """
        Parses a json document into a pandas object.
        """
        typ = self.typ
        dtype = self.dtype
        kwargs = {
            "orient": self.orient, "dtype": self.dtype,
            "convert_axes": self.convert_axes,
            "convert_dates": self.convert_dates,
            "keep_default_dates": self.keep_default_dates, "numpy": self.numpy,
            "precise_float": self.precise_float, "date_unit": self.date_unit
        }
        obj = None
        if typ == 'frame':
            obj = FrameParser(json, **kwargs).parse()

        if typ == 'series' or obj is None:
            if not isinstance(dtype, bool):
                kwargs['dtype'] = dtype
            obj = SeriesParser(json, **kwargs).parse()

        return obj
    def close(self):
        """
        If we opened a stream earlier, in _get_data_from_filepath, we should
        close it.

        If an open stream or file was passed, we leave it open.
        """
        if self.should_close:
            try:
                self.open_stream.close()
            except (IOError, AttributeError):
                pass
    def __next__(self):
        lines = list(islice(self.data, self.chunksize))
        if lines:
            lines_json = self._combine_lines(lines)
            obj = self._get_object_parser(lines_json)

            # Make sure that the returned objects have the right index.
            obj.index = range(self.nrows_seen, self.nrows_seen + len(obj))
            self.nrows_seen += len(obj)

            return obj

        self.close()
        raise StopIteration
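
# A minimal usage sketch (illustrative; 'data.jsonl' and 'process' are
# hypothetical): with ``lines=True`` and a ``chunksize``, ``read_json``
# returns this iterator, and each chunk is a DataFrame of at most
# ``chunksize`` rows with a continuing RangeIndex.
#
#   >>> reader = read_json('data.jsonl', lines=True, chunksize=1000)
#   >>> for chunk in reader:
#   ...     process(chunk)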


class Parser(object):

    _STAMP_UNITS = ('s', 'ms', 'us', 'ns')
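    # Lower bounds for plausible epoch timestamps, one per unit: each value
    # is roughly one year (365 days) past the epoch, and numbers at or below
    # the cutoff are rejected as dates in _try_convert_to_date.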
    _MIN_STAMPS = {
        's': long(31536000),
        'ms': long(31536000000),
        'us': long(31536000000000),
        'ns': long(31536000000000000)}

    def __init__(self, json, orient, dtype=True, convert_axes=True,
                 convert_dates=True, keep_default_dates=False, numpy=False,
                 precise_float=False, date_unit=None):
        self.json = json

        if orient is None:
            orient = self._default_orient

        self.orient = orient
        self.dtype = dtype

        if orient == "split":
            numpy = False

        if date_unit is not None:
            date_unit = date_unit.lower()
            if date_unit not in self._STAMP_UNITS:
                raise ValueError('date_unit must be one of {units}'
                                 .format(units=self._STAMP_UNITS))
            self.min_stamp = self._MIN_STAMPS[date_unit]
        else:
            self.min_stamp = self._MIN_STAMPS['s']

        self.numpy = numpy
        self.precise_float = precise_float
        self.convert_axes = convert_axes
        self.convert_dates = convert_dates
        self.date_unit = date_unit
        self.keep_default_dates = keep_default_dates
        self.obj = None
    def check_keys_split(self, decoded):
        """
        Checks that dict has only the appropriate keys for orient='split'.
        """
        bad_keys = set(decoded.keys()).difference(set(self._split_keys))
        if bad_keys:
            bad_keys = ", ".join(bad_keys)
            raise ValueError(u("JSON data had unexpected key(s): {bad_keys}")
                             .format(bad_keys=pprint_thing(bad_keys)))
    def parse(self):

        # try numpy
        numpy = self.numpy
        if numpy:
            self._parse_numpy()
        else:
            self._parse_no_numpy()

        if self.obj is None:
            return None
        if self.convert_axes:
            self._convert_axes()
        self._try_convert_types()
        return self.obj
    def _convert_axes(self):
        """
        Try to convert axes.
        """
        for axis in self.obj._AXIS_NUMBERS.keys():
            new_axis, result = self._try_convert_data(
                axis, self.obj._get_axis(axis), use_dtypes=False,
                convert_dates=True)
            if result:
                setattr(self.obj, axis, new_axis)
    def _try_convert_types(self):
        raise AbstractMethodError(self)
    def _try_convert_data(self, name, data, use_dtypes=True,
                          convert_dates=True):
        """
        Try to parse a ndarray like into a column by inferring dtype.
        """
        # don't try to coerce, unless a force conversion
        if use_dtypes:
            if self.dtype is False:
                return data, False
            elif self.dtype is True:
                pass
            else:
                # dtype to force
                dtype = (self.dtype.get(name)
                         if isinstance(self.dtype, dict) else self.dtype)
                if dtype is not None:
                    try:
                        dtype = np.dtype(dtype)
                        return data.astype(dtype), True
                    except (TypeError, ValueError):
                        return data, False

        if convert_dates:
            new_data, result = self._try_convert_to_date(data)
            if result:
                return new_data, True

        result = False

        if data.dtype == 'object':

            # try float
            try:
                data = data.astype('float64')
                result = True
            except (TypeError, ValueError):
                pass

        if data.dtype.kind == 'f':

            if data.dtype != 'float64':

                # coerce floats to 64
                try:
                    data = data.astype('float64')
                    result = True
                except (TypeError, ValueError):
                    pass

        # don't coerce 0-len data
        if len(data) and (data.dtype == 'float' or data.dtype == 'object'):

            # coerce ints if we can
            try:
                new_data = data.astype('int64')
                if (new_data == data).all():
                    data = new_data
                    result = True
            except (TypeError, ValueError):
                pass

        if data.dtype == 'int':

            # coerce ints to 64
            try:
                data = data.astype('int64')
                result = True
            except (TypeError, ValueError):
                pass
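        # A worked example (illustrative): an object column of ['1', '2']
        # coerces to float64 and then to int64, since the 'new_data == data'
        # round-trip check is lossless; ['1.5', '2'] stays float64.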
        return data, result
    def _try_convert_to_date(self, data):
        """
        Try to parse a ndarray like into a date column.

        Try to coerce object in epoch/iso formats and integer/float in epoch
        formats. Return a boolean if parsing was successful.
        """
        # no conversion on empty
        if not len(data):
            return data, False

        new_data = data
        if new_data.dtype == 'object':
            try:
                new_data = data.astype('int64')
            except (TypeError, ValueError, OverflowError):
                pass

        # ignore numbers that are out of range
        if issubclass(new_data.dtype.type, np.number):
            in_range = (isna(new_data.values) | (new_data > self.min_stamp) |
                        (new_data.values == iNaT))
            if not in_range.all():
                return data, False

        date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
        for date_unit in date_units:
            try:
                new_data = to_datetime(new_data, errors='raise',
                                       unit=date_unit)
            except ValueError:
                continue
            except Exception:
                break
            return new_data, True

        return data, False
    def _try_convert_dates(self):
        raise AbstractMethodError(self)


class SeriesParser(Parser):
    _default_orient = 'index'
    _split_keys = ('name', 'index', 'data')

    def _parse_no_numpy(self):

        json = self.json
        orient = self.orient
        if orient == "split":
            decoded = {str(k): v for k, v in compat.iteritems(
                loads(json, precise_float=self.precise_float))}
            self.check_keys_split(decoded)
            self.obj = Series(dtype=None, **decoded)
        else:
            self.obj = Series(
                loads(json, precise_float=self.precise_float), dtype=None)

    def _parse_numpy(self):

        json = self.json
        orient = self.orient
        if orient == "split":
            decoded = loads(json, dtype=None, numpy=True,
                            precise_float=self.precise_float)
            decoded = {str(k): v for k, v in compat.iteritems(decoded)}
            self.check_keys_split(decoded)
            self.obj = Series(**decoded)
        elif orient == "columns" or orient == "index":
            self.obj = Series(*loads(json, dtype=None, numpy=True,
                                     labelled=True,
                                     precise_float=self.precise_float))
        else:
            self.obj = Series(loads(json, dtype=None, numpy=True,
                                    precise_float=self.precise_float))

    def _try_convert_types(self):
        if self.obj is None:
            return
        obj, result = self._try_convert_data(
            'data', self.obj, convert_dates=self.convert_dates)
        if result:
            self.obj = obj


class FrameParser(Parser):
    _default_orient = 'columns'
    _split_keys = ('columns', 'index', 'data')

    def _parse_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            args = loads(json, dtype=None, numpy=True, labelled=True,
                         precise_float=self.precise_float)
            if len(args):
                args = (args[0].T, args[2], args[1])
            self.obj = DataFrame(*args)
        elif orient == "split":
            decoded = loads(json, dtype=None, numpy=True,
                            precise_float=self.precise_float)
            decoded = {str(k): v for k, v in compat.iteritems(decoded)}
            self.check_keys_split(decoded)
            self.obj = DataFrame(**decoded)
        elif orient == "values":
            self.obj = DataFrame(loads(json, dtype=None, numpy=True,
                                       precise_float=self.precise_float))
        else:
            self.obj = DataFrame(*loads(json, dtype=None, numpy=True,
                                        labelled=True,
                                        precise_float=self.precise_float))
    def _parse_no_numpy(self):

        json = self.json
        orient = self.orient

        if orient == "columns":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None)
        elif orient == "split":
            decoded = {str(k): v for k, v in compat.iteritems(
                loads(json, precise_float=self.precise_float))}
            self.check_keys_split(decoded)
            self.obj = DataFrame(dtype=None, **decoded)
        elif orient == "index":
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None).T
        elif orient == 'table':
            self.obj = parse_table_schema(json,
                                          precise_float=self.precise_float)
        else:
            self.obj = DataFrame(
                loads(json, precise_float=self.precise_float), dtype=None)
    def _process_converter(self, f, filt=None):
        """
        Take a conversion function and possibly recreate the frame.
        """
        if filt is None:
            filt = lambda col, c: True

        needs_new_obj = False
        new_obj = dict()
        for i, (col, c) in enumerate(self.obj.iteritems()):
            if filt(col, c):
                new_data, result = f(col, c)
                if result:
                    c = new_data
                    needs_new_obj = True
            new_obj[i] = c

        if needs_new_obj:

            # possibly handle dup columns
            new_obj = DataFrame(new_obj, index=self.obj.index)
            new_obj.columns = self.obj.columns
            self.obj = new_obj
    def _try_convert_types(self):
        if self.obj is None:
            return
        if self.convert_dates:
            self._try_convert_dates()

        self._process_converter(
            lambda col, c: self._try_convert_data(col, c, convert_dates=False))
    def _try_convert_dates(self):
        if self.obj is None:
            return

        # our columns to parse
        convert_dates = self.convert_dates
        if convert_dates is True:
            convert_dates = []
        convert_dates = set(convert_dates)

        def is_ok(col):
            """
            Return if this col is ok to try for a date parse.
            """
            if not isinstance(col, compat.string_types):
                return False

            col_lower = col.lower()
            if (col_lower.endswith('_at') or
                    col_lower.endswith('_time') or
                    col_lower == 'modified' or
                    col_lower == 'date' or
                    col_lower == 'datetime' or
                    col_lower.startswith('timestamp')):
                return True
            return False

        self._process_converter(
            lambda col, c: self._try_convert_to_date(c),
            lambda col, c: ((self.keep_default_dates and is_ok(col)) or
                            col in convert_dates))
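
# A short illustration of the default date-column detection above (column
# names are made up): with keep_default_dates=True, labels such as
# 'created_at', 'load_time', 'timestamp_utc', 'modified', 'date' and
# 'datetime' are attempted as dates, while e.g. 'updated' is only attempted
# when listed explicitly in convert_dates.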