table_schema.py 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. """
  2. Table Schema builders
  3. http://specs.frictionlessdata.io/json-table-schema/
  4. """
  5. import warnings
  6. import pandas._libs.json as json
  7. from pandas.core.dtypes.common import (
  8. is_bool_dtype, is_categorical_dtype, is_datetime64_dtype,
  9. is_datetime64tz_dtype, is_integer_dtype, is_numeric_dtype, is_period_dtype,
  10. is_string_dtype, is_timedelta64_dtype)
  11. from pandas import DataFrame
  12. from pandas.api.types import CategoricalDtype
  13. import pandas.core.common as com
  14. loads = json.loads
  15. def as_json_table_type(x):
  16. """
  17. Convert a NumPy / pandas type to its corresponding json_table.
  18. Parameters
  19. ----------
  20. x : array or dtype
  21. Returns
  22. -------
  23. t : str
  24. the Table Schema data types
  25. Notes
  26. -----
  27. This table shows the relationship between NumPy / pandas dtypes,
  28. and Table Schema dtypes.
  29. ============== =================
  30. Pandas type Table Schema type
  31. ============== =================
  32. int64 integer
  33. float64 number
  34. bool boolean
  35. datetime64[ns] datetime
  36. timedelta64[ns] duration
  37. object str
  38. categorical any
  39. =============== =================
  40. """
  41. if is_integer_dtype(x):
  42. return 'integer'
  43. elif is_bool_dtype(x):
  44. return 'boolean'
  45. elif is_numeric_dtype(x):
  46. return 'number'
  47. elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
  48. is_period_dtype(x)):
  49. return 'datetime'
  50. elif is_timedelta64_dtype(x):
  51. return 'duration'
  52. elif is_categorical_dtype(x):
  53. return 'any'
  54. elif is_string_dtype(x):
  55. return 'string'
  56. else:
  57. return 'any'
  58. def set_default_names(data):
  59. """Sets index names to 'index' for regular, or 'level_x' for Multi"""
  60. if com._all_not_none(*data.index.names):
  61. nms = data.index.names
  62. if len(nms) == 1 and data.index.name == 'index':
  63. warnings.warn("Index name of 'index' is not round-trippable")
  64. elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
  65. warnings.warn("Index names beginning with 'level_' are not "
  66. "round-trippable")
  67. return data
  68. data = data.copy()
  69. if data.index.nlevels > 1:
  70. names = [name if name is not None else 'level_{}'.format(i)
  71. for i, name in enumerate(data.index.names)]
  72. data.index.names = names
  73. else:
  74. data.index.name = data.index.name or 'index'
  75. return data
  76. def convert_pandas_type_to_json_field(arr, dtype=None):
  77. dtype = dtype or arr.dtype
  78. if arr.name is None:
  79. name = 'values'
  80. else:
  81. name = arr.name
  82. field = {'name': name,
  83. 'type': as_json_table_type(dtype)}
  84. if is_categorical_dtype(arr):
  85. if hasattr(arr, 'categories'):
  86. cats = arr.categories
  87. ordered = arr.ordered
  88. else:
  89. cats = arr.cat.categories
  90. ordered = arr.cat.ordered
  91. field['constraints'] = {"enum": list(cats)}
  92. field['ordered'] = ordered
  93. elif is_period_dtype(arr):
  94. field['freq'] = arr.freqstr
  95. elif is_datetime64tz_dtype(arr):
  96. if hasattr(arr, 'dt'):
  97. field['tz'] = arr.dt.tz.zone
  98. else:
  99. field['tz'] = arr.tz.zone
  100. return field
  101. def convert_json_field_to_pandas_type(field):
  102. """
  103. Converts a JSON field descriptor into its corresponding NumPy / pandas type
  104. Parameters
  105. ----------
  106. field
  107. A JSON field descriptor
  108. Returns
  109. -------
  110. dtype
  111. Raises
  112. -----
  113. ValueError
  114. If the type of the provided field is unknown or currently unsupported
  115. Examples
  116. --------
  117. >>> convert_json_field_to_pandas_type({'name': 'an_int',
  118. 'type': 'integer'})
  119. 'int64'
  120. >>> convert_json_field_to_pandas_type({'name': 'a_categorical',
  121. 'type': 'any',
  122. 'contraints': {'enum': [
  123. 'a', 'b', 'c']},
  124. 'ordered': True})
  125. 'CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)'
  126. >>> convert_json_field_to_pandas_type({'name': 'a_datetime',
  127. 'type': 'datetime'})
  128. 'datetime64[ns]'
  129. >>> convert_json_field_to_pandas_type({'name': 'a_datetime_with_tz',
  130. 'type': 'datetime',
  131. 'tz': 'US/Central'})
  132. 'datetime64[ns, US/Central]'
  133. """
  134. typ = field['type']
  135. if typ == 'string':
  136. return 'object'
  137. elif typ == 'integer':
  138. return 'int64'
  139. elif typ == 'number':
  140. return 'float64'
  141. elif typ == 'boolean':
  142. return 'bool'
  143. elif typ == 'duration':
  144. return 'timedelta64'
  145. elif typ == 'datetime':
  146. if field.get('tz'):
  147. return 'datetime64[ns, {tz}]'.format(tz=field['tz'])
  148. else:
  149. return 'datetime64[ns]'
  150. elif typ == 'any':
  151. if 'constraints' in field and 'ordered' in field:
  152. return CategoricalDtype(categories=field['constraints']['enum'],
  153. ordered=field['ordered'])
  154. else:
  155. return 'object'
  156. raise ValueError("Unsupported or invalid field type: {}".format(typ))
  157. def build_table_schema(data, index=True, primary_key=None, version=True):
  158. """
  159. Create a Table schema from ``data``.
  160. Parameters
  161. ----------
  162. data : Series, DataFrame
  163. index : bool, default True
  164. Whether to include ``data.index`` in the schema.
  165. primary_key : bool or None, default True
  166. column names to designate as the primary key.
  167. The default `None` will set `'primaryKey'` to the index
  168. level or levels if the index is unique.
  169. version : bool, default True
  170. Whether to include a field `pandas_version` with the version
  171. of pandas that generated the schema.
  172. Returns
  173. -------
  174. schema : dict
  175. Notes
  176. -----
  177. See `_as_json_table_type` for conversion types.
  178. Timedeltas as converted to ISO8601 duration format with
  179. 9 decimal places after the seconds field for nanosecond precision.
  180. Categoricals are converted to the `any` dtype, and use the `enum` field
  181. constraint to list the allowed values. The `ordered` attribute is included
  182. in an `ordered` field.
  183. Examples
  184. --------
  185. >>> df = pd.DataFrame(
  186. ... {'A': [1, 2, 3],
  187. ... 'B': ['a', 'b', 'c'],
  188. ... 'C': pd.date_range('2016-01-01', freq='d', periods=3),
  189. ... }, index=pd.Index(range(3), name='idx'))
  190. >>> build_table_schema(df)
  191. {'fields': [{'name': 'idx', 'type': 'integer'},
  192. {'name': 'A', 'type': 'integer'},
  193. {'name': 'B', 'type': 'string'},
  194. {'name': 'C', 'type': 'datetime'}],
  195. 'pandas_version': '0.20.0',
  196. 'primaryKey': ['idx']}
  197. """
  198. if index is True:
  199. data = set_default_names(data)
  200. schema = {}
  201. fields = []
  202. if index:
  203. if data.index.nlevels > 1:
  204. for level in data.index.levels:
  205. fields.append(convert_pandas_type_to_json_field(level))
  206. else:
  207. fields.append(convert_pandas_type_to_json_field(data.index))
  208. if data.ndim > 1:
  209. for column, s in data.iteritems():
  210. fields.append(convert_pandas_type_to_json_field(s))
  211. else:
  212. fields.append(convert_pandas_type_to_json_field(data))
  213. schema['fields'] = fields
  214. if index and data.index.is_unique and primary_key is None:
  215. if data.index.nlevels == 1:
  216. schema['primaryKey'] = [data.index.name]
  217. else:
  218. schema['primaryKey'] = data.index.names
  219. elif primary_key is not None:
  220. schema['primaryKey'] = primary_key
  221. if version:
  222. schema['pandas_version'] = '0.20.0'
  223. return schema
  224. def parse_table_schema(json, precise_float):
  225. """
  226. Builds a DataFrame from a given schema
  227. Parameters
  228. ----------
  229. json :
  230. A JSON table schema
  231. precise_float : boolean
  232. Flag controlling precision when decoding string to double values, as
  233. dictated by ``read_json``
  234. Returns
  235. -------
  236. df : DataFrame
  237. Raises
  238. ------
  239. NotImplementedError
  240. If the JSON table schema contains either timezone or timedelta data
  241. Notes
  242. -----
  243. Because :func:`DataFrame.to_json` uses the string 'index' to denote a
  244. name-less :class:`Index`, this function sets the name of the returned
  245. :class:`DataFrame` to ``None`` when said string is encountered with a
  246. normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
  247. applies to any strings beginning with 'level_'. Therefore, an
  248. :class:`Index` name of 'index' and :class:`MultiIndex` names starting
  249. with 'level_' are not supported.
  250. See Also
  251. --------
  252. build_table_schema : Inverse function.
  253. pandas.read_json
  254. """
  255. table = loads(json, precise_float=precise_float)
  256. col_order = [field['name'] for field in table['schema']['fields']]
  257. df = DataFrame(table['data'], columns=col_order)[col_order]
  258. dtypes = {field['name']: convert_json_field_to_pandas_type(field)
  259. for field in table['schema']['fields']}
  260. # Cannot directly use as_type with timezone data on object; raise for now
  261. if any(str(x).startswith('datetime64[ns, ') for x in dtypes.values()):
  262. raise NotImplementedError('table="orient" can not yet read timezone '
  263. 'data')
  264. # No ISO constructor for Timedelta as of yet, so need to raise
  265. if 'timedelta64' in dtypes.values():
  266. raise NotImplementedError('table="orient" can not yet read '
  267. 'ISO-formatted Timedelta data')
  268. df = df.astype(dtypes)
  269. if 'primaryKey' in table['schema']:
  270. df = df.set_index(table['schema']['primaryKey'])
  271. if len(df.index.names) == 1:
  272. if df.index.name == 'index':
  273. df.index.name = None
  274. else:
  275. df.index.names = [None if x.startswith('level_') else x for x in
  276. df.index.names]
  277. return df