feather_format.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. """ feather-format compat """
  2. from distutils.version import LooseVersion
  3. from pandas.compat import range
  4. from pandas.util._decorators import deprecate_kwarg
  5. from pandas import DataFrame, Int64Index, RangeIndex
  6. from pandas.io.common import _stringify_path
  7. def _try_import():
  8. # since pandas is a dependency of pyarrow
  9. # we need to import on first use
  10. try:
  11. import pyarrow
  12. from pyarrow import feather
  13. except ImportError:
  14. # give a nice error message
  15. raise ImportError("pyarrow is not installed\n\n"
  16. "you can install via conda\n"
  17. "conda install pyarrow -c conda-forge\n"
  18. "or via pip\n"
  19. "pip install -U pyarrow\n")
  20. if LooseVersion(pyarrow.__version__) < LooseVersion('0.9.0'):
  21. raise ImportError("pyarrow >= 0.9.0 required for feather support\n\n"
  22. "you can install via conda\n"
  23. "conda install pyarrow -c conda-forge"
  24. "or via pip\n"
  25. "pip install -U pyarrow\n")
  26. return feather, pyarrow
  27. def to_feather(df, path):
  28. """
  29. Write a DataFrame to the feather-format
  30. Parameters
  31. ----------
  32. df : DataFrame
  33. path : string file path, or file-like object
  34. """
  35. path = _stringify_path(path)
  36. if not isinstance(df, DataFrame):
  37. raise ValueError("feather only support IO with DataFrames")
  38. feather = _try_import()[0]
  39. valid_types = {'string', 'unicode'}
  40. # validate index
  41. # --------------
  42. # validate that we have only a default index
  43. # raise on anything else as we don't serialize the index
  44. if not isinstance(df.index, Int64Index):
  45. raise ValueError("feather does not support serializing {} "
  46. "for the index; you can .reset_index()"
  47. "to make the index into column(s)".format(
  48. type(df.index)))
  49. if not df.index.equals(RangeIndex.from_range(range(len(df)))):
  50. raise ValueError("feather does not support serializing a "
  51. "non-default index for the index; you "
  52. "can .reset_index() to make the index "
  53. "into column(s)")
  54. if df.index.name is not None:
  55. raise ValueError("feather does not serialize index meta-data on a "
  56. "default index")
  57. # validate columns
  58. # ----------------
  59. # must have value column names (strings only)
  60. if df.columns.inferred_type not in valid_types:
  61. raise ValueError("feather must have string column names")
  62. feather.write_feather(df, path)
  63. @deprecate_kwarg(old_arg_name='nthreads', new_arg_name='use_threads')
  64. def read_feather(path, columns=None, use_threads=True):
  65. """
  66. Load a feather-format object from the file path
  67. .. versionadded 0.20.0
  68. Parameters
  69. ----------
  70. path : string file path, or file-like object
  71. columns : sequence, default None
  72. If not provided, all columns are read
  73. .. versionadded 0.24.0
  74. nthreads : int, default 1
  75. Number of CPU threads to use when reading to pandas.DataFrame
  76. .. versionadded 0.21.0
  77. .. deprecated 0.24.0
  78. use_threads : bool, default True
  79. Whether to parallelize reading using multiple threads
  80. .. versionadded 0.24.0
  81. Returns
  82. -------
  83. type of object stored in file
  84. """
  85. feather, pyarrow = _try_import()
  86. path = _stringify_path(path)
  87. if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
  88. int_use_threads = int(use_threads)
  89. if int_use_threads < 1:
  90. int_use_threads = 1
  91. return feather.read_feather(path, columns=columns,
  92. nthreads=int_use_threads)
  93. return feather.read_feather(path, columns=columns,
  94. use_threads=bool(use_threads))