csvs.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315
  1. # -*- coding: utf-8 -*-
  2. """
  3. Module for formatting output data into CSV files.
  4. """
  5. from __future__ import print_function
  6. import csv as csvlib
  7. import os
  8. import warnings
  9. from zipfile import ZipFile
  10. import numpy as np
  11. from pandas._libs import writers as libwriters
  12. from pandas.compat import StringIO, range, zip
  13. from pandas.core.dtypes.generic import (
  14. ABCDatetimeIndex, ABCIndexClass, ABCMultiIndex, ABCPeriodIndex)
  15. from pandas.core.dtypes.missing import notna
  16. from pandas import compat
  17. from pandas.io.common import (
  18. UnicodeWriter, _get_handle, _infer_compression, get_filepath_or_buffer)
class CSVFormatter(object):
    # Formats ``obj`` (a DataFrame-like object with ``.columns``, ``.index``
    # and a ``._data`` block manager) as CSV.  All formatting state is
    # captured here at construction time; ``save()`` performs the write.

    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                 float_format=None, cols=None, header=True, index=True,
                 index_label=None, mode='w', nanRep=None, encoding=None,
                 compression='infer', quoting=None, line_terminator='\n',
                 chunksize=None, tupleize_cols=False, quotechar='"',
                 date_format=None, doublequote=True, escapechar=None,
                 decimal='.'):
        # NOTE: ``nanRep`` is accepted but never read in this method —
        # presumably kept for backwards compatibility of the signature.
        self.obj = obj

        # No target given: accumulate the output in an in-memory buffer.
        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf, _, _, _ = get_filepath_or_buffer(
            path_or_buf, encoding=encoding, compression=compression, mode=mode
        )
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        # Default encoding depends on the Python major version.
        if encoding is None:
            encoding = 'ascii' if compat.PY2 else 'utf-8'
        self.encoding = encoding
        # Resolve 'infer' (and validate explicit values) against the target
        # path; file-like targets yield no inferred compression.
        self.compression = _infer_compression(self.path_or_buf, compression)

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        # An explicitly falsy terminator (None/'') falls back to the
        # platform default.
        self.line_terminator = line_terminator or os.linesep
        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, ABCMultiIndex) and
                               not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        # Restrict to the requested column subset before rendering.
        if cols is not None:
            if isinstance(cols, ABCIndexClass):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, ABCIndexClass):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list; one slot per column, filled block by
        # block in _save_chunk via ``mgr_locs``.
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        # Default chunk size targets roughly 100k cells per chunk; the
        # ``or 1`` guards keep it positive for empty/huge frames.
        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        # Pre-render datetime-like index values with the requested format;
        # missing entries become empty strings.
        if (isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) and
                date_format is not None):
            from pandas import Index
            self.data_index = Index([x.strftime(date_format) if notna(x) else
                                     '' for x in self.data_index])

        # Number of index columns to emit; zero when the index is omitted.
        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0
    def save(self):
        """
        Create the writer & save

        Chooses between three targets: an in-memory buffer (zip output),
        a caller-supplied file-like object, or a freshly opened file
        handle, then writes the CSV and cleans up in ``finally``.
        """
        # GH21227 internal compression is not used when file-like passed.
        if self.compression and hasattr(self.path_or_buf, 'write'):
            msg = ("compression has no effect when passing file-like "
                   "object as input.")
            warnings.warn(msg, RuntimeWarning, stacklevel=2)

        # when zip compression is called.
        is_zip = isinstance(self.path_or_buf, ZipFile) or (
            not hasattr(self.path_or_buf, 'write')
            and self.compression == 'zip')

        if is_zip:
            # zipfile doesn't support writing string to archive. uses string
            # buffer to receive csv writing and dump into zip compression
            # file handle. GH21241, GH21118
            f = StringIO()
            close = False
        elif hasattr(self.path_or_buf, 'write'):
            # Caller owns the handle; never close it here.
            f = self.path_or_buf
            close = False
        else:
            f, handles = _get_handle(self.path_or_buf, self.mode,
                                     encoding=self.encoding,
                                     compression=self.compression)
            close = True

        try:
            writer_kwargs = dict(lineterminator=self.line_terminator,
                                 delimiter=self.sep, quoting=self.quoting,
                                 doublequote=self.doublequote,
                                 escapechar=self.escapechar,
                                 quotechar=self.quotechar)
            # The plain csv writer only handles ascii; anything else goes
            # through the encoding-aware UnicodeWriter.
            if self.encoding == 'ascii':
                self.writer = csvlib.writer(f, **writer_kwargs)
            else:
                writer_kwargs['encoding'] = self.encoding
                self.writer = UnicodeWriter(f, **writer_kwargs)

            self._save()

        finally:
            if is_zip:
                # GH17778 handles zip compression separately.
                buf = f.getvalue()
                if hasattr(self.path_or_buf, 'write'):
                    self.path_or_buf.write(buf)
                else:
                    f, handles = _get_handle(self.path_or_buf, self.mode,
                                             encoding=self.encoding,
                                             compression=self.compression)
                    f.write(buf)
                    close = True
            # ``handles`` is only bound on the branches that also set
            # ``close = True``, so this block never sees it undefined.
            if close:
                f.close()
                for _fh in handles:
                    _fh.close()
    def _save_header(self):
        """
        Write the header row(s): optional index labels followed by the
        column labels (one row per level for MultiIndex columns).
        """
        writer = self.writer
        obj = self.obj
        index_label = self.index_label
        cols = self.cols
        has_mi_columns = self.has_mi_columns
        header = self.header
        encoded_labels = []

        # ``header`` may be a boolean or a sequence of replacement labels.
        has_aliases = isinstance(header, (tuple, list, np.ndarray,
                                          ABCIndexClass))
        if not (has_aliases or self.header):
            # header=False: nothing to emit.
            return

        if has_aliases:
            if len(header) != len(cols):
                raise ValueError(('Writing {ncols} cols but got {nalias} '
                                  'aliases'.format(ncols=len(cols),
                                                   nalias=len(header))))
            else:
                write_cols = header
        else:
            write_cols = cols

        if self.index:
            # should write something for index label
            if index_label is not False:
                if index_label is None:
                    # Derive labels from the index's own name(s).
                    if isinstance(obj.index, ABCMultiIndex):
                        index_label = []
                        for i, name in enumerate(obj.index.names):
                            if name is None:
                                name = ''
                            index_label.append(name)
                    else:
                        index_label = obj.index.name
                        if index_label is None:
                            index_label = ['']
                        else:
                            index_label = [index_label]
                elif not isinstance(index_label,
                                    (list, tuple, np.ndarray, ABCIndexClass)):
                    # given a string for a DF with Index
                    index_label = [index_label]

                encoded_labels = list(index_label)
            else:
                # index_label=False: index columns get no header text.
                encoded_labels = []

        if not has_mi_columns or has_aliases:
            # Single header row: index labels (if any) then column labels.
            encoded_labels += list(write_cols)
            writer.writerow(encoded_labels)
        else:
            # write out the mi
            columns = obj.columns

            # write out the names for each level, then ALL of the values for
            # each level
            for i in range(columns.nlevels):

                # we need at least 1 index column to write our col names
                col_line = []
                if self.index:

                    # name is the first column
                    col_line.append(columns.names[i])

                    # Pad with blanks so level values align past the
                    # remaining index columns.
                    if isinstance(index_label, list) and len(index_label) > 1:
                        col_line.extend([''] * (len(index_label) - 1))

                col_line.extend(columns._get_level_values(i))

                writer.writerow(col_line)

            # Write out the index line if it's not empty.
            # Otherwise, we will print out an extraneous
            # blank line between the mi and the data rows.
            if encoded_labels and set(encoded_labels) != {''}:
                encoded_labels.extend([''] * len(columns))
                writer.writerow(encoded_labels)
  224. def _save(self):
  225. self._save_header()
  226. nrows = len(self.data_index)
  227. # write in chunksize bites
  228. chunksize = self.chunksize
  229. chunks = int(nrows / chunksize) + 1
  230. for i in range(chunks):
  231. start_i = i * chunksize
  232. end_i = min((i + 1) * chunksize, nrows)
  233. if start_i >= end_i:
  234. break
  235. self._save_chunk(start_i, end_i)
    def _save_chunk(self, start_i, end_i):
        """
        Render rows [start_i, end_i) to strings and write them out.

        Each block of the frame's block manager converts its slice to
        native (string) form; ``mgr_locs`` maps the block's rows back to
        their column positions in the preallocated ``self.data`` list.
        """
        data_index = self.data_index

        # create the data for a chunk
        slicer = slice(start_i, end_i)
        for i in range(len(self.blocks)):
            b = self.blocks[i]
            d = b.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                  float_format=self.float_format,
                                  decimal=self.decimal,
                                  date_format=self.date_format,
                                  quoting=self.quoting)

            for col_loc, col in zip(b.mgr_locs, d):
                # self.data is a preallocated list
                self.data[col_loc] = col

        # Stringify the matching slice of the index the same way.
        ix = data_index.to_native_types(slicer=slicer, na_rep=self.na_rep,
                                        float_format=self.float_format,
                                        decimal=self.decimal,
                                        date_format=self.date_format,
                                        quoting=self.quoting)

        # C-level row writer: emits ``self.nlevels`` index columns from
        # ``ix`` followed by the data columns, via ``self.writer``.
        libwriters.write_csv_rows(self.data, ix, self.nlevels,
                                  self.cols, self.writer)