test_excel.py 100 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566
  1. from collections import OrderedDict
  2. import contextlib
  3. from datetime import date, datetime, time, timedelta
  4. from distutils.version import LooseVersion
  5. from functools import partial
  6. import os
  7. import warnings
  8. from warnings import catch_warnings
  9. import numpy as np
  10. from numpy import nan
  11. import pytest
  12. from pandas.compat import PY36, BytesIO, iteritems, map, range, u
  13. import pandas.util._test_decorators as td
  14. import pandas as pd
  15. from pandas import DataFrame, Index, MultiIndex, Series
  16. from pandas.core.config import get_option, set_option
  17. import pandas.util.testing as tm
  18. from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf
  19. from pandas.io.common import URLError
  20. from pandas.io.excel import (
  21. ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter,
  22. read_excel, register_writer)
  23. from pandas.io.formats.excel import ExcelFormatter
  24. from pandas.io.parsers import read_csv
  25. _seriesd = tm.getSeriesData()
  26. _tsd = tm.getTimeSeriesData()
  27. _frame = DataFrame(_seriesd)[:10]
  28. _frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10]
  29. _tsframe = tm.makeTimeDataFrame()[:5]
  30. _mixed_frame = _frame.copy()
  31. _mixed_frame['foo'] = 'bar'
  32. @contextlib.contextmanager
  33. def ignore_xlrd_time_clock_warning():
  34. """
  35. Context manager to ignore warnings raised by the xlrd library,
  36. regarding the deprecation of `time.clock` in Python 3.7.
  37. """
  38. with warnings.catch_warnings():
  39. warnings.filterwarnings(
  40. action='ignore',
  41. message='time.clock has been deprecated',
  42. category=DeprecationWarning)
  43. yield
  44. @td.skip_if_no('xlrd', '1.0.0')
  45. class SharedItems(object):
  46. @pytest.fixture(autouse=True)
  47. def setup_method(self, datapath):
  48. self.dirpath = datapath("io", "data")
  49. self.frame = _frame.copy()
  50. self.frame2 = _frame2.copy()
  51. self.tsframe = _tsframe.copy()
  52. self.mixed_frame = _mixed_frame.copy()
  53. def get_csv_refdf(self, basename):
  54. """
  55. Obtain the reference data from read_csv with the Python engine.
  56. Parameters
  57. ----------
  58. basename : str
  59. File base name, excluding file extension.
  60. Returns
  61. -------
  62. dfref : DataFrame
  63. """
  64. pref = os.path.join(self.dirpath, basename + '.csv')
  65. dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python')
  66. return dfref
  67. def get_excelfile(self, basename, ext):
  68. """
  69. Return test data ExcelFile instance.
  70. Parameters
  71. ----------
  72. basename : str
  73. File base name, excluding file extension.
  74. Returns
  75. -------
  76. excel : io.excel.ExcelFile
  77. """
  78. return ExcelFile(os.path.join(self.dirpath, basename + ext))
  79. def get_exceldf(self, basename, ext, *args, **kwds):
  80. """
  81. Return test data DataFrame.
  82. Parameters
  83. ----------
  84. basename : str
  85. File base name, excluding file extension.
  86. Returns
  87. -------
  88. df : DataFrame
  89. """
  90. pth = os.path.join(self.dirpath, basename + ext)
  91. return read_excel(pth, *args, **kwds)
  92. class ReadingTestsBase(SharedItems):
  93. # This is based on ExcelWriterBase
  94. @pytest.fixture(autouse=True, params=['xlrd', None])
  95. def set_engine(self, request):
  96. func_name = "get_exceldf"
  97. old_func = getattr(self, func_name)
  98. new_func = partial(old_func, engine=request.param)
  99. setattr(self, func_name, new_func)
  100. yield
  101. setattr(self, func_name, old_func)
  102. @td.skip_if_no("xlrd", "1.0.1") # see gh-22682
  103. def test_usecols_int(self, ext):
  104. df_ref = self.get_csv_refdf("test1")
  105. df_ref = df_ref.reindex(columns=["A", "B", "C"])
  106. # usecols as int
  107. with tm.assert_produces_warning(FutureWarning,
  108. check_stacklevel=False):
  109. with ignore_xlrd_time_clock_warning():
  110. df1 = self.get_exceldf("test1", ext, "Sheet1",
  111. index_col=0, usecols=3)
  112. # usecols as int
  113. with tm.assert_produces_warning(FutureWarning,
  114. check_stacklevel=False):
  115. with ignore_xlrd_time_clock_warning():
  116. df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
  117. index_col=0, usecols=3)
  118. # parse_cols instead of usecols, usecols as int
  119. with tm.assert_produces_warning(FutureWarning,
  120. check_stacklevel=False):
  121. with ignore_xlrd_time_clock_warning():
  122. df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1],
  123. index_col=0, parse_cols=3)
  124. # TODO add index to xls file)
  125. tm.assert_frame_equal(df1, df_ref, check_names=False)
  126. tm.assert_frame_equal(df2, df_ref, check_names=False)
  127. tm.assert_frame_equal(df3, df_ref, check_names=False)
  128. @td.skip_if_no('xlrd', '1.0.1') # GH-22682
  129. def test_usecols_list(self, ext):
  130. dfref = self.get_csv_refdf('test1')
  131. dfref = dfref.reindex(columns=['B', 'C'])
  132. df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
  133. usecols=[0, 2, 3])
  134. df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
  135. index_col=0, usecols=[0, 2, 3])
  136. with tm.assert_produces_warning(FutureWarning):
  137. with ignore_xlrd_time_clock_warning():
  138. df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
  139. index_col=0, parse_cols=[0, 2, 3])
  140. # TODO add index to xls file)
  141. tm.assert_frame_equal(df1, dfref, check_names=False)
  142. tm.assert_frame_equal(df2, dfref, check_names=False)
  143. tm.assert_frame_equal(df3, dfref, check_names=False)
  144. @td.skip_if_no('xlrd', '1.0.1') # GH-22682
  145. def test_usecols_str(self, ext):
  146. dfref = self.get_csv_refdf('test1')
  147. df1 = dfref.reindex(columns=['A', 'B', 'C'])
  148. df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
  149. usecols='A:D')
  150. df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
  151. index_col=0, usecols='A:D')
  152. with tm.assert_produces_warning(FutureWarning):
  153. with ignore_xlrd_time_clock_warning():
  154. df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
  155. index_col=0, parse_cols='A:D')
  156. # TODO add index to xls, read xls ignores index name ?
  157. tm.assert_frame_equal(df2, df1, check_names=False)
  158. tm.assert_frame_equal(df3, df1, check_names=False)
  159. tm.assert_frame_equal(df4, df1, check_names=False)
  160. df1 = dfref.reindex(columns=['B', 'C'])
  161. df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
  162. usecols='A,C,D')
  163. df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
  164. index_col=0, usecols='A,C,D')
  165. # TODO add index to xls file
  166. tm.assert_frame_equal(df2, df1, check_names=False)
  167. tm.assert_frame_equal(df3, df1, check_names=False)
  168. df1 = dfref.reindex(columns=['B', 'C'])
  169. df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
  170. usecols='A,C:D')
  171. df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
  172. index_col=0, usecols='A,C:D')
  173. tm.assert_frame_equal(df2, df1, check_names=False)
  174. tm.assert_frame_equal(df3, df1, check_names=False)
  175. @pytest.mark.parametrize("usecols", [
  176. [0, 1, 3], [0, 3, 1],
  177. [1, 0, 3], [1, 3, 0],
  178. [3, 0, 1], [3, 1, 0],
  179. ])
  180. def test_usecols_diff_positional_int_columns_order(self, ext, usecols):
  181. expected = self.get_csv_refdf("test1")[["A", "C"]]
  182. result = self.get_exceldf("test1", ext, "Sheet1",
  183. index_col=0, usecols=usecols)
  184. tm.assert_frame_equal(result, expected, check_names=False)
  185. @pytest.mark.parametrize("usecols", [
  186. ["B", "D"], ["D", "B"]
  187. ])
  188. def test_usecols_diff_positional_str_columns_order(self, ext, usecols):
  189. expected = self.get_csv_refdf("test1")[["B", "D"]]
  190. expected.index = range(len(expected))
  191. result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols)
  192. tm.assert_frame_equal(result, expected, check_names=False)
  193. def test_read_excel_without_slicing(self, ext):
  194. expected = self.get_csv_refdf("test1")
  195. result = self.get_exceldf("test1", ext, "Sheet1", index_col=0)
  196. tm.assert_frame_equal(result, expected, check_names=False)
  197. def test_usecols_excel_range_str(self, ext):
  198. expected = self.get_csv_refdf("test1")[["C", "D"]]
  199. result = self.get_exceldf("test1", ext, "Sheet1",
  200. index_col=0, usecols="A,D:E")
  201. tm.assert_frame_equal(result, expected, check_names=False)
  202. def test_usecols_excel_range_str_invalid(self, ext):
  203. msg = "Invalid column name: E1"
  204. with pytest.raises(ValueError, match=msg):
  205. self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1")
  206. def test_index_col_label_error(self, ext):
  207. msg = "list indices must be integers.*, not str"
  208. with pytest.raises(TypeError, match=msg):
  209. self.get_exceldf("test1", ext, "Sheet1", index_col=["A"],
  210. usecols=["A", "C"])
  211. def test_index_col_empty(self, ext):
  212. # see gh-9208
  213. result = self.get_exceldf("test1", ext, "Sheet3",
  214. index_col=["A", "B", "C"])
  215. expected = DataFrame(columns=["D", "E", "F"],
  216. index=MultiIndex(levels=[[]] * 3,
  217. codes=[[]] * 3,
  218. names=["A", "B", "C"]))
  219. tm.assert_frame_equal(result, expected)
  220. @pytest.mark.parametrize("index_col", [None, 2])
  221. def test_index_col_with_unnamed(self, ext, index_col):
  222. # see gh-18792
  223. result = self.get_exceldf("test1", ext, "Sheet4",
  224. index_col=index_col)
  225. expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]],
  226. columns=["Unnamed: 0", "col1", "col2"])
  227. if index_col:
  228. expected = expected.set_index(expected.columns[index_col])
  229. tm.assert_frame_equal(result, expected)
  230. def test_usecols_pass_non_existent_column(self, ext):
  231. msg = ("Usecols do not match columns, "
  232. "columns expected but not found: " + r"\['E'\]")
  233. with pytest.raises(ValueError, match=msg):
  234. self.get_exceldf("test1", ext, usecols=["E"])
  235. def test_usecols_wrong_type(self, ext):
  236. msg = ("'usecols' must either be list-like of "
  237. "all strings, all unicode, all integers or a callable.")
  238. with pytest.raises(ValueError, match=msg):
  239. self.get_exceldf("test1", ext, usecols=["E1", 0])
  240. def test_excel_stop_iterator(self, ext):
  241. parsed = self.get_exceldf('test2', ext, 'Sheet1')
  242. expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
  243. tm.assert_frame_equal(parsed, expected)
  244. def test_excel_cell_error_na(self, ext):
  245. parsed = self.get_exceldf('test3', ext, 'Sheet1')
  246. expected = DataFrame([[np.nan]], columns=['Test'])
  247. tm.assert_frame_equal(parsed, expected)
  248. def test_excel_passes_na(self, ext):
  249. excel = self.get_excelfile('test4', ext)
  250. parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
  251. na_values=['apple'])
  252. expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
  253. columns=['Test'])
  254. tm.assert_frame_equal(parsed, expected)
  255. parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
  256. na_values=['apple'])
  257. expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
  258. columns=['Test'])
  259. tm.assert_frame_equal(parsed, expected)
  260. # 13967
  261. excel = self.get_excelfile('test5', ext)
  262. parsed = read_excel(excel, 'Sheet1', keep_default_na=False,
  263. na_values=['apple'])
  264. expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']],
  265. columns=['Test'])
  266. tm.assert_frame_equal(parsed, expected)
  267. parsed = read_excel(excel, 'Sheet1', keep_default_na=True,
  268. na_values=['apple'])
  269. expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
  270. columns=['Test'])
  271. tm.assert_frame_equal(parsed, expected)
  272. @td.skip_if_no('xlrd', '1.0.1') # GH-22682
  273. def test_deprecated_sheetname(self, ext):
  274. # gh-17964
  275. excel = self.get_excelfile('test1', ext)
  276. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  277. read_excel(excel, sheetname='Sheet1')
  278. with pytest.raises(TypeError):
  279. read_excel(excel, sheet='Sheet1')
  280. @td.skip_if_no('xlrd', '1.0.1') # GH-22682
  281. def test_excel_table_sheet_by_index(self, ext):
  282. excel = self.get_excelfile('test1', ext)
  283. dfref = self.get_csv_refdf('test1')
  284. df1 = read_excel(excel, 0, index_col=0)
  285. df2 = read_excel(excel, 1, skiprows=[1], index_col=0)
  286. tm.assert_frame_equal(df1, dfref, check_names=False)
  287. tm.assert_frame_equal(df2, dfref, check_names=False)
  288. df1 = excel.parse(0, index_col=0)
  289. df2 = excel.parse(1, skiprows=[1], index_col=0)
  290. tm.assert_frame_equal(df1, dfref, check_names=False)
  291. tm.assert_frame_equal(df2, dfref, check_names=False)
  292. df3 = read_excel(excel, 0, index_col=0, skipfooter=1)
  293. tm.assert_frame_equal(df3, df1.iloc[:-1])
  294. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  295. df4 = read_excel(excel, 0, index_col=0, skip_footer=1)
  296. tm.assert_frame_equal(df3, df4)
  297. df3 = excel.parse(0, index_col=0, skipfooter=1)
  298. tm.assert_frame_equal(df3, df1.iloc[:-1])
  299. import xlrd
  300. with pytest.raises(xlrd.XLRDError):
  301. read_excel(excel, 'asdf')
  302. def test_excel_table(self, ext):
  303. dfref = self.get_csv_refdf('test1')
  304. df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0)
  305. df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1],
  306. index_col=0)
  307. # TODO add index to file
  308. tm.assert_frame_equal(df1, dfref, check_names=False)
  309. tm.assert_frame_equal(df2, dfref, check_names=False)
  310. df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0,
  311. skipfooter=1)
  312. tm.assert_frame_equal(df3, df1.iloc[:-1])
  313. def test_reader_special_dtypes(self, ext):
  314. expected = DataFrame.from_dict(OrderedDict([
  315. ("IntCol", [1, 2, -3, 4, 0]),
  316. ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]),
  317. ("BoolCol", [True, False, True, True, False]),
  318. ("StrCol", [1, 2, 3, 4, 5]),
  319. # GH5394 - this is why convert_float isn't vectorized
  320. ("Str2Col", ["a", 3, "c", "d", "e"]),
  321. ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31),
  322. datetime(1905, 1, 1), datetime(2013, 12, 14),
  323. datetime(2015, 3, 14)])
  324. ]))
  325. basename = 'test_types'
  326. # should read in correctly and infer types
  327. actual = self.get_exceldf(basename, ext, 'Sheet1')
  328. tm.assert_frame_equal(actual, expected)
  329. # if not coercing number, then int comes in as float
  330. float_expected = expected.copy()
  331. float_expected["IntCol"] = float_expected["IntCol"].astype(float)
  332. float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0
  333. actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False)
  334. tm.assert_frame_equal(actual, float_expected)
  335. # check setting Index (assuming xls and xlsx are the same here)
  336. for icol, name in enumerate(expected.columns):
  337. actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol)
  338. exp = expected.set_index(name)
  339. tm.assert_frame_equal(actual, exp)
  340. # convert_float and converters should be different but both accepted
  341. expected["StrCol"] = expected["StrCol"].apply(str)
  342. actual = self.get_exceldf(
  343. basename, ext, 'Sheet1', converters={"StrCol": str})
  344. tm.assert_frame_equal(actual, expected)
  345. no_convert_float = float_expected.copy()
  346. no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
  347. actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False,
  348. converters={"StrCol": str})
  349. tm.assert_frame_equal(actual, no_convert_float)
  350. # GH8212 - support for converters and missing values
  351. def test_reader_converters(self, ext):
  352. basename = 'test_converters'
  353. expected = DataFrame.from_dict(OrderedDict([
  354. ("IntCol", [1, 2, -3, -1000, 0]),
  355. ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]),
  356. ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']),
  357. ("StrCol", ['1', np.nan, '3', '4', '5']),
  358. ]))
  359. converters = {'IntCol': lambda x: int(x) if x != '' else -1000,
  360. 'FloatCol': lambda x: 10 * x if x else np.nan,
  361. 2: lambda x: 'Found' if x != '' else 'Not found',
  362. 3: lambda x: str(x) if x else '',
  363. }
  364. # should read in correctly and set types of single cells (not array
  365. # dtypes)
  366. actual = self.get_exceldf(basename, ext, 'Sheet1',
  367. converters=converters)
  368. tm.assert_frame_equal(actual, expected)
  369. def test_reader_dtype(self, ext):
  370. # GH 8212
  371. basename = 'testdtype'
  372. actual = self.get_exceldf(basename, ext)
  373. expected = DataFrame({
  374. 'a': [1, 2, 3, 4],
  375. 'b': [2.5, 3.5, 4.5, 5.5],
  376. 'c': [1, 2, 3, 4],
  377. 'd': [1.0, 2.0, np.nan, 4.0]}).reindex(
  378. columns=['a', 'b', 'c', 'd'])
  379. tm.assert_frame_equal(actual, expected)
  380. actual = self.get_exceldf(basename, ext,
  381. dtype={'a': 'float64',
  382. 'b': 'float32',
  383. 'c': str})
  384. expected['a'] = expected['a'].astype('float64')
  385. expected['b'] = expected['b'].astype('float32')
  386. expected['c'] = ['001', '002', '003', '004']
  387. tm.assert_frame_equal(actual, expected)
  388. with pytest.raises(ValueError):
  389. self.get_exceldf(basename, ext, dtype={'d': 'int64'})
  390. @pytest.mark.parametrize("dtype,expected", [
  391. (None,
  392. DataFrame({
  393. "a": [1, 2, 3, 4],
  394. "b": [2.5, 3.5, 4.5, 5.5],
  395. "c": [1, 2, 3, 4],
  396. "d": [1.0, 2.0, np.nan, 4.0]
  397. })),
  398. ({"a": "float64",
  399. "b": "float32",
  400. "c": str,
  401. "d": str
  402. },
  403. DataFrame({
  404. "a": Series([1, 2, 3, 4], dtype="float64"),
  405. "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"),
  406. "c": ["001", "002", "003", "004"],
  407. "d": ["1", "2", np.nan, "4"]
  408. })),
  409. ])
  410. def test_reader_dtype_str(self, ext, dtype, expected):
  411. # see gh-20377
  412. basename = "testdtype"
  413. actual = self.get_exceldf(basename, ext, dtype=dtype)
  414. tm.assert_frame_equal(actual, expected)
  415. def test_reading_all_sheets(self, ext):
  416. # Test reading all sheetnames by setting sheetname to None,
  417. # Ensure a dict is returned.
  418. # See PR #9450
  419. basename = 'test_multisheet'
  420. dfs = self.get_exceldf(basename, ext, sheet_name=None)
  421. # ensure this is not alphabetical to test order preservation
  422. expected_keys = ['Charlie', 'Alpha', 'Beta']
  423. tm.assert_contains_all(expected_keys, dfs.keys())
  424. # Issue 9930
  425. # Ensure sheet order is preserved
  426. assert expected_keys == list(dfs.keys())
  427. def test_reading_multiple_specific_sheets(self, ext):
  428. # Test reading specific sheetnames by specifying a mixed list
  429. # of integers and strings, and confirm that duplicated sheet
  430. # references (positions/names) are removed properly.
  431. # Ensure a dict is returned
  432. # See PR #9450
  433. basename = 'test_multisheet'
  434. # Explicitly request duplicates. Only the set should be returned.
  435. expected_keys = [2, 'Charlie', 'Charlie']
  436. dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys)
  437. expected_keys = list(set(expected_keys))
  438. tm.assert_contains_all(expected_keys, dfs.keys())
  439. assert len(expected_keys) == len(dfs.keys())
  440. def test_reading_all_sheets_with_blank(self, ext):
  441. # Test reading all sheetnames by setting sheetname to None,
  442. # In the case where some sheets are blank.
  443. # Issue #11711
  444. basename = 'blank_with_header'
  445. dfs = self.get_exceldf(basename, ext, sheet_name=None)
  446. expected_keys = ['Sheet1', 'Sheet2', 'Sheet3']
  447. tm.assert_contains_all(expected_keys, dfs.keys())
  448. # GH6403
  449. def test_read_excel_blank(self, ext):
  450. actual = self.get_exceldf('blank', ext, 'Sheet1')
  451. tm.assert_frame_equal(actual, DataFrame())
  452. def test_read_excel_blank_with_header(self, ext):
  453. expected = DataFrame(columns=['col_1', 'col_2'])
  454. actual = self.get_exceldf('blank_with_header', ext, 'Sheet1')
  455. tm.assert_frame_equal(actual, expected)
  456. @td.skip_if_no("xlwt")
  457. @td.skip_if_no("openpyxl")
  458. @pytest.mark.parametrize("header,expected", [
  459. (None, DataFrame([np.nan] * 4)),
  460. (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))
  461. ])
  462. def test_read_one_empty_col_no_header(self, ext, header, expected):
  463. # xref gh-12292
  464. filename = "no_header"
  465. df = pd.DataFrame(
  466. [["", 1, 100],
  467. ["", 2, 200],
  468. ["", 3, 300],
  469. ["", 4, 400]]
  470. )
  471. with ensure_clean(ext) as path:
  472. df.to_excel(path, filename, index=False, header=False)
  473. result = read_excel(path, filename, usecols=[0], header=header)
  474. tm.assert_frame_equal(result, expected)
  475. @td.skip_if_no("xlwt")
  476. @td.skip_if_no("openpyxl")
  477. @pytest.mark.parametrize("header,expected", [
  478. (None, DataFrame([0] + [np.nan] * 4)),
  479. (0, DataFrame([np.nan] * 4))
  480. ])
  481. def test_read_one_empty_col_with_header(self, ext, header, expected):
  482. filename = "with_header"
  483. df = pd.DataFrame(
  484. [["", 1, 100],
  485. ["", 2, 200],
  486. ["", 3, 300],
  487. ["", 4, 400]]
  488. )
  489. with ensure_clean(ext) as path:
  490. df.to_excel(path, 'with_header', index=False, header=True)
  491. result = read_excel(path, filename, usecols=[0], header=header)
  492. tm.assert_frame_equal(result, expected)
  493. @td.skip_if_no('openpyxl')
  494. @td.skip_if_no('xlwt')
  495. def test_set_column_names_in_parameter(self, ext):
  496. # GH 12870 : pass down column names associated with
  497. # keyword argument names
  498. refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'],
  499. [3, 'baz']], columns=['a', 'b'])
  500. with ensure_clean(ext) as pth:
  501. with ExcelWriter(pth) as writer:
  502. refdf.to_excel(writer, 'Data_no_head',
  503. header=False, index=False)
  504. refdf.to_excel(writer, 'Data_with_head', index=False)
  505. refdf.columns = ['A', 'B']
  506. with ExcelFile(pth) as reader:
  507. xlsdf_no_head = read_excel(reader, 'Data_no_head',
  508. header=None, names=['A', 'B'])
  509. xlsdf_with_head = read_excel(reader, 'Data_with_head',
  510. index_col=None, names=['A', 'B'])
  511. tm.assert_frame_equal(xlsdf_no_head, refdf)
  512. tm.assert_frame_equal(xlsdf_with_head, refdf)
  513. def test_date_conversion_overflow(self, ext):
  514. # GH 10001 : pandas.ExcelFile ignore parse_dates=False
  515. expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'],
  516. [pd.Timestamp('2016-03-16'), 'Jack Black'],
  517. [1e+20, 'Timothy Brown']],
  518. columns=['DateColWithBigInt', 'StringCol'])
  519. result = self.get_exceldf('testdateoverflow', ext)
  520. tm.assert_frame_equal(result, expected)
  521. @td.skip_if_no("xlrd", "1.0.1") # see gh-22682
  522. def test_sheet_name_and_sheetname(self, ext):
  523. # gh-10559: Minor improvement: Change "sheet_name" to "sheetname"
  524. # gh-10969: DOC: Consistent var names (sheetname vs sheet_name)
  525. # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name
  526. # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different
  527. # behavior for "sheetname" argument
  528. filename = "test1"
  529. sheet_name = "Sheet1"
  530. df_ref = self.get_csv_refdf(filename)
  531. df1 = self.get_exceldf(filename, ext,
  532. sheet_name=sheet_name, index_col=0) # doc
  533. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  534. with ignore_xlrd_time_clock_warning():
  535. df2 = self.get_exceldf(filename, ext, index_col=0,
  536. sheetname=sheet_name) # backward compat
  537. excel = self.get_excelfile(filename, ext)
  538. df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc
  539. with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
  540. df2_parse = excel.parse(index_col=0,
  541. sheetname=sheet_name) # backward compat
  542. tm.assert_frame_equal(df1, df_ref, check_names=False)
  543. tm.assert_frame_equal(df2, df_ref, check_names=False)
  544. tm.assert_frame_equal(df1_parse, df_ref, check_names=False)
  545. tm.assert_frame_equal(df2_parse, df_ref, check_names=False)
  546. def test_sheet_name_both_raises(self, ext):
  547. with pytest.raises(TypeError, match="Cannot specify both"):
  548. self.get_exceldf('test1', ext, sheetname='Sheet1',
  549. sheet_name='Sheet1')
  550. excel = self.get_excelfile('test1', ext)
  551. with pytest.raises(TypeError, match="Cannot specify both"):
  552. excel.parse(sheetname='Sheet1',
  553. sheet_name='Sheet1')
  554. def test_excel_read_buffer(self, ext):
  555. pth = os.path.join(self.dirpath, 'test1' + ext)
  556. expected = read_excel(pth, 'Sheet1', index_col=0)
  557. with open(pth, 'rb') as f:
  558. actual = read_excel(f, 'Sheet1', index_col=0)
  559. tm.assert_frame_equal(expected, actual)
  560. with open(pth, 'rb') as f:
  561. xls = ExcelFile(f)
  562. actual = read_excel(xls, 'Sheet1', index_col=0)
  563. tm.assert_frame_equal(expected, actual)
  564. def test_bad_engine_raises(self, ext):
  565. bad_engine = 'foo'
  566. with pytest.raises(ValueError, match="Unknown engine: foo"):
  567. read_excel('', engine=bad_engine)
  568. @tm.network
  569. def test_read_from_http_url(self, ext):
  570. url = ('https://raw.github.com/pandas-dev/pandas/master/'
  571. 'pandas/tests/io/data/test1' + ext)
  572. url_table = read_excel(url)
  573. local_table = self.get_exceldf('test1', ext)
  574. tm.assert_frame_equal(url_table, local_table)
  575. @td.skip_if_not_us_locale
  576. def test_read_from_s3_url(self, ext, s3_resource):
  577. # Bucket "pandas-test" created in tests/io/conftest.py
  578. file_name = os.path.join(self.dirpath, 'test1' + ext)
  579. with open(file_name, "rb") as f:
  580. s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext,
  581. Body=f)
  582. url = ('s3://pandas-test/test1' + ext)
  583. url_table = read_excel(url)
  584. local_table = self.get_exceldf('test1', ext)
  585. tm.assert_frame_equal(url_table, local_table)
  586. @pytest.mark.slow
  587. # ignore warning from old xlrd
  588. @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning")
  589. def test_read_from_file_url(self, ext):
  590. # FILE
  591. localtable = os.path.join(self.dirpath, 'test1' + ext)
  592. local_table = read_excel(localtable)
  593. try:
  594. url_table = read_excel('file://localhost/' + localtable)
  595. except URLError:
  596. # fails on some systems
  597. import platform
  598. pytest.skip("failing on %s" %
  599. ' '.join(platform.uname()).strip())
  600. tm.assert_frame_equal(url_table, local_table)
  601. @td.skip_if_no('pathlib')
  602. def test_read_from_pathlib_path(self, ext):
  603. # GH12655
  604. from pathlib import Path
  605. str_path = os.path.join(self.dirpath, 'test1' + ext)
  606. expected = read_excel(str_path, 'Sheet1', index_col=0)
  607. path_obj = Path(self.dirpath, 'test1' + ext)
  608. actual = read_excel(path_obj, 'Sheet1', index_col=0)
  609. tm.assert_frame_equal(expected, actual)
  610. @td.skip_if_no('py.path')
  611. def test_read_from_py_localpath(self, ext):
  612. # GH12655
  613. from py.path import local as LocalPath
  614. str_path = os.path.join(self.dirpath, 'test1' + ext)
  615. expected = read_excel(str_path, 'Sheet1', index_col=0)
  616. abs_dir = os.path.abspath(self.dirpath)
  617. path_obj = LocalPath(abs_dir).join('test1' + ext)
  618. actual = read_excel(path_obj, 'Sheet1', index_col=0)
  619. tm.assert_frame_equal(expected, actual)
  620. def test_reader_closes_file(self, ext):
  621. pth = os.path.join(self.dirpath, 'test1' + ext)
  622. f = open(pth, 'rb')
  623. with ExcelFile(f) as xlsx:
  624. # parses okay
  625. read_excel(xlsx, 'Sheet1', index_col=0)
  626. assert f.closed
  627. @td.skip_if_no("xlwt")
  628. @td.skip_if_no("openpyxl")
  629. def test_creating_and_reading_multiple_sheets(self, ext):
  630. # see gh-9450
  631. #
  632. # Test reading multiple sheets, from a runtime
  633. # created Excel file with multiple sheets.
  634. def tdf(col_sheet_name):
  635. d, i = [11, 22, 33], [1, 2, 3]
  636. return DataFrame(d, i, columns=[col_sheet_name])
  637. sheets = ["AAA", "BBB", "CCC"]
  638. dfs = [tdf(s) for s in sheets]
  639. dfs = dict(zip(sheets, dfs))
  640. with ensure_clean(ext) as pth:
  641. with ExcelWriter(pth) as ew:
  642. for sheetname, df in iteritems(dfs):
  643. df.to_excel(ew, sheetname)
  644. dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0)
  645. for s in sheets:
  646. tm.assert_frame_equal(dfs[s], dfs_returned[s])
  647. def test_reader_seconds(self, ext):
  648. # Test reading times with and without milliseconds. GH5945.
  649. expected = DataFrame.from_dict({"Time": [time(1, 2, 3),
  650. time(2, 45, 56, 100000),
  651. time(4, 29, 49, 200000),
  652. time(6, 13, 42, 300000),
  653. time(7, 57, 35, 400000),
  654. time(9, 41, 28, 500000),
  655. time(11, 25, 21, 600000),
  656. time(13, 9, 14, 700000),
  657. time(14, 53, 7, 800000),
  658. time(16, 37, 0, 900000),
  659. time(18, 20, 54)]})
  660. actual = self.get_exceldf('times_1900', ext, 'Sheet1')
  661. tm.assert_frame_equal(actual, expected)
  662. actual = self.get_exceldf('times_1904', ext, 'Sheet1')
  663. tm.assert_frame_equal(actual, expected)
  664. def test_read_excel_multiindex(self, ext):
  665. # see gh-4679
  666. mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]])
  667. mi_file = os.path.join(self.dirpath, "testmultiindex" + ext)
  668. # "mi_column" sheet
  669. expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True],
  670. [2, 3.5, pd.Timestamp("2015-01-02"), False],
  671. [3, 4.5, pd.Timestamp("2015-01-03"), False],
  672. [4, 5.5, pd.Timestamp("2015-01-04"), True]],
  673. columns=mi)
  674. actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0)
  675. tm.assert_frame_equal(actual, expected)
  676. # "mi_index" sheet
  677. expected.index = mi
  678. expected.columns = ["a", "b", "c", "d"]
  679. actual = read_excel(mi_file, "mi_index", index_col=[0, 1])
  680. tm.assert_frame_equal(actual, expected, check_names=False)
  681. # "both" sheet
  682. expected.columns = mi
  683. actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1])
  684. tm.assert_frame_equal(actual, expected, check_names=False)
  685. # "mi_index_name" sheet
  686. expected.columns = ["a", "b", "c", "d"]
  687. expected.index = mi.set_names(["ilvl1", "ilvl2"])
  688. actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1])
  689. tm.assert_frame_equal(actual, expected)
  690. # "mi_column_name" sheet
  691. expected.index = list(range(4))
  692. expected.columns = mi.set_names(["c1", "c2"])
  693. actual = read_excel(mi_file, "mi_column_name",
  694. header=[0, 1], index_col=0)
  695. tm.assert_frame_equal(actual, expected)
  696. # see gh-11317
  697. # "name_with_int" sheet
  698. expected.columns = mi.set_levels(
  699. [1, 2], level=1).set_names(["c1", "c2"])
  700. actual = read_excel(mi_file, "name_with_int",
  701. index_col=0, header=[0, 1])
  702. tm.assert_frame_equal(actual, expected)
  703. # "both_name" sheet
  704. expected.columns = mi.set_names(["c1", "c2"])
  705. expected.index = mi.set_names(["ilvl1", "ilvl2"])
  706. actual = read_excel(mi_file, "both_name",
  707. index_col=[0, 1], header=[0, 1])
  708. tm.assert_frame_equal(actual, expected)
  709. # "both_skiprows" sheet
  710. actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1],
  711. header=[0, 1], skiprows=2)
  712. tm.assert_frame_equal(actual, expected)
  713. def test_read_excel_multiindex_header_only(self, ext):
  714. # see gh-11733.
  715. #
  716. # Don't try to parse a header name if there isn't one.
  717. mi_file = os.path.join(self.dirpath, "testmultiindex" + ext)
  718. result = read_excel(mi_file, "index_col_none", header=[0, 1])
  719. exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")])
  720. expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns)
  721. tm.assert_frame_equal(result, expected)
  722. @td.skip_if_no("xlsxwriter")
  723. def test_read_excel_multiindex_empty_level(self, ext):
  724. # see gh-12453
  725. with ensure_clean(ext) as path:
  726. df = DataFrame({
  727. ("One", "x"): {0: 1},
  728. ("Two", "X"): {0: 3},
  729. ("Two", "Y"): {0: 7},
  730. ("Zero", ""): {0: 0}
  731. })
  732. expected = DataFrame({
  733. ("One", "x"): {0: 1},
  734. ("Two", "X"): {0: 3},
  735. ("Two", "Y"): {0: 7},
  736. ("Zero", "Unnamed: 4_level_1"): {0: 0}
  737. })
  738. df.to_excel(path)
  739. actual = pd.read_excel(path, header=[0, 1], index_col=0)
  740. tm.assert_frame_equal(actual, expected)
  741. df = pd.DataFrame({
  742. ("Beg", ""): {0: 0},
  743. ("Middle", "x"): {0: 1},
  744. ("Tail", "X"): {0: 3},
  745. ("Tail", "Y"): {0: 7}
  746. })
  747. expected = pd.DataFrame({
  748. ("Beg", "Unnamed: 1_level_1"): {0: 0},
  749. ("Middle", "x"): {0: 1},
  750. ("Tail", "X"): {0: 3},
  751. ("Tail", "Y"): {0: 7}
  752. })
  753. df.to_excel(path)
  754. actual = pd.read_excel(path, header=[0, 1], index_col=0)
  755. tm.assert_frame_equal(actual, expected)
  756. @td.skip_if_no("xlsxwriter")
  757. @pytest.mark.parametrize("c_idx_names", [True, False])
  758. @pytest.mark.parametrize("r_idx_names", [True, False])
  759. @pytest.mark.parametrize("c_idx_levels", [1, 3])
  760. @pytest.mark.parametrize("r_idx_levels", [1, 3])
  761. def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names,
  762. c_idx_levels, r_idx_levels):
  763. # see gh-4679
  764. with ensure_clean(ext) as pth:
  765. if c_idx_levels == 1 and c_idx_names:
  766. pytest.skip("Column index name cannot be "
  767. "serialized unless it's a MultiIndex")
  768. # Empty name case current read in as
  769. # unnamed levels, not Nones.
  770. check_names = r_idx_names or r_idx_levels <= 1
  771. df = mkdf(5, 5, c_idx_names, r_idx_names,
  772. c_idx_levels, r_idx_levels)
  773. df.to_excel(pth)
  774. act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
  775. header=list(range(c_idx_levels)))
  776. tm.assert_frame_equal(df, act, check_names=check_names)
  777. df.iloc[0, :] = np.nan
  778. df.to_excel(pth)
  779. act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
  780. header=list(range(c_idx_levels)))
  781. tm.assert_frame_equal(df, act, check_names=check_names)
  782. df.iloc[-1, :] = np.nan
  783. df.to_excel(pth)
  784. act = pd.read_excel(pth, index_col=list(range(r_idx_levels)),
  785. header=list(range(c_idx_levels)))
  786. tm.assert_frame_equal(df, act, check_names=check_names)
  787. def test_excel_old_index_format(self, ext):
  788. # see gh-4679
  789. filename = "test_index_name_pre17" + ext
  790. in_file = os.path.join(self.dirpath, filename)
  791. # We detect headers to determine if index names exist, so
  792. # that "index" name in the "names" version of the data will
  793. # now be interpreted as rows that include null data.
  794. data = np.array([[None, None, None, None, None],
  795. ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
  796. ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
  797. ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
  798. ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
  799. ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]])
  800. columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
  801. mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1",
  802. "R_l0_g2", "R_l0_g3", "R_l0_g4"],
  803. ["R1", "R_l1_g0", "R_l1_g1",
  804. "R_l1_g2", "R_l1_g3", "R_l1_g4"]],
  805. codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]],
  806. names=[None, None])
  807. si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2",
  808. "R_l0_g3", "R_l0_g4"], name=None)
  809. expected = pd.DataFrame(data, index=si, columns=columns)
  810. actual = pd.read_excel(in_file, "single_names", index_col=0)
  811. tm.assert_frame_equal(actual, expected)
  812. expected.index = mi
  813. actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1])
  814. tm.assert_frame_equal(actual, expected)
  815. # The analogous versions of the "names" version data
  816. # where there are explicitly no names for the indices.
  817. data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"],
  818. ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"],
  819. ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"],
  820. ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"],
  821. ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]])
  822. columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"]
  823. mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2",
  824. "R_l0_g3", "R_l0_g4"],
  825. ["R_l1_g0", "R_l1_g1", "R_l1_g2",
  826. "R_l1_g3", "R_l1_g4"]],
  827. codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]],
  828. names=[None, None])
  829. si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2",
  830. "R_l0_g3", "R_l0_g4"], name=None)
  831. expected = pd.DataFrame(data, index=si, columns=columns)
  832. actual = pd.read_excel(in_file, "single_no_names", index_col=0)
  833. tm.assert_frame_equal(actual, expected)
  834. expected.index = mi
  835. actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1])
  836. tm.assert_frame_equal(actual, expected, check_names=False)
  837. def test_read_excel_bool_header_arg(self, ext):
  838. # GH 6114
  839. for arg in [True, False]:
  840. with pytest.raises(TypeError):
  841. pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
  842. header=arg)
  843. def test_read_excel_chunksize(self, ext):
  844. # GH 8011
  845. with pytest.raises(NotImplementedError):
  846. pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
  847. chunksize=100)
  848. @td.skip_if_no("xlwt")
  849. @td.skip_if_no("openpyxl")
  850. def test_read_excel_parse_dates(self, ext):
  851. # see gh-11544, gh-12051
  852. df = DataFrame(
  853. {"col": [1, 2, 3],
  854. "date_strings": pd.date_range("2012-01-01", periods=3)})
  855. df2 = df.copy()
  856. df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y")
  857. with ensure_clean(ext) as pth:
  858. df2.to_excel(pth)
  859. res = read_excel(pth, index_col=0)
  860. tm.assert_frame_equal(df2, res)
  861. res = read_excel(pth, parse_dates=["date_strings"], index_col=0)
  862. tm.assert_frame_equal(df, res)
  863. date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y")
  864. res = read_excel(pth, parse_dates=["date_strings"],
  865. date_parser=date_parser, index_col=0)
  866. tm.assert_frame_equal(df, res)
  867. def test_read_excel_skiprows_list(self, ext):
  868. # GH 4903
  869. actual = pd.read_excel(os.path.join(self.dirpath,
  870. 'testskiprows' + ext),
  871. 'skiprows_list', skiprows=[0, 2])
  872. expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True],
  873. [2, 3.5, pd.Timestamp('2015-01-02'), False],
  874. [3, 4.5, pd.Timestamp('2015-01-03'), False],
  875. [4, 5.5, pd.Timestamp('2015-01-04'), True]],
  876. columns=['a', 'b', 'c', 'd'])
  877. tm.assert_frame_equal(actual, expected)
  878. actual = pd.read_excel(os.path.join(self.dirpath,
  879. 'testskiprows' + ext),
  880. 'skiprows_list', skiprows=np.array([0, 2]))
  881. tm.assert_frame_equal(actual, expected)
  882. def test_read_excel_nrows(self, ext):
  883. # GH 16645
  884. num_rows_to_pull = 5
  885. actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
  886. nrows=num_rows_to_pull)
  887. expected = pd.read_excel(os.path.join(self.dirpath,
  888. 'test1' + ext))
  889. expected = expected[:num_rows_to_pull]
  890. tm.assert_frame_equal(actual, expected)
  891. def test_read_excel_nrows_greater_than_nrows_in_file(self, ext):
  892. # GH 16645
  893. expected = pd.read_excel(os.path.join(self.dirpath,
  894. 'test1' + ext))
  895. num_records_in_file = len(expected)
  896. num_rows_to_pull = num_records_in_file + 10
  897. actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
  898. nrows=num_rows_to_pull)
  899. tm.assert_frame_equal(actual, expected)
  900. def test_read_excel_nrows_non_integer_parameter(self, ext):
  901. # GH 16645
  902. msg = "'nrows' must be an integer >=0"
  903. with pytest.raises(ValueError, match=msg):
  904. pd.read_excel(os.path.join(self.dirpath, 'test1' + ext),
  905. nrows='5')
  906. def test_read_excel_squeeze(self, ext):
  907. # GH 12157
  908. f = os.path.join(self.dirpath, 'test_squeeze' + ext)
  909. actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True)
  910. expected = pd.Series([2, 3, 4], [4, 5, 6], name='b')
  911. expected.index.name = 'a'
  912. tm.assert_series_equal(actual, expected)
  913. actual = pd.read_excel(f, 'two_columns', squeeze=True)
  914. expected = pd.DataFrame({'a': [4, 5, 6],
  915. 'b': [2, 3, 4]})
  916. tm.assert_frame_equal(actual, expected)
  917. actual = pd.read_excel(f, 'one_column', squeeze=True)
  918. expected = pd.Series([1, 2, 3], name='a')
  919. tm.assert_series_equal(actual, expected)
  920. @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm'])
  921. class TestXlrdReader(ReadingTestsBase):
  922. """
  923. This is the base class for the xlrd tests, and 3 different file formats
  924. are supported: xls, xlsx, xlsm
  925. """
  926. @td.skip_if_no("xlwt")
  927. def test_read_xlrd_book(self, ext):
  928. import xlrd
  929. df = self.frame
  930. engine = "xlrd"
  931. sheet_name = "SheetA"
  932. with ensure_clean(ext) as pth:
  933. df.to_excel(pth, sheet_name)
  934. book = xlrd.open_workbook(pth)
  935. with ExcelFile(book, engine=engine) as xl:
  936. result = read_excel(xl, sheet_name, index_col=0)
  937. tm.assert_frame_equal(df, result)
  938. result = read_excel(book, sheet_name=sheet_name,
  939. engine=engine, index_col=0)
  940. tm.assert_frame_equal(df, result)
  941. class _WriterBase(SharedItems):
  942. @pytest.fixture(autouse=True)
  943. def set_engine_and_path(self, request, merge_cells, engine, ext):
  944. """Fixture to set engine and open file for use in each test case
  945. Rather than requiring `engine=...` to be provided explicitly as an
  946. argument in each test, this fixture sets a global option to dictate
  947. which engine should be used to write Excel files. After executing
  948. the test it rolls back said change to the global option.
  949. It also uses a context manager to open a temporary excel file for
  950. the function to write to, accessible via `self.path`
  951. Notes
  952. -----
  953. This fixture will run as part of each test method defined in the
  954. class and any subclasses, on account of the `autouse=True`
  955. argument
  956. """
  957. option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.'))
  958. prev_engine = get_option(option_name)
  959. set_option(option_name, engine)
  960. with ensure_clean(ext) as path:
  961. self.path = path
  962. yield
  963. set_option(option_name, prev_engine) # Roll back option change
  964. @pytest.mark.parametrize("merge_cells", [True, False])
  965. @pytest.mark.parametrize("engine,ext", [
  966. pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif(
  967. not td.safe_import('openpyxl'), reason='No openpyxl')),
  968. pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif(
  969. not td.safe_import('openpyxl'), reason='No openpyxl')),
  970. pytest.param('xlwt', '.xls', marks=pytest.mark.skipif(
  971. not td.safe_import('xlwt'), reason='No xlwt')),
  972. pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif(
  973. not td.safe_import('xlsxwriter'), reason='No xlsxwriter'))
  974. ])
  975. class TestExcelWriter(_WriterBase):
  976. # Base class for test cases to run with different Excel writers.
  977. def test_excel_sheet_by_name_raise(self, *_):
  978. import xlrd
  979. gt = DataFrame(np.random.randn(10, 2))
  980. gt.to_excel(self.path)
  981. xl = ExcelFile(self.path)
  982. df = read_excel(xl, 0, index_col=0)
  983. tm.assert_frame_equal(gt, df)
  984. with pytest.raises(xlrd.XLRDError):
  985. read_excel(xl, "0")
  986. def test_excel_writer_context_manager(self, *_):
  987. with ExcelWriter(self.path) as writer:
  988. self.frame.to_excel(writer, "Data1")
  989. self.frame2.to_excel(writer, "Data2")
  990. with ExcelFile(self.path) as reader:
  991. found_df = read_excel(reader, "Data1", index_col=0)
  992. found_df2 = read_excel(reader, "Data2", index_col=0)
  993. tm.assert_frame_equal(found_df, self.frame)
  994. tm.assert_frame_equal(found_df2, self.frame2)
  995. def test_roundtrip(self, merge_cells, engine, ext):
  996. self.frame['A'][:5] = nan
  997. self.frame.to_excel(self.path, 'test1')
  998. self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
  999. self.frame.to_excel(self.path, 'test1', header=False)
  1000. self.frame.to_excel(self.path, 'test1', index=False)
  1001. # test roundtrip
  1002. self.frame.to_excel(self.path, 'test1')
  1003. recons = read_excel(self.path, 'test1', index_col=0)
  1004. tm.assert_frame_equal(self.frame, recons)
  1005. self.frame.to_excel(self.path, 'test1', index=False)
  1006. recons = read_excel(self.path, 'test1', index_col=None)
  1007. recons.index = self.frame.index
  1008. tm.assert_frame_equal(self.frame, recons)
  1009. self.frame.to_excel(self.path, 'test1', na_rep='NA')
  1010. recons = read_excel(self.path, 'test1', index_col=0, na_values=['NA'])
  1011. tm.assert_frame_equal(self.frame, recons)
  1012. # GH 3611
  1013. self.frame.to_excel(self.path, 'test1', na_rep='88')
  1014. recons = read_excel(self.path, 'test1', index_col=0, na_values=['88'])
  1015. tm.assert_frame_equal(self.frame, recons)
  1016. self.frame.to_excel(self.path, 'test1', na_rep='88')
  1017. recons = read_excel(self.path, 'test1', index_col=0,
  1018. na_values=[88, 88.0])
  1019. tm.assert_frame_equal(self.frame, recons)
  1020. # GH 6573
  1021. self.frame.to_excel(self.path, 'Sheet1')
  1022. recons = read_excel(self.path, index_col=0)
  1023. tm.assert_frame_equal(self.frame, recons)
  1024. self.frame.to_excel(self.path, '0')
  1025. recons = read_excel(self.path, index_col=0)
  1026. tm.assert_frame_equal(self.frame, recons)
  1027. # GH 8825 Pandas Series should provide to_excel method
  1028. s = self.frame["A"]
  1029. s.to_excel(self.path)
  1030. recons = read_excel(self.path, index_col=0)
  1031. tm.assert_frame_equal(s.to_frame(), recons)
  1032. def test_mixed(self, merge_cells, engine, ext):
  1033. self.mixed_frame.to_excel(self.path, 'test1')
  1034. reader = ExcelFile(self.path)
  1035. recons = read_excel(reader, 'test1', index_col=0)
  1036. tm.assert_frame_equal(self.mixed_frame, recons)
  1037. def test_ts_frame(self, *_):
  1038. df = tm.makeTimeDataFrame()[:5]
  1039. df.to_excel(self.path, "test1")
  1040. reader = ExcelFile(self.path)
  1041. recons = read_excel(reader, "test1", index_col=0)
  1042. tm.assert_frame_equal(df, recons)
  1043. def test_basics_with_nan(self, merge_cells, engine, ext):
  1044. self.frame['A'][:5] = nan
  1045. self.frame.to_excel(self.path, 'test1')
  1046. self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
  1047. self.frame.to_excel(self.path, 'test1', header=False)
  1048. self.frame.to_excel(self.path, 'test1', index=False)
  1049. @pytest.mark.parametrize("np_type", [
  1050. np.int8, np.int16, np.int32, np.int64])
  1051. def test_int_types(self, merge_cells, engine, ext, np_type):
  1052. # Test np.int values read come back as int
  1053. # (rather than float which is Excel's format).
  1054. frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)),
  1055. dtype=np_type)
  1056. frame.to_excel(self.path, "test1")
  1057. reader = ExcelFile(self.path)
  1058. recons = read_excel(reader, "test1", index_col=0)
  1059. int_frame = frame.astype(np.int64)
  1060. tm.assert_frame_equal(int_frame, recons)
  1061. recons2 = read_excel(self.path, "test1", index_col=0)
  1062. tm.assert_frame_equal(int_frame, recons2)
  1063. # Test with convert_float=False comes back as float.
  1064. float_frame = frame.astype(float)
  1065. recons = read_excel(self.path, "test1",
  1066. convert_float=False, index_col=0)
  1067. tm.assert_frame_equal(recons, float_frame,
  1068. check_index_type=False,
  1069. check_column_type=False)
  1070. @pytest.mark.parametrize("np_type", [
  1071. np.float16, np.float32, np.float64])
  1072. def test_float_types(self, merge_cells, engine, ext, np_type):
  1073. # Test np.float values read come back as float.
  1074. frame = DataFrame(np.random.random_sample(10), dtype=np_type)
  1075. frame.to_excel(self.path, "test1")
  1076. reader = ExcelFile(self.path)
  1077. recons = read_excel(reader, "test1", index_col=0).astype(np_type)
  1078. tm.assert_frame_equal(frame, recons, check_dtype=False)
  1079. @pytest.mark.parametrize("np_type", [np.bool8, np.bool_])
  1080. def test_bool_types(self, merge_cells, engine, ext, np_type):
  1081. # Test np.bool values read come back as float.
  1082. frame = (DataFrame([1, 0, True, False], dtype=np_type))
  1083. frame.to_excel(self.path, "test1")
  1084. reader = ExcelFile(self.path)
  1085. recons = read_excel(reader, "test1", index_col=0).astype(np_type)
  1086. tm.assert_frame_equal(frame, recons)
  1087. def test_inf_roundtrip(self, *_):
  1088. frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)])
  1089. frame.to_excel(self.path, "test1")
  1090. reader = ExcelFile(self.path)
  1091. recons = read_excel(reader, "test1", index_col=0)
  1092. tm.assert_frame_equal(frame, recons)
  1093. def test_sheets(self, merge_cells, engine, ext):
  1094. self.frame['A'][:5] = nan
  1095. self.frame.to_excel(self.path, 'test1')
  1096. self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
  1097. self.frame.to_excel(self.path, 'test1', header=False)
  1098. self.frame.to_excel(self.path, 'test1', index=False)
  1099. # Test writing to separate sheets
  1100. writer = ExcelWriter(self.path)
  1101. self.frame.to_excel(writer, 'test1')
  1102. self.tsframe.to_excel(writer, 'test2')
  1103. writer.save()
  1104. reader = ExcelFile(self.path)
  1105. recons = read_excel(reader, 'test1', index_col=0)
  1106. tm.assert_frame_equal(self.frame, recons)
  1107. recons = read_excel(reader, 'test2', index_col=0)
  1108. tm.assert_frame_equal(self.tsframe, recons)
  1109. assert 2 == len(reader.sheet_names)
  1110. assert 'test1' == reader.sheet_names[0]
  1111. assert 'test2' == reader.sheet_names[1]
  1112. def test_colaliases(self, merge_cells, engine, ext):
  1113. self.frame['A'][:5] = nan
  1114. self.frame.to_excel(self.path, 'test1')
  1115. self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
  1116. self.frame.to_excel(self.path, 'test1', header=False)
  1117. self.frame.to_excel(self.path, 'test1', index=False)
  1118. # column aliases
  1119. col_aliases = Index(['AA', 'X', 'Y', 'Z'])
  1120. self.frame2.to_excel(self.path, 'test1', header=col_aliases)
  1121. reader = ExcelFile(self.path)
  1122. rs = read_excel(reader, 'test1', index_col=0)
  1123. xp = self.frame2.copy()
  1124. xp.columns = col_aliases
  1125. tm.assert_frame_equal(xp, rs)
  1126. def test_roundtrip_indexlabels(self, merge_cells, engine, ext):
  1127. self.frame['A'][:5] = nan
  1128. self.frame.to_excel(self.path, 'test1')
  1129. self.frame.to_excel(self.path, 'test1', columns=['A', 'B'])
  1130. self.frame.to_excel(self.path, 'test1', header=False)
  1131. self.frame.to_excel(self.path, 'test1', index=False)
  1132. # test index_label
  1133. frame = (DataFrame(np.random.randn(10, 2)) >= 0)
  1134. frame.to_excel(self.path, 'test1',
  1135. index_label=['test'],
  1136. merge_cells=merge_cells)
  1137. reader = ExcelFile(self.path)
  1138. recons = read_excel(reader, 'test1',
  1139. index_col=0,
  1140. ).astype(np.int64)
  1141. frame.index.names = ['test']
  1142. assert frame.index.names == recons.index.names
  1143. frame = (DataFrame(np.random.randn(10, 2)) >= 0)
  1144. frame.to_excel(self.path,
  1145. 'test1',
  1146. index_label=['test', 'dummy', 'dummy2'],
  1147. merge_cells=merge_cells)
  1148. reader = ExcelFile(self.path)
  1149. recons = read_excel(reader, 'test1',
  1150. index_col=0,
  1151. ).astype(np.int64)
  1152. frame.index.names = ['test']
  1153. assert frame.index.names == recons.index.names
  1154. frame = (DataFrame(np.random.randn(10, 2)) >= 0)
  1155. frame.to_excel(self.path,
  1156. 'test1',
  1157. index_label='test',
  1158. merge_cells=merge_cells)
  1159. reader = ExcelFile(self.path)
  1160. recons = read_excel(reader, 'test1',
  1161. index_col=0,
  1162. ).astype(np.int64)
  1163. frame.index.names = ['test']
  1164. tm.assert_frame_equal(frame, recons.astype(bool))
  1165. self.frame.to_excel(self.path,
  1166. 'test1',
  1167. columns=['A', 'B', 'C', 'D'],
  1168. index=False, merge_cells=merge_cells)
  1169. # take 'A' and 'B' as indexes (same row as cols 'C', 'D')
  1170. df = self.frame.copy()
  1171. df = df.set_index(['A', 'B'])
  1172. reader = ExcelFile(self.path)
  1173. recons = read_excel(reader, 'test1', index_col=[0, 1])
  1174. tm.assert_frame_equal(df, recons, check_less_precise=True)
  1175. def test_excel_roundtrip_indexname(self, merge_cells, engine, ext):
  1176. df = DataFrame(np.random.randn(10, 4))
  1177. df.index.name = 'foo'
  1178. df.to_excel(self.path, merge_cells=merge_cells)
  1179. xf = ExcelFile(self.path)
  1180. result = read_excel(xf, xf.sheet_names[0],
  1181. index_col=0)
  1182. tm.assert_frame_equal(result, df)
  1183. assert result.index.name == 'foo'
  1184. def test_excel_roundtrip_datetime(self, merge_cells, *_):
  1185. # datetime.date, not sure what to test here exactly
  1186. tsf = self.tsframe.copy()
  1187. tsf.index = [x.date() for x in self.tsframe.index]
  1188. tsf.to_excel(self.path, "test1", merge_cells=merge_cells)
  1189. reader = ExcelFile(self.path)
  1190. recons = read_excel(reader, "test1", index_col=0)
  1191. tm.assert_frame_equal(self.tsframe, recons)
  1192. def test_excel_date_datetime_format(self, merge_cells, engine, ext):
  1193. # see gh-4133
  1194. #
  1195. # Excel output format strings
  1196. df = DataFrame([[date(2014, 1, 31),
  1197. date(1999, 9, 24)],
  1198. [datetime(1998, 5, 26, 23, 33, 4),
  1199. datetime(2014, 2, 28, 13, 5, 13)]],
  1200. index=["DATE", "DATETIME"], columns=["X", "Y"])
  1201. df_expected = DataFrame([[datetime(2014, 1, 31),
  1202. datetime(1999, 9, 24)],
  1203. [datetime(1998, 5, 26, 23, 33, 4),
  1204. datetime(2014, 2, 28, 13, 5, 13)]],
  1205. index=["DATE", "DATETIME"], columns=["X", "Y"])
  1206. with ensure_clean(ext) as filename2:
  1207. writer1 = ExcelWriter(self.path)
  1208. writer2 = ExcelWriter(filename2,
  1209. date_format="DD.MM.YYYY",
  1210. datetime_format="DD.MM.YYYY HH-MM-SS")
  1211. df.to_excel(writer1, "test1")
  1212. df.to_excel(writer2, "test1")
  1213. writer1.close()
  1214. writer2.close()
  1215. reader1 = ExcelFile(self.path)
  1216. reader2 = ExcelFile(filename2)
  1217. rs1 = read_excel(reader1, "test1", index_col=0)
  1218. rs2 = read_excel(reader2, "test1", index_col=0)
  1219. tm.assert_frame_equal(rs1, rs2)
  1220. # Since the reader returns a datetime object for dates,
  1221. # we need to use df_expected to check the result.
  1222. tm.assert_frame_equal(rs2, df_expected)
  1223. def test_to_excel_interval_no_labels(self, *_):
  1224. # see gh-19242
  1225. #
  1226. # Test writing Interval without labels.
  1227. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
  1228. dtype=np.int64)
  1229. expected = frame.copy()
  1230. frame["new"] = pd.cut(frame[0], 10)
  1231. expected["new"] = pd.cut(expected[0], 10).astype(str)
  1232. frame.to_excel(self.path, "test1")
  1233. reader = ExcelFile(self.path)
  1234. recons = read_excel(reader, "test1", index_col=0)
  1235. tm.assert_frame_equal(expected, recons)
  1236. def test_to_excel_interval_labels(self, *_):
  1237. # see gh-19242
  1238. #
  1239. # Test writing Interval with labels.
  1240. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
  1241. dtype=np.int64)
  1242. expected = frame.copy()
  1243. intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E",
  1244. "F", "G", "H", "I", "J"])
  1245. frame["new"] = intervals
  1246. expected["new"] = pd.Series(list(intervals))
  1247. frame.to_excel(self.path, "test1")
  1248. reader = ExcelFile(self.path)
  1249. recons = read_excel(reader, "test1", index_col=0)
  1250. tm.assert_frame_equal(expected, recons)
  1251. def test_to_excel_timedelta(self, *_):
  1252. # see gh-19242, gh-9155
  1253. #
  1254. # Test writing timedelta to xls.
  1255. frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)),
  1256. columns=["A"], dtype=np.int64)
  1257. expected = frame.copy()
  1258. frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x))
  1259. expected["new"] = expected["A"].apply(
  1260. lambda x: timedelta(seconds=x).total_seconds() / float(86400))
  1261. frame.to_excel(self.path, "test1")
  1262. reader = ExcelFile(self.path)
  1263. recons = read_excel(reader, "test1", index_col=0)
  1264. tm.assert_frame_equal(expected, recons)
  1265. def test_to_excel_periodindex(self, merge_cells, engine, ext):
  1266. frame = self.tsframe
  1267. xp = frame.resample('M', kind='period').mean()
  1268. xp.to_excel(self.path, 'sht1')
  1269. reader = ExcelFile(self.path)
  1270. rs = read_excel(reader, 'sht1', index_col=0)
  1271. tm.assert_frame_equal(xp, rs.to_period('M'))
  1272. def test_to_excel_multiindex(self, merge_cells, engine, ext):
  1273. frame = self.frame
  1274. arrays = np.arange(len(frame.index) * 2).reshape(2, -1)
  1275. new_index = MultiIndex.from_arrays(arrays,
  1276. names=['first', 'second'])
  1277. frame.index = new_index
  1278. frame.to_excel(self.path, 'test1', header=False)
  1279. frame.to_excel(self.path, 'test1', columns=['A', 'B'])
  1280. # round trip
  1281. frame.to_excel(self.path, 'test1', merge_cells=merge_cells)
  1282. reader = ExcelFile(self.path)
  1283. df = read_excel(reader, 'test1', index_col=[0, 1])
  1284. tm.assert_frame_equal(frame, df)
  1285. # GH13511
  1286. def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext):
  1287. frame = pd.DataFrame({'A': [None, 2, 3],
  1288. 'B': [10, 20, 30],
  1289. 'C': np.random.sample(3)})
  1290. frame = frame.set_index(['A', 'B'])
  1291. frame.to_excel(self.path, merge_cells=merge_cells)
  1292. df = read_excel(self.path, index_col=[0, 1])
  1293. tm.assert_frame_equal(frame, df)
  1294. # Test for Issue 11328. If column indices are integers, make
  1295. # sure they are handled correctly for either setting of
  1296. # merge_cells
  1297. def test_to_excel_multiindex_cols(self, merge_cells, engine, ext):
  1298. frame = self.frame
  1299. arrays = np.arange(len(frame.index) * 2).reshape(2, -1)
  1300. new_index = MultiIndex.from_arrays(arrays,
  1301. names=['first', 'second'])
  1302. frame.index = new_index
  1303. new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2),
  1304. (50, 1), (50, 2)])
  1305. frame.columns = new_cols_index
  1306. header = [0, 1]
  1307. if not merge_cells:
  1308. header = 0
  1309. # round trip
  1310. frame.to_excel(self.path, 'test1', merge_cells=merge_cells)
  1311. reader = ExcelFile(self.path)
  1312. df = read_excel(reader, 'test1', header=header,
  1313. index_col=[0, 1])
  1314. if not merge_cells:
  1315. fm = frame.columns.format(sparsify=False,
  1316. adjoin=False, names=False)
  1317. frame.columns = [".".join(map(str, q)) for q in zip(*fm)]
  1318. tm.assert_frame_equal(frame, df)
  1319. def test_to_excel_multiindex_dates(self, merge_cells, engine, ext):
  1320. # try multiindex with dates
  1321. tsframe = self.tsframe.copy()
  1322. new_index = [tsframe.index, np.arange(len(tsframe.index))]
  1323. tsframe.index = MultiIndex.from_arrays(new_index)
  1324. tsframe.index.names = ['time', 'foo']
  1325. tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells)
  1326. reader = ExcelFile(self.path)
  1327. recons = read_excel(reader, 'test1',
  1328. index_col=[0, 1])
  1329. tm.assert_frame_equal(tsframe, recons)
  1330. assert recons.index.names == ('time', 'foo')
  1331. def test_to_excel_multiindex_no_write_index(self, merge_cells, engine,
  1332. ext):
  1333. # Test writing and re-reading a MI witout the index. GH 5616.
  1334. # Initial non-MI frame.
  1335. frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]})
  1336. # Add a MI.
  1337. frame2 = frame1.copy()
  1338. multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)])
  1339. frame2.index = multi_index
  1340. # Write out to Excel without the index.
  1341. frame2.to_excel(self.path, 'test1', index=False)
  1342. # Read it back in.
  1343. reader = ExcelFile(self.path)
  1344. frame3 = read_excel(reader, 'test1')
  1345. # Test that it is the same as the initial frame.
  1346. tm.assert_frame_equal(frame1, frame3)
  1347. def test_to_excel_float_format(self, *_):
  1348. df = DataFrame([[0.123456, 0.234567, 0.567567],
  1349. [12.32112, 123123.2, 321321.2]],
  1350. index=["A", "B"], columns=["X", "Y", "Z"])
  1351. df.to_excel(self.path, "test1", float_format="%.2f")
  1352. reader = ExcelFile(self.path)
  1353. result = read_excel(reader, "test1", index_col=0)
  1354. expected = DataFrame([[0.12, 0.23, 0.57],
  1355. [12.32, 123123.20, 321321.20]],
  1356. index=["A", "B"], columns=["X", "Y", "Z"])
  1357. tm.assert_frame_equal(result, expected)
  1358. def test_to_excel_output_encoding(self, merge_cells, engine, ext):
  1359. # Avoid mixed inferred_type.
  1360. df = DataFrame([[u"\u0192", u"\u0193", u"\u0194"],
  1361. [u"\u0195", u"\u0196", u"\u0197"]],
  1362. index=[u"A\u0192", u"B"],
  1363. columns=[u"X\u0193", u"Y", u"Z"])
  1364. with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename:
  1365. df.to_excel(filename, sheet_name="TestSheet", encoding="utf8")
  1366. result = read_excel(filename, "TestSheet",
  1367. encoding="utf8", index_col=0)
  1368. tm.assert_frame_equal(result, df)
  1369. def test_to_excel_unicode_filename(self, merge_cells, engine, ext):
  1370. with ensure_clean(u("\u0192u.") + ext) as filename:
  1371. try:
  1372. f = open(filename, "wb")
  1373. except UnicodeEncodeError:
  1374. pytest.skip("No unicode file names on this system")
  1375. else:
  1376. f.close()
  1377. df = DataFrame([[0.123456, 0.234567, 0.567567],
  1378. [12.32112, 123123.2, 321321.2]],
  1379. index=["A", "B"], columns=["X", "Y", "Z"])
  1380. df.to_excel(filename, "test1", float_format="%.2f")
  1381. reader = ExcelFile(filename)
  1382. result = read_excel(reader, "test1", index_col=0)
  1383. expected = DataFrame([[0.12, 0.23, 0.57],
  1384. [12.32, 123123.20, 321321.20]],
  1385. index=["A", "B"], columns=["X", "Y", "Z"])
  1386. tm.assert_frame_equal(result, expected)
  1387. # def test_to_excel_header_styling_xls(self, merge_cells, engine, ext):
  1388. # import StringIO
  1389. # s = StringIO(
  1390. # """Date,ticker,type,value
  1391. # 2001-01-01,x,close,12.2
  1392. # 2001-01-01,x,open ,12.1
  1393. # 2001-01-01,y,close,12.2
  1394. # 2001-01-01,y,open ,12.1
  1395. # 2001-02-01,x,close,12.2
  1396. # 2001-02-01,x,open ,12.1
  1397. # 2001-02-01,y,close,12.2
  1398. # 2001-02-01,y,open ,12.1
  1399. # 2001-03-01,x,close,12.2
  1400. # 2001-03-01,x,open ,12.1
  1401. # 2001-03-01,y,close,12.2
  1402. # 2001-03-01,y,open ,12.1""")
  1403. # df = read_csv(s, parse_dates=["Date"])
  1404. # pdf = df.pivot_table(values="value", rows=["ticker"],
  1405. # cols=["Date", "type"])
  1406. # try:
  1407. # import xlwt
  1408. # import xlrd
  1409. # except ImportError:
  1410. # pytest.skip
  1411. # filename = '__tmp_to_excel_header_styling_xls__.xls'
  1412. # pdf.to_excel(filename, 'test1')
  1413. # wbk = xlrd.open_workbook(filename,
  1414. # formatting_info=True)
  1415. # assert ["test1"] == wbk.sheet_names()
  1416. # ws = wbk.sheet_by_name('test1')
  1417. # assert [(0, 1, 5, 7), (0, 1, 3, 5), (0, 1, 1, 3)] == ws.merged_cells
  1418. # for i in range(0, 2):
  1419. # for j in range(0, 7):
  1420. # xfx = ws.cell_xf_index(0, 0)
  1421. # cell_xf = wbk.xf_list[xfx]
  1422. # font = wbk.font_list
  1423. # assert 1 == font[cell_xf.font_index].bold
  1424. # assert 1 == cell_xf.border.top_line_style
  1425. # assert 1 == cell_xf.border.right_line_style
  1426. # assert 1 == cell_xf.border.bottom_line_style
  1427. # assert 1 == cell_xf.border.left_line_style
  1428. # assert 2 == cell_xf.alignment.hor_align
  1429. # os.remove(filename)
  1430. # def test_to_excel_header_styling_xlsx(self, merge_cells, engine, ext):
  1431. # import StringIO
  1432. # s = StringIO(
  1433. # """Date,ticker,type,value
  1434. # 2001-01-01,x,close,12.2
  1435. # 2001-01-01,x,open ,12.1
  1436. # 2001-01-01,y,close,12.2
  1437. # 2001-01-01,y,open ,12.1
  1438. # 2001-02-01,x,close,12.2
  1439. # 2001-02-01,x,open ,12.1
  1440. # 2001-02-01,y,close,12.2
  1441. # 2001-02-01,y,open ,12.1
  1442. # 2001-03-01,x,close,12.2
  1443. # 2001-03-01,x,open ,12.1
  1444. # 2001-03-01,y,close,12.2
  1445. # 2001-03-01,y,open ,12.1""")
  1446. # df = read_csv(s, parse_dates=["Date"])
  1447. # pdf = df.pivot_table(values="value", rows=["ticker"],
  1448. # cols=["Date", "type"])
  1449. # try:
  1450. # import openpyxl
  1451. # from openpyxl.cell import get_column_letter
  1452. # except ImportError:
  1453. # pytest.skip
  1454. # if openpyxl.__version__ < '1.6.1':
  1455. # pytest.skip
  1456. # # test xlsx_styling
  1457. # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx'
  1458. # pdf.to_excel(filename, 'test1')
  1459. # wbk = openpyxl.load_workbook(filename)
  1460. # assert ["test1"] == wbk.get_sheet_names()
  1461. # ws = wbk.get_sheet_by_name('test1')
  1462. # xlsaddrs = ["%s2" % chr(i) for i in range(ord('A'), ord('H'))]
  1463. # xlsaddrs += ["A%s" % i for i in range(1, 6)]
  1464. # xlsaddrs += ["B1", "D1", "F1"]
  1465. # for xlsaddr in xlsaddrs:
  1466. # cell = ws.cell(xlsaddr)
  1467. # assert cell.style.font.bold
  1468. # assert (openpyxl.style.Border.BORDER_THIN ==
  1469. # cell.style.borders.top.border_style)
  1470. # assert (openpyxl.style.Border.BORDER_THIN ==
  1471. # cell.style.borders.right.border_style)
  1472. # assert (openpyxl.style.Border.BORDER_THIN ==
  1473. # cell.style.borders.bottom.border_style)
  1474. # assert (openpyxl.style.Border.BORDER_THIN ==
  1475. # cell.style.borders.left.border_style)
  1476. # assert (openpyxl.style.Alignment.HORIZONTAL_CENTER ==
  1477. # cell.style.alignment.horizontal)
  1478. # mergedcells_addrs = ["C1", "E1", "G1"]
  1479. # for maddr in mergedcells_addrs:
  1480. # assert ws.cell(maddr).merged
  1481. # os.remove(filename)
  1482. @pytest.mark.parametrize("use_headers", [True, False])
  1483. @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3])
  1484. @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3])
  1485. def test_excel_010_hemstring(self, merge_cells, engine, ext,
  1486. c_idx_nlevels, r_idx_nlevels, use_headers):
  1487. def roundtrip(data, header=True, parser_hdr=0, index=True):
  1488. data.to_excel(self.path, header=header,
  1489. merge_cells=merge_cells, index=index)
  1490. xf = ExcelFile(self.path)
  1491. return read_excel(xf, xf.sheet_names[0], header=parser_hdr)
  1492. # Basic test.
  1493. parser_header = 0 if use_headers else None
  1494. res = roundtrip(DataFrame([0]), use_headers, parser_header)
  1495. assert res.shape == (1, 2)
  1496. assert res.iloc[0, 0] is not np.nan
  1497. # More complex tests with multi-index.
  1498. nrows = 5
  1499. ncols = 3
  1500. from pandas.util.testing import makeCustomDataframe as mkdf
  1501. # ensure limited functionality in 0.10
  1502. # override of gh-2370 until sorted out in 0.11
  1503. df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels,
  1504. c_idx_nlevels=c_idx_nlevels)
  1505. # This if will be removed once multi-column Excel writing
  1506. # is implemented. For now fixing gh-9794.
  1507. if c_idx_nlevels > 1:
  1508. with pytest.raises(NotImplementedError):
  1509. roundtrip(df, use_headers, index=False)
  1510. else:
  1511. res = roundtrip(df, use_headers)
  1512. if use_headers:
  1513. assert res.shape == (nrows, ncols + r_idx_nlevels)
  1514. else:
  1515. # First row taken as columns.
  1516. assert res.shape == (nrows - 1, ncols + r_idx_nlevels)
  1517. # No NaNs.
  1518. for r in range(len(res.index)):
  1519. for c in range(len(res.columns)):
  1520. assert res.iloc[r, c] is not np.nan
  1521. def test_duplicated_columns(self, *_):
  1522. # see gh-5235
  1523. df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]],
  1524. columns=["A", "B", "B"])
  1525. df.to_excel(self.path, "test1")
  1526. expected = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]],
  1527. columns=["A", "B", "B.1"])
  1528. # By default, we mangle.
  1529. result = read_excel(self.path, "test1", index_col=0)
  1530. tm.assert_frame_equal(result, expected)
  1531. # Explicitly, we pass in the parameter.
  1532. result = read_excel(self.path, "test1", index_col=0,
  1533. mangle_dupe_cols=True)
  1534. tm.assert_frame_equal(result, expected)
  1535. # see gh-11007, gh-10970
  1536. df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
  1537. columns=["A", "B", "A", "B"])
  1538. df.to_excel(self.path, "test1")
  1539. result = read_excel(self.path, "test1", index_col=0)
  1540. expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
  1541. columns=["A", "B", "A.1", "B.1"])
  1542. tm.assert_frame_equal(result, expected)
  1543. # see gh-10982
  1544. df.to_excel(self.path, "test1", index=False, header=False)
  1545. result = read_excel(self.path, "test1", header=None)
  1546. expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
  1547. tm.assert_frame_equal(result, expected)
  1548. msg = "Setting mangle_dupe_cols=False is not supported yet"
  1549. with pytest.raises(ValueError, match=msg):
  1550. read_excel(self.path, "test1", header=None, mangle_dupe_cols=False)
  1551. def test_swapped_columns(self, merge_cells, engine, ext):
  1552. # Test for issue #5427.
  1553. write_frame = DataFrame({'A': [1, 1, 1],
  1554. 'B': [2, 2, 2]})
  1555. write_frame.to_excel(self.path, 'test1', columns=['B', 'A'])
  1556. read_frame = read_excel(self.path, 'test1', header=0)
  1557. tm.assert_series_equal(write_frame['A'], read_frame['A'])
  1558. tm.assert_series_equal(write_frame['B'], read_frame['B'])
  1559. def test_invalid_columns(self, *_):
  1560. # see gh-10982
  1561. write_frame = DataFrame({"A": [1, 1, 1],
  1562. "B": [2, 2, 2]})
  1563. with tm.assert_produces_warning(FutureWarning,
  1564. check_stacklevel=False):
  1565. write_frame.to_excel(self.path, "test1", columns=["B", "C"])
  1566. expected = write_frame.reindex(columns=["B", "C"])
  1567. read_frame = read_excel(self.path, "test1", index_col=0)
  1568. tm.assert_frame_equal(expected, read_frame)
  1569. with pytest.raises(KeyError):
  1570. write_frame.to_excel(self.path, "test1", columns=["C", "D"])
  1571. def test_comment_arg(self, *_):
  1572. # see gh-18735
  1573. #
  1574. # Test the comment argument functionality to read_excel.
  1575. # Create file to read in.
  1576. df = DataFrame({"A": ["one", "#one", "one"],
  1577. "B": ["two", "two", "#two"]})
  1578. df.to_excel(self.path, "test_c")
  1579. # Read file without comment arg.
  1580. result1 = read_excel(self.path, "test_c", index_col=0)
  1581. result1.iloc[1, 0] = None
  1582. result1.iloc[1, 1] = None
  1583. result1.iloc[2, 1] = None
  1584. result2 = read_excel(self.path, "test_c", comment="#", index_col=0)
  1585. tm.assert_frame_equal(result1, result2)
  1586. def test_comment_default(self, merge_cells, engine, ext):
  1587. # Re issue #18735
  1588. # Test the comment argument default to read_excel
  1589. # Create file to read in
  1590. df = DataFrame({'A': ['one', '#one', 'one'],
  1591. 'B': ['two', 'two', '#two']})
  1592. df.to_excel(self.path, 'test_c')
  1593. # Read file with default and explicit comment=None
  1594. result1 = read_excel(self.path, 'test_c')
  1595. result2 = read_excel(self.path, 'test_c', comment=None)
  1596. tm.assert_frame_equal(result1, result2)
  1597. def test_comment_used(self, *_):
  1598. # see gh-18735
  1599. #
  1600. # Test the comment argument is working as expected when used.
  1601. # Create file to read in.
  1602. df = DataFrame({"A": ["one", "#one", "one"],
  1603. "B": ["two", "two", "#two"]})
  1604. df.to_excel(self.path, "test_c")
  1605. # Test read_frame_comment against manually produced expected output.
  1606. expected = DataFrame({"A": ["one", None, "one"],
  1607. "B": ["two", None, None]})
  1608. result = read_excel(self.path, "test_c", comment="#", index_col=0)
  1609. tm.assert_frame_equal(result, expected)
  1610. def test_comment_empty_line(self, merge_cells, engine, ext):
  1611. # Re issue #18735
  1612. # Test that read_excel ignores commented lines at the end of file
  1613. df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']})
  1614. df.to_excel(self.path, index=False)
  1615. # Test that all-comment lines at EoF are ignored
  1616. expected = DataFrame({'a': [1], 'b': [2]})
  1617. result = read_excel(self.path, comment='#')
  1618. tm.assert_frame_equal(result, expected)
  1619. def test_datetimes(self, merge_cells, engine, ext):
  1620. # Test writing and reading datetimes. For issue #9139. (xref #9185)
  1621. datetimes = [datetime(2013, 1, 13, 1, 2, 3),
  1622. datetime(2013, 1, 13, 2, 45, 56),
  1623. datetime(2013, 1, 13, 4, 29, 49),
  1624. datetime(2013, 1, 13, 6, 13, 42),
  1625. datetime(2013, 1, 13, 7, 57, 35),
  1626. datetime(2013, 1, 13, 9, 41, 28),
  1627. datetime(2013, 1, 13, 11, 25, 21),
  1628. datetime(2013, 1, 13, 13, 9, 14),
  1629. datetime(2013, 1, 13, 14, 53, 7),
  1630. datetime(2013, 1, 13, 16, 37, 0),
  1631. datetime(2013, 1, 13, 18, 20, 52)]
  1632. write_frame = DataFrame({'A': datetimes})
  1633. write_frame.to_excel(self.path, 'Sheet1')
  1634. read_frame = read_excel(self.path, 'Sheet1', header=0)
  1635. tm.assert_series_equal(write_frame['A'], read_frame['A'])
  1636. def test_bytes_io(self, merge_cells, engine, ext):
  1637. # see gh-7074
  1638. bio = BytesIO()
  1639. df = DataFrame(np.random.randn(10, 2))
  1640. # Pass engine explicitly, as there is no file path to infer from.
  1641. writer = ExcelWriter(bio, engine=engine)
  1642. df.to_excel(writer)
  1643. writer.save()
  1644. bio.seek(0)
  1645. reread_df = read_excel(bio, index_col=0)
  1646. tm.assert_frame_equal(df, reread_df)
  1647. def test_write_lists_dict(self, *_):
  1648. # see gh-8188.
  1649. df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}],
  1650. "numeric": [1, 2, 3.0],
  1651. "str": ["apple", "banana", "cherry"]})
  1652. df.to_excel(self.path, "Sheet1")
  1653. read = read_excel(self.path, "Sheet1", header=0, index_col=0)
  1654. expected = df.copy()
  1655. expected.mixed = expected.mixed.apply(str)
  1656. expected.numeric = expected.numeric.astype("int64")
  1657. tm.assert_frame_equal(read, expected)
  1658. def test_true_and_false_value_options(self, *_):
  1659. # see gh-13347
  1660. df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"])
  1661. expected = df.replace({"foo": True, "bar": False})
  1662. df.to_excel(self.path)
  1663. read_frame = read_excel(self.path, true_values=["foo"],
  1664. false_values=["bar"], index_col=0)
  1665. tm.assert_frame_equal(read_frame, expected)
  1666. def test_freeze_panes(self, *_):
  1667. # see gh-15160
  1668. expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"])
  1669. expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1))
  1670. result = read_excel(self.path, index_col=0)
  1671. tm.assert_frame_equal(result, expected)
  1672. def test_path_path_lib(self, merge_cells, engine, ext):
  1673. df = tm.makeDataFrame()
  1674. writer = partial(df.to_excel, engine=engine)
  1675. reader = partial(pd.read_excel, index_col=0)
  1676. result = tm.round_trip_pathlib(writer, reader,
  1677. path="foo.{ext}".format(ext=ext))
  1678. tm.assert_frame_equal(result, df)
  1679. def test_path_local_path(self, merge_cells, engine, ext):
  1680. df = tm.makeDataFrame()
  1681. writer = partial(df.to_excel, engine=engine)
  1682. reader = partial(pd.read_excel, index_col=0)
  1683. result = tm.round_trip_pathlib(writer, reader,
  1684. path="foo.{ext}".format(ext=ext))
  1685. tm.assert_frame_equal(result, df)
  1686. @td.skip_if_no('openpyxl')
  1687. @pytest.mark.parametrize("merge_cells,ext,engine", [
  1688. (None, '.xlsx', 'openpyxl')])
  1689. class TestOpenpyxlTests(_WriterBase):
  1690. def test_to_excel_styleconverter(self, merge_cells, ext, engine):
  1691. from openpyxl import styles
  1692. hstyle = {
  1693. "font": {
  1694. "color": '00FF0000',
  1695. "bold": True,
  1696. },
  1697. "borders": {
  1698. "top": "thin",
  1699. "right": "thin",
  1700. "bottom": "thin",
  1701. "left": "thin",
  1702. },
  1703. "alignment": {
  1704. "horizontal": "center",
  1705. "vertical": "top",
  1706. },
  1707. "fill": {
  1708. "patternType": 'solid',
  1709. 'fgColor': {
  1710. 'rgb': '006666FF',
  1711. 'tint': 0.3,
  1712. },
  1713. },
  1714. "number_format": {
  1715. "format_code": "0.00"
  1716. },
  1717. "protection": {
  1718. "locked": True,
  1719. "hidden": False,
  1720. },
  1721. }
  1722. font_color = styles.Color('00FF0000')
  1723. font = styles.Font(bold=True, color=font_color)
  1724. side = styles.Side(style=styles.borders.BORDER_THIN)
  1725. border = styles.Border(top=side, right=side, bottom=side, left=side)
  1726. alignment = styles.Alignment(horizontal='center', vertical='top')
  1727. fill_color = styles.Color(rgb='006666FF', tint=0.3)
  1728. fill = styles.PatternFill(patternType='solid', fgColor=fill_color)
  1729. number_format = '0.00'
  1730. protection = styles.Protection(locked=True, hidden=False)
  1731. kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle)
  1732. assert kw['font'] == font
  1733. assert kw['border'] == border
  1734. assert kw['alignment'] == alignment
  1735. assert kw['fill'] == fill
  1736. assert kw['number_format'] == number_format
  1737. assert kw['protection'] == protection
  1738. def test_write_cells_merge_styled(self, merge_cells, ext, engine):
  1739. from pandas.io.formats.excel import ExcelCell
  1740. sheet_name = 'merge_styled'
  1741. sty_b1 = {'font': {'color': '00FF0000'}}
  1742. sty_a2 = {'font': {'color': '0000FF00'}}
  1743. initial_cells = [
  1744. ExcelCell(col=1, row=0, val=42, style=sty_b1),
  1745. ExcelCell(col=0, row=1, val=99, style=sty_a2),
  1746. ]
  1747. sty_merged = {'font': {'color': '000000FF', 'bold': True}}
  1748. sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged)
  1749. openpyxl_sty_merged = sty_kwargs['font']
  1750. merge_cells = [
  1751. ExcelCell(col=0, row=0, val='pandas',
  1752. mergestart=1, mergeend=1, style=sty_merged),
  1753. ]
  1754. with ensure_clean(ext) as path:
  1755. writer = _OpenpyxlWriter(path)
  1756. writer.write_cells(initial_cells, sheet_name=sheet_name)
  1757. writer.write_cells(merge_cells, sheet_name=sheet_name)
  1758. wks = writer.sheets[sheet_name]
  1759. xcell_b1 = wks['B1']
  1760. xcell_a2 = wks['A2']
  1761. assert xcell_b1.font == openpyxl_sty_merged
  1762. assert xcell_a2.font == openpyxl_sty_merged
  1763. @pytest.mark.parametrize("mode,expected", [
  1764. ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])])
  1765. def test_write_append_mode(self, merge_cells, ext, engine, mode, expected):
  1766. import openpyxl
  1767. df = DataFrame([1], columns=['baz'])
  1768. with ensure_clean(ext) as f:
  1769. wb = openpyxl.Workbook()
  1770. wb.worksheets[0].title = 'foo'
  1771. wb.worksheets[0]['A1'].value = 'foo'
  1772. wb.create_sheet('bar')
  1773. wb.worksheets[1]['A1'].value = 'bar'
  1774. wb.save(f)
  1775. writer = ExcelWriter(f, engine=engine, mode=mode)
  1776. df.to_excel(writer, sheet_name='baz', index=False)
  1777. writer.save()
  1778. wb2 = openpyxl.load_workbook(f)
  1779. result = [sheet.title for sheet in wb2.worksheets]
  1780. assert result == expected
  1781. for index, cell_value in enumerate(expected):
  1782. assert wb2.worksheets[index]['A1'].value == cell_value
  1783. @td.skip_if_no('xlwt')
  1784. @pytest.mark.parametrize("merge_cells,ext,engine", [
  1785. (None, '.xls', 'xlwt')])
  1786. class TestXlwtTests(_WriterBase):
  1787. def test_excel_raise_error_on_multiindex_columns_and_no_index(
  1788. self, merge_cells, ext, engine):
  1789. # MultiIndex as columns is not yet implemented 9794
  1790. cols = MultiIndex.from_tuples([('site', ''),
  1791. ('2014', 'height'),
  1792. ('2014', 'weight')])
  1793. df = DataFrame(np.random.randn(10, 3), columns=cols)
  1794. with pytest.raises(NotImplementedError):
  1795. with ensure_clean(ext) as path:
  1796. df.to_excel(path, index=False)
  1797. def test_excel_multiindex_columns_and_index_true(self, merge_cells, ext,
  1798. engine):
  1799. cols = MultiIndex.from_tuples([('site', ''),
  1800. ('2014', 'height'),
  1801. ('2014', 'weight')])
  1802. df = pd.DataFrame(np.random.randn(10, 3), columns=cols)
  1803. with ensure_clean(ext) as path:
  1804. df.to_excel(path, index=True)
  1805. def test_excel_multiindex_index(self, merge_cells, ext, engine):
  1806. # MultiIndex as index works so assert no error #9794
  1807. cols = MultiIndex.from_tuples([('site', ''),
  1808. ('2014', 'height'),
  1809. ('2014', 'weight')])
  1810. df = DataFrame(np.random.randn(3, 10), index=cols)
  1811. with ensure_clean(ext) as path:
  1812. df.to_excel(path, index=False)
  1813. def test_to_excel_styleconverter(self, merge_cells, ext, engine):
  1814. import xlwt
  1815. hstyle = {"font": {"bold": True},
  1816. "borders": {"top": "thin",
  1817. "right": "thin",
  1818. "bottom": "thin",
  1819. "left": "thin"},
  1820. "alignment": {"horizontal": "center", "vertical": "top"}}
  1821. xls_style = _XlwtWriter._convert_to_style(hstyle)
  1822. assert xls_style.font.bold
  1823. assert xlwt.Borders.THIN == xls_style.borders.top
  1824. assert xlwt.Borders.THIN == xls_style.borders.right
  1825. assert xlwt.Borders.THIN == xls_style.borders.bottom
  1826. assert xlwt.Borders.THIN == xls_style.borders.left
  1827. assert xlwt.Alignment.HORZ_CENTER == xls_style.alignment.horz
  1828. assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert
  1829. def test_write_append_mode_raises(self, merge_cells, ext, engine):
  1830. msg = "Append mode is not supported with xlwt!"
  1831. with ensure_clean(ext) as f:
  1832. with pytest.raises(ValueError, match=msg):
  1833. ExcelWriter(f, engine=engine, mode='a')
  1834. @td.skip_if_no('xlsxwriter')
  1835. @pytest.mark.parametrize("merge_cells,ext,engine", [
  1836. (None, '.xlsx', 'xlsxwriter')])
  1837. class TestXlsxWriterTests(_WriterBase):
  1838. @td.skip_if_no('openpyxl')
  1839. def test_column_format(self, merge_cells, ext, engine):
  1840. # Test that column formats are applied to cells. Test for issue #9167.
  1841. # Applicable to xlsxwriter only.
  1842. with warnings.catch_warnings():
  1843. # Ignore the openpyxl lxml warning.
  1844. warnings.simplefilter("ignore")
  1845. import openpyxl
  1846. with ensure_clean(ext) as path:
  1847. frame = DataFrame({'A': [123456, 123456],
  1848. 'B': [123456, 123456]})
  1849. writer = ExcelWriter(path)
  1850. frame.to_excel(writer)
  1851. # Add a number format to col B and ensure it is applied to cells.
  1852. num_format = '#,##0'
  1853. write_workbook = writer.book
  1854. write_worksheet = write_workbook.worksheets()[0]
  1855. col_format = write_workbook.add_format({'num_format': num_format})
  1856. write_worksheet.set_column('B:B', None, col_format)
  1857. writer.save()
  1858. read_workbook = openpyxl.load_workbook(path)
  1859. try:
  1860. read_worksheet = read_workbook['Sheet1']
  1861. except TypeError:
  1862. # compat
  1863. read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1')
  1864. # Get the number format from the cell.
  1865. try:
  1866. cell = read_worksheet['B2']
  1867. except TypeError:
  1868. # compat
  1869. cell = read_worksheet.cell('B2')
  1870. try:
  1871. read_num_format = cell.number_format
  1872. except Exception:
  1873. read_num_format = cell.style.number_format._format_code
  1874. assert read_num_format == num_format
  1875. def test_write_append_mode_raises(self, merge_cells, ext, engine):
  1876. msg = "Append mode is not supported with xlsxwriter!"
  1877. with ensure_clean(ext) as f:
  1878. with pytest.raises(ValueError, match=msg):
  1879. ExcelWriter(f, engine=engine, mode='a')
  1880. class TestExcelWriterEngineTests(object):
  1881. @pytest.mark.parametrize('klass,ext', [
  1882. pytest.param(_XlsxWriter, '.xlsx', marks=pytest.mark.skipif(
  1883. not td.safe_import('xlsxwriter'), reason='No xlsxwriter')),
  1884. pytest.param(_OpenpyxlWriter, '.xlsx', marks=pytest.mark.skipif(
  1885. not td.safe_import('openpyxl'), reason='No openpyxl')),
  1886. pytest.param(_XlwtWriter, '.xls', marks=pytest.mark.skipif(
  1887. not td.safe_import('xlwt'), reason='No xlwt'))
  1888. ])
  1889. def test_ExcelWriter_dispatch(self, klass, ext):
  1890. with ensure_clean(ext) as path:
  1891. writer = ExcelWriter(path)
  1892. if ext == '.xlsx' and td.safe_import('xlsxwriter'):
  1893. # xlsxwriter has preference over openpyxl if both installed
  1894. assert isinstance(writer, _XlsxWriter)
  1895. else:
  1896. assert isinstance(writer, klass)
  1897. def test_ExcelWriter_dispatch_raises(self):
  1898. with pytest.raises(ValueError, match='No engine'):
  1899. ExcelWriter('nothing')
  1900. @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning")
  1901. def test_register_writer(self):
  1902. # some awkward mocking to test out dispatch and such actually works
  1903. called_save = []
  1904. called_write_cells = []
  1905. class DummyClass(ExcelWriter):
  1906. called_save = False
  1907. called_write_cells = False
  1908. supported_extensions = ['test', 'xlsx', 'xls']
  1909. engine = 'dummy'
  1910. def save(self):
  1911. called_save.append(True)
  1912. def write_cells(self, *args, **kwargs):
  1913. called_write_cells.append(True)
  1914. def check_called(func):
  1915. func()
  1916. assert len(called_save) >= 1
  1917. assert len(called_write_cells) >= 1
  1918. del called_save[:]
  1919. del called_write_cells[:]
  1920. with pd.option_context('io.excel.xlsx.writer', 'dummy'):
  1921. register_writer(DummyClass)
  1922. writer = ExcelWriter('something.test')
  1923. assert isinstance(writer, DummyClass)
  1924. df = tm.makeCustomDataframe(1, 1)
  1925. with catch_warnings(record=True):
  1926. panel = tm.makePanel()
  1927. func = lambda: df.to_excel('something.test')
  1928. check_called(func)
  1929. check_called(lambda: panel.to_excel('something.test'))
  1930. check_called(lambda: df.to_excel('something.xlsx'))
  1931. check_called(
  1932. lambda: df.to_excel(
  1933. 'something.xls', engine='dummy'))
  1934. @pytest.mark.parametrize('engine', [
  1935. pytest.param('xlwt',
  1936. marks=pytest.mark.xfail(reason='xlwt does not support '
  1937. 'openpyxl-compatible '
  1938. 'style dicts')),
  1939. 'xlsxwriter',
  1940. 'openpyxl',
  1941. ])
  1942. def test_styler_to_excel(engine):
  1943. def style(df):
  1944. # XXX: RGB colors not supported in xlwt
  1945. return DataFrame([['font-weight: bold', '', ''],
  1946. ['', 'color: blue', ''],
  1947. ['', '', 'text-decoration: underline'],
  1948. ['border-style: solid', '', ''],
  1949. ['', 'font-style: italic', ''],
  1950. ['', '', 'text-align: right'],
  1951. ['background-color: red', '', ''],
  1952. ['number-format: 0%', '', ''],
  1953. ['', '', ''],
  1954. ['', '', ''],
  1955. ['', '', '']],
  1956. index=df.index, columns=df.columns)
  1957. def assert_equal_style(cell1, cell2, engine):
  1958. if engine in ['xlsxwriter', 'openpyxl']:
  1959. pytest.xfail(reason=("GH25351: failing on some attribute "
  1960. "comparisons in {}".format(engine)))
  1961. # XXX: should find a better way to check equality
  1962. assert cell1.alignment.__dict__ == cell2.alignment.__dict__
  1963. assert cell1.border.__dict__ == cell2.border.__dict__
  1964. assert cell1.fill.__dict__ == cell2.fill.__dict__
  1965. assert cell1.font.__dict__ == cell2.font.__dict__
  1966. assert cell1.number_format == cell2.number_format
  1967. assert cell1.protection.__dict__ == cell2.protection.__dict__
  1968. def custom_converter(css):
  1969. # use bold iff there is custom style attached to the cell
  1970. if css.strip(' \n;'):
  1971. return {'font': {'bold': True}}
  1972. return {}
  1973. pytest.importorskip('jinja2')
  1974. pytest.importorskip(engine)
  1975. # Prepare spreadsheets
  1976. df = DataFrame(np.random.randn(11, 3))
  1977. with ensure_clean('.xlsx' if engine != 'xlwt' else '.xls') as path:
  1978. writer = ExcelWriter(path, engine=engine)
  1979. df.to_excel(writer, sheet_name='frame')
  1980. df.style.to_excel(writer, sheet_name='unstyled')
  1981. styled = df.style.apply(style, axis=None)
  1982. styled.to_excel(writer, sheet_name='styled')
  1983. ExcelFormatter(styled, style_converter=custom_converter).write(
  1984. writer, sheet_name='custom')
  1985. writer.save()
  1986. if engine not in ('openpyxl', 'xlsxwriter'):
  1987. # For other engines, we only smoke test
  1988. return
  1989. openpyxl = pytest.importorskip('openpyxl')
  1990. wb = openpyxl.load_workbook(path)
  1991. # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled
  1992. n_cells = 0
  1993. for col1, col2 in zip(wb['frame'].columns,
  1994. wb['unstyled'].columns):
  1995. assert len(col1) == len(col2)
  1996. for cell1, cell2 in zip(col1, col2):
  1997. assert cell1.value == cell2.value
  1998. assert_equal_style(cell1, cell2, engine)
  1999. n_cells += 1
  2000. # ensure iteration actually happened:
  2001. assert n_cells == (11 + 1) * (3 + 1)
  2002. # (2) check styling with default converter
  2003. # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF
  2004. alpha = '00' if engine == 'openpyxl' else 'FF'
  2005. n_cells = 0
  2006. for col1, col2 in zip(wb['frame'].columns,
  2007. wb['styled'].columns):
  2008. assert len(col1) == len(col2)
  2009. for cell1, cell2 in zip(col1, col2):
  2010. ref = '%s%d' % (cell2.column, cell2.row)
  2011. # XXX: this isn't as strong a test as ideal; we should
  2012. # confirm that differences are exclusive
  2013. if ref == 'B2':
  2014. assert not cell1.font.bold
  2015. assert cell2.font.bold
  2016. elif ref == 'C3':
  2017. assert cell1.font.color.rgb != cell2.font.color.rgb
  2018. assert cell2.font.color.rgb == alpha + '0000FF'
  2019. elif ref == 'D4':
  2020. # This fails with engine=xlsxwriter due to
  2021. # https://bitbucket.org/openpyxl/openpyxl/issues/800
  2022. if engine == 'xlsxwriter' \
  2023. and (LooseVersion(openpyxl.__version__) <
  2024. LooseVersion('2.4.6')):
  2025. pass
  2026. else:
  2027. assert cell1.font.underline != cell2.font.underline
  2028. assert cell2.font.underline == 'single'
  2029. elif ref == 'B5':
  2030. assert not cell1.border.left.style
  2031. assert (cell2.border.top.style ==
  2032. cell2.border.right.style ==
  2033. cell2.border.bottom.style ==
  2034. cell2.border.left.style ==
  2035. 'medium')
  2036. elif ref == 'C6':
  2037. assert not cell1.font.italic
  2038. assert cell2.font.italic
  2039. elif ref == 'D7':
  2040. assert (cell1.alignment.horizontal !=
  2041. cell2.alignment.horizontal)
  2042. assert cell2.alignment.horizontal == 'right'
  2043. elif ref == 'B8':
  2044. assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb
  2045. assert cell1.fill.patternType != cell2.fill.patternType
  2046. assert cell2.fill.fgColor.rgb == alpha + 'FF0000'
  2047. assert cell2.fill.patternType == 'solid'
  2048. elif ref == 'B9':
  2049. assert cell1.number_format == 'General'
  2050. assert cell2.number_format == '0%'
  2051. else:
  2052. assert_equal_style(cell1, cell2, engine)
  2053. assert cell1.value == cell2.value
  2054. n_cells += 1
  2055. assert n_cells == (11 + 1) * (3 + 1)
  2056. # (3) check styling with custom converter
  2057. n_cells = 0
  2058. for col1, col2 in zip(wb['frame'].columns,
  2059. wb['custom'].columns):
  2060. assert len(col1) == len(col2)
  2061. for cell1, cell2 in zip(col1, col2):
  2062. ref = '%s%d' % (cell2.column, cell2.row)
  2063. if ref in ('B2', 'C3', 'D4', 'B5', 'C6', 'D7', 'B8', 'B9'):
  2064. assert not cell1.font.bold
  2065. assert cell2.font.bold
  2066. else:
  2067. assert_equal_style(cell1, cell2, engine)
  2068. assert cell1.value == cell2.value
  2069. n_cells += 1
  2070. assert n_cells == (11 + 1) * (3 + 1)
  2071. @td.skip_if_no('openpyxl')
  2072. @pytest.mark.skipif(not PY36, reason='requires fspath')
  2073. class TestFSPath(object):
  2074. def test_excelfile_fspath(self):
  2075. with tm.ensure_clean('foo.xlsx') as path:
  2076. df = DataFrame({"A": [1, 2]})
  2077. df.to_excel(path)
  2078. xl = ExcelFile(path)
  2079. result = os.fspath(xl)
  2080. assert result == path
  2081. def test_excelwriter_fspath(self):
  2082. with tm.ensure_clean('foo.xlsx') as path:
  2083. writer = ExcelWriter(path)
  2084. assert os.fspath(writer) == str(path)