test_stata.py
# -*- coding: utf-8 -*-
# pylint: disable=E1101

from collections import OrderedDict
import datetime as dt
from datetime import datetime
import gzip
import io
import os
import struct
import warnings

import numpy as np
import pytest

import pandas.compat as compat
from pandas.compat import PY3, ResourceWarning, iterkeys
from pandas.core.dtypes.common import is_categorical_dtype

import pandas as pd
from pandas.core.frame import DataFrame, Series
import pandas.util.testing as tm
from pandas.io.parsers import read_csv
from pandas.io.stata import (
    InvalidColumnName, PossiblePrecisionLoss, StataMissingValue, StataReader,
    read_stata)


@pytest.fixture
def dirpath(datapath):
    return datapath("io", "data")


@pytest.fixture
def parsed_114(dirpath):
    dta14_114 = os.path.join(dirpath, 'stata5_114.dta')
    parsed_114 = read_stata(dta14_114, convert_dates=True)
    parsed_114.index.name = 'index'
    return parsed_114


class TestStata(object):

    @pytest.fixture(autouse=True)
    def setup_method(self, datapath):
        self.dirpath = datapath("io", "data")
        self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta')
        self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta')
        self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta')
        self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta')
        self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta')
        self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta')
        self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta')
        self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta')
        self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta')
        self.dta3_117 = os.path.join(self.dirpath, 'stata3_117.dta')
        self.csv3 = os.path.join(self.dirpath, 'stata3.csv')
        self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta')
        self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta')
        self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta')
        self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta')
        self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
        self.csv14 = os.path.join(self.dirpath, 'stata5.csv')
        self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta')
        self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta')
        self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta')
        self.dta14_117 = os.path.join(self.dirpath, 'stata5_117.dta')
        self.csv15 = os.path.join(self.dirpath, 'stata6.csv')
        self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta')
        self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta')
        self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta')
        self.dta15_117 = os.path.join(self.dirpath, 'stata6_117.dta')
        self.dta16_115 = os.path.join(self.dirpath, 'stata7_115.dta')
        self.dta16_117 = os.path.join(self.dirpath, 'stata7_117.dta')
        self.dta17_113 = os.path.join(self.dirpath, 'stata8_113.dta')
        self.dta17_115 = os.path.join(self.dirpath, 'stata8_115.dta')
        self.dta17_117 = os.path.join(self.dirpath, 'stata8_117.dta')
        self.dta18_115 = os.path.join(self.dirpath, 'stata9_115.dta')
        self.dta18_117 = os.path.join(self.dirpath, 'stata9_117.dta')
        self.dta19_115 = os.path.join(self.dirpath, 'stata10_115.dta')
        self.dta19_117 = os.path.join(self.dirpath, 'stata10_117.dta')
        self.dta20_115 = os.path.join(self.dirpath, 'stata11_115.dta')
        self.dta20_117 = os.path.join(self.dirpath, 'stata11_117.dta')
        self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta')
        self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta')
        self.dta23 = os.path.join(self.dirpath, 'stata15.dta')
        self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta')
        self.dta25_118 = os.path.join(self.dirpath, 'stata16_118.dta')
        self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta')

    def read_dta(self, file):
        # Legacy default reader configuration
        return read_stata(file, convert_dates=True)

    def read_csv(self, file):
        return read_csv(file, parse_dates=True)

    @pytest.mark.parametrize('version', [114, 117])
    def test_read_empty_dta(self, version):
        empty_ds = DataFrame(columns=['unit'])
        # GH 7369, make sure can read a 0-obs dta file
        with tm.ensure_clean() as path:
            empty_ds.to_stata(path, write_index=False, version=version)
            empty_ds2 = read_stata(path)
            tm.assert_frame_equal(empty_ds, empty_ds2)

    def test_data_method(self):
        # Minimal testing of legacy data method
        with StataReader(self.dta1_114) as rdr:
            with tm.assert_produces_warning(UserWarning):
                parsed_114_data = rdr.data()

        with StataReader(self.dta1_114) as rdr:
            parsed_114_read = rdr.read()
        tm.assert_frame_equal(parsed_114_data, parsed_114_read)

    @pytest.mark.parametrize(
        'file', ['dta1_114', 'dta1_117'])
    def test_read_dta1(self, file):
        file = getattr(self, file)
        parsed = self.read_dta(file)

        # Pandas uses np.nan as missing value.
        # Thus, all columns will be of type float, regardless of their name.
        expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
                             columns=['float_miss', 'double_miss', 'byte_miss',
                                      'int_miss', 'long_miss'])

        # this is an oddity as really the nan should be float64, but
        # the casting doesn't fail so need to match stata here
        expected['float_miss'] = expected['float_miss'].astype(np.float32)

        tm.assert_frame_equal(parsed, expected)

    def test_read_dta2(self):
        expected = DataFrame.from_records(
            [
                (
                    datetime(2006, 11, 19, 23, 13, 20),
                    1479596223000,
                    datetime(2010, 1, 20),
                    datetime(2010, 1, 8),
                    datetime(2010, 1, 1),
                    datetime(1974, 7, 1),
                    datetime(2010, 1, 1),
                    datetime(2010, 1, 1)
                ),
                (
                    datetime(1959, 12, 31, 20, 3, 20),
                    -1479590,
                    datetime(1953, 10, 2),
                    datetime(1948, 6, 10),
                    datetime(1955, 1, 1),
                    datetime(1955, 7, 1),
                    datetime(1955, 1, 1),
                    datetime(2, 1, 1)
                ),
                (
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                    pd.NaT,
                )
            ],
            columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date',
                     'monthly_date', 'quarterly_date', 'half_yearly_date',
                     'yearly_date']
        )
        expected['yearly_date'] = expected['yearly_date'].astype('O')

        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")
            parsed_114 = self.read_dta(self.dta2_114)
            parsed_115 = self.read_dta(self.dta2_115)
            parsed_117 = self.read_dta(self.dta2_117)
            # 113 is buggy due to limits of date format support in Stata
            # parsed_113 = self.read_dta(self.dta2_113)

            # Remove resource warnings
            w = [x for x in w if x.category is UserWarning]

            # should get warning for each call to read_dta
            assert len(w) == 3

        # buggy test because of the NaT comparison on certain platforms
        # Format 113 test fails since it does not support tc and tC formats
        # tm.assert_frame_equal(parsed_113, expected)
        tm.assert_frame_equal(parsed_114, expected,
                              check_datetimelike_compat=True)
        tm.assert_frame_equal(parsed_115, expected,
                              check_datetimelike_compat=True)
        tm.assert_frame_equal(parsed_117, expected,
                              check_datetimelike_compat=True)

    @pytest.mark.parametrize(
        'file', ['dta3_113', 'dta3_114', 'dta3_115', 'dta3_117'])
    def test_read_dta3(self, file):
        file = getattr(self, file)
        parsed = self.read_dta(file)

        # match stata here
        expected = self.read_csv(self.csv3)
        expected = expected.astype(np.float32)
        expected['year'] = expected['year'].astype(np.int16)
        expected['quarter'] = expected['quarter'].astype(np.int8)

        tm.assert_frame_equal(parsed, expected)

    @pytest.mark.parametrize(
        'file', ['dta4_113', 'dta4_114', 'dta4_115', 'dta4_117'])
    def test_read_dta4(self, file):
        file = getattr(self, file)
        parsed = self.read_dta(file)

        expected = DataFrame.from_records(
            [
                ["one", "ten", "one", "one", "one"],
                ["two", "nine", "two", "two", "two"],
                ["three", "eight", "three", "three", "three"],
                ["four", "seven", 4, "four", "four"],
                ["five", "six", 5, np.nan, "five"],
                ["six", "five", 6, np.nan, "six"],
                ["seven", "four", 7, np.nan, "seven"],
                ["eight", "three", 8, np.nan, "eight"],
                ["nine", "two", 9, np.nan, "nine"],
                ["ten", "one", "ten", np.nan, "ten"]
            ],
            columns=['fully_labeled', 'fully_labeled2',
                     'incompletely_labeled', 'labeled_with_missings',
                     'float_labelled'])

        # these are all categoricals
        expected = pd.concat([expected[col].astype('category')
                              for col in expected], axis=1)

        # stata doesn't save .category metadata
        tm.assert_frame_equal(parsed, expected, check_categorical=False)

    # File containing strls
    def test_read_dta12(self):
        parsed_117 = self.read_dta(self.dta21_117)
        expected = DataFrame.from_records(
            [
                [1, "abc", "abcdefghi"],
                [3, "cba", "qwertywertyqwerty"],
                [93, "", "strl"],
            ],
            columns=['x', 'y', 'z'])

        tm.assert_frame_equal(parsed_117, expected, check_dtype=False)

    def test_read_dta18(self):
        parsed_118 = self.read_dta(self.dta22_118)
        parsed_118["Bytes"] = parsed_118["Bytes"].astype('O')
        expected = DataFrame.from_records(
            [['Cat', 'Bogota', u'Bogotá', 1, 1.0, u'option b Ünicode', 1.0],
             ['Dog', 'Boston', u'Uzunköprü', np.nan, np.nan, np.nan, np.nan],
             ['Plane', 'Rome', u'Tromsø', 0, 0.0, 'option a', 0.0],
             ['Potato', 'Tokyo', u'Elâzığ', -4, 4.0, 4, 4],
             ['', '', '', 0, 0.3332999, 'option a', 1 / 3.]
             ],
            columns=['Things', 'Cities', 'Unicode_Cities_Strl',
                     'Ints', 'Floats', 'Bytes', 'Longs'])
        expected["Floats"] = expected["Floats"].astype(np.float32)
        for col in parsed_118.columns:
            tm.assert_almost_equal(parsed_118[col], expected[col])

        with StataReader(self.dta22_118) as rdr:
            vl = rdr.variable_labels()
            vl_expected = {u'Unicode_Cities_Strl':
                           u'Here are some strls with Ünicode chars',
                           u'Longs': u'long data',
                           u'Things': u'Here are some things',
                           u'Bytes': u'byte data',
                           u'Ints': u'int data',
                           u'Cities': u'Here are some cities',
                           u'Floats': u'float data'}
            tm.assert_dict_equal(vl, vl_expected)
            assert rdr.data_label == u'This is a Ünicode data label'

    def test_read_write_dta5(self):
        original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)],
                             columns=['float_miss', 'double_miss', 'byte_miss',
                                      'int_miss', 'long_miss'])
        original.index.name = 'index'

        with tm.ensure_clean() as path:
            original.to_stata(path, None)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original)

    def test_write_dta6(self):
        original = self.read_csv(self.csv3)
        original.index.name = 'index'
        original.index = original.index.astype(np.int32)
        original['year'] = original['year'].astype(np.int32)
        original['quarter'] = original['quarter'].astype(np.int32)

        with tm.ensure_clean() as path:
            original.to_stata(path, None)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original, check_index_type=False)

    @pytest.mark.parametrize('version', [114, 117])
    def test_read_write_dta10(self, version):
        original = DataFrame(data=[["string", "object", 1, 1.1,
                                    np.datetime64('2003-12-25')]],
                             columns=['string', 'object', 'integer',
                                      'floating', 'datetime'])
        original["object"] = Series(original["object"], dtype=object)
        original.index.name = 'index'
        original.index = original.index.astype(np.int32)
        original['integer'] = original['integer'].astype(np.int32)

        with tm.ensure_clean() as path:
            original.to_stata(path, {'datetime': 'tc'}, version=version)
            written_and_read_again = self.read_dta(path)
            # original.index is np.int32, read index is np.int64
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original, check_index_type=False)

    def test_stata_doc_examples(self):
        with tm.ensure_clean() as path:
            df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
            df.to_stata(path)

    def test_write_preserves_original(self):
        # 9795
        np.random.seed(423)
        df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd'))
        df.loc[2, 'a':'c'] = np.nan
        df_copy = df.copy()
        with tm.ensure_clean() as path:
            df.to_stata(path, write_index=False)
        tm.assert_frame_equal(df, df_copy)

    @pytest.mark.parametrize('version', [114, 117])
    def test_encoding(self, version):
        # GH 4626, proper encoding handling
        raw = read_stata(self.dta_encoding)
        with tm.assert_produces_warning(FutureWarning):
            encoded = read_stata(self.dta_encoding, encoding='latin-1')
        result = encoded.kreis1849[0]

        expected = raw.kreis1849[0]
        assert result == expected
        assert isinstance(result, compat.string_types)

        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(FutureWarning):
                encoded.to_stata(path, write_index=False, version=version,
                                 encoding='latin-1')
            reread_encoded = read_stata(path)
            tm.assert_frame_equal(encoded, reread_encoded)

    def test_read_write_dta11(self):
        original = DataFrame([(1, 2, 3, 4)],
                             columns=['good', compat.u('b\u00E4d'), '8number',
                                      'astringwithmorethan32characters______'])
        formatted = DataFrame([(1, 2, 3, 4)],
                              columns=['good', 'b_d', '_8number',
                                       'astringwithmorethan32characters_'])
        formatted.index.name = 'index'
        formatted = formatted.astype(np.int32)

        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(pd.io.stata.InvalidColumnName):
                original.to_stata(path, None)

            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index('index'), formatted)

    @pytest.mark.parametrize('version', [114, 117])
    def test_read_write_dta12(self, version):
        original = DataFrame([(1, 2, 3, 4, 5, 6)],
                             columns=['astringwithmorethan32characters_1',
                                      'astringwithmorethan32characters_2',
                                      '+',
                                      '-',
                                      'short',
                                      'delete'])
        formatted = DataFrame([(1, 2, 3, 4, 5, 6)],
                              columns=['astringwithmorethan32characters_',
                                       '_0astringwithmorethan32character',
                                       '_',
                                       '_1_',
                                       '_short',
                                       '_delete'])
        formatted.index.name = 'index'
        formatted = formatted.astype(np.int32)

        with tm.ensure_clean() as path:
            with warnings.catch_warnings(record=True) as w:
                warnings.simplefilter('always', InvalidColumnName)
                original.to_stata(path, None, version=version)
                # should get a warning for that format.
                assert len(w) == 1

            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index('index'), formatted)

    def test_read_write_dta13(self):
        s1 = Series(2 ** 9, dtype=np.int16)
        s2 = Series(2 ** 17, dtype=np.int32)
        s3 = Series(2 ** 33, dtype=np.int64)
        original = DataFrame({'int16': s1, 'int32': s2, 'int64': s3})
        original.index.name = 'index'

        formatted = original
        formatted['int64'] = formatted['int64'].astype(np.float64)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  formatted)

    @pytest.mark.parametrize('version', [114, 117])
    @pytest.mark.parametrize(
        'file', ['dta14_113', 'dta14_114', 'dta14_115', 'dta14_117'])
    def test_read_write_reread_dta14(self, file, parsed_114, version):
        file = getattr(self, file)
        parsed = self.read_dta(file)
        parsed.index.name = 'index'

        expected = self.read_csv(self.csv14)
        cols = ['byte_', 'int_', 'long_', 'float_', 'double_']
        for col in cols:
            expected[col] = expected[col]._convert(datetime=True,
                                                   numeric=True)
        expected['float_'] = expected['float_'].astype(np.float32)
        expected['date_td'] = pd.to_datetime(
            expected['date_td'], errors='coerce')

        tm.assert_frame_equal(parsed_114, parsed)

        with tm.ensure_clean() as path:
            parsed_114.to_stata(path, {'date_td': 'td'}, version=version)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(
                written_and_read_again.set_index('index'), parsed_114)

    @pytest.mark.parametrize(
        'file', ['dta15_113', 'dta15_114', 'dta15_115', 'dta15_117'])
    def test_read_write_reread_dta15(self, file):
        expected = self.read_csv(self.csv15)
        expected['byte_'] = expected['byte_'].astype(np.int8)
        expected['int_'] = expected['int_'].astype(np.int16)
        expected['long_'] = expected['long_'].astype(np.int32)
        expected['float_'] = expected['float_'].astype(np.float32)
        expected['double_'] = expected['double_'].astype(np.float64)
        expected['date_td'] = expected['date_td'].apply(
            datetime.strptime, args=('%Y-%m-%d',))

        file = getattr(self, file)
        parsed = self.read_dta(file)

        tm.assert_frame_equal(expected, parsed)

    @pytest.mark.parametrize('version', [114, 117])
    def test_timestamp_and_label(self, version):
        original = DataFrame([(1,)], columns=['variable'])
        time_stamp = datetime(2000, 2, 29, 14, 21)
        data_label = 'This is a data file.'
        with tm.ensure_clean() as path:
            original.to_stata(path, time_stamp=time_stamp,
                              data_label=data_label,
                              version=version)

            with StataReader(path) as reader:
                assert reader.time_stamp == '29 Feb 2000 14:21'
                assert reader.data_label == data_label

    @pytest.mark.parametrize('version', [114, 117])
    def test_invalid_timestamp(self, version):
        original = DataFrame([(1,)], columns=['variable'])
        time_stamp = '01 Jan 2000, 00:00:00'
        with tm.ensure_clean() as path:
            msg = "time_stamp should be datetime type"
            with pytest.raises(ValueError, match=msg):
                original.to_stata(path, time_stamp=time_stamp,
                                  version=version)

    def test_numeric_column_names(self):
        original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            # should get a warning for that format.
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index('index')
            columns = list(written_and_read_again.columns)
            convert_col_name = lambda x: int(x[1])
            written_and_read_again.columns = map(convert_col_name, columns)
            tm.assert_frame_equal(original, written_and_read_again)

    @pytest.mark.parametrize('version', [114, 117])
    def test_nan_to_missing_value(self, version):
        s1 = Series(np.arange(4.0), dtype=np.float32)
        s2 = Series(np.arange(4.0), dtype=np.float64)
        s1[::2] = np.nan
        s2[1::2] = np.nan
        original = DataFrame({'s1': s1, 's2': s2})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path, version=version)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index('index')
            tm.assert_frame_equal(written_and_read_again, original)

    def test_no_index(self):
        columns = ['x', 'y']
        original = DataFrame(np.reshape(np.arange(10.0), (5, 2)),
                             columns=columns)
        original.index.name = 'index_not_written'
        with tm.ensure_clean() as path:
            original.to_stata(path, write_index=False)
            written_and_read_again = self.read_dta(path)
            with pytest.raises(KeyError, match=original.index.name):
                written_and_read_again['index_not_written']

    def test_string_no_dates(self):
        s1 = Series(['a', 'A longer string'])
        s2 = Series([1.0, 2.0], dtype=np.float64)
        original = DataFrame({'s1': s1, 's2': s2})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original)

    def test_large_value_conversion(self):
        s0 = Series([1, 99], dtype=np.int8)
        s1 = Series([1, 127], dtype=np.int8)
        s2 = Series([1, 2 ** 15 - 1], dtype=np.int16)
        s3 = Series([1, 2 ** 63 - 1], dtype=np.int64)
        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3})
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(PossiblePrecisionLoss):
                original.to_stata(path)

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified['s1'] = Series(modified['s1'], dtype=np.int16)
            modified['s2'] = Series(modified['s2'], dtype=np.int32)
            modified['s3'] = Series(modified['s3'], dtype=np.float64)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)

    def test_dates_invalid_column(self):
        original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            with tm.assert_produces_warning(InvalidColumnName):
                original.to_stata(path, {0: 'tc'})

            written_and_read_again = self.read_dta(path)
            modified = original.copy()
            modified.columns = ['_0']
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  modified)

    def test_105(self):
        # Data obtained from:
        # http://go.worldbank.org/ZXY29PVJ21
        dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta')
        df = pd.read_stata(dpath)
        df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
        df0 = pd.DataFrame(df0)
        df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
        df0['clustnum'] = df0["clustnum"].astype(np.int16)
        df0['pri_schl'] = df0["pri_schl"].astype(np.int8)
        df0['psch_num'] = df0["psch_num"].astype(np.int8)
        df0['psch_dis'] = df0["psch_dis"].astype(np.float32)
        tm.assert_frame_equal(df.head(3), df0)

    def test_value_labels_old_format(self):
        # GH 19417
        #
        # Test that value_labels() returns an empty dict if the file format
        # predates supporting value labels.
        dpath = os.path.join(self.dirpath, 'S4_EDUC1.dta')
        reader = StataReader(dpath)
        assert reader.value_labels() == {}
        reader.close()
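
    # Stata stores each date type as a number with its own unit: tc counts
    # milliseconds since 1960-01-01, td days, tw weeks, tm months, tq
    # quarters, th half-years, while ty stores the calendar year itself.
    # The expected values below are the input timestamp truncated to the
    # start of each period.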
    def test_date_export_formats(self):
        columns = ['tc', 'td', 'tw', 'tm', 'tq', 'th', 'ty']
        conversions = {c: c for c in columns}
        data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
        original = DataFrame([data], columns=columns)
        original.index.name = 'index'
        expected_values = [datetime(2006, 11, 20, 23, 13, 20),  # Time
                           datetime(2006, 11, 20),  # Day
                           datetime(2006, 11, 19),  # Week
                           datetime(2006, 11, 1),  # Month
                           datetime(2006, 10, 1),  # Quarter year
                           datetime(2006, 7, 1),  # Half year
                           datetime(2006, 1, 1)]  # Year

        expected = DataFrame([expected_values], columns=columns)
        expected.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path, conversions)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  expected)

    def test_write_missing_strings(self):
        original = DataFrame([["1"], [None]], columns=["foo"])
        expected = DataFrame([["1"], [""]], columns=["foo"])
        expected.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  expected)
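
    # Stata has no boolean or unsigned integer types, so the writer upcasts
    # each column to the smallest signed type that can hold its values; this
    # is what expected_types encodes below (e.g. uint32 values near 2 ** 32
    # only fit in float64).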
    @pytest.mark.parametrize('version', [114, 117])
    @pytest.mark.parametrize('byteorder', ['>', '<'])
    def test_bool_uint(self, byteorder, version):
        s0 = Series([0, 1, True], dtype=np.bool)
        s1 = Series([0, 1, 100], dtype=np.uint8)
        s2 = Series([0, 1, 255], dtype=np.uint8)
        s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16)
        s4 = Series([0, 1, 2 ** 16 - 1], dtype=np.uint16)
        s5 = Series([0, 1, 2 ** 31 - 100], dtype=np.uint32)
        s6 = Series([0, 1, 2 ** 32 - 1], dtype=np.uint32)

        original = DataFrame({'s0': s0, 's1': s1, 's2': s2, 's3': s3,
                              's4': s4, 's5': s5, 's6': s6})
        original.index.name = 'index'
        expected = original.copy()
        expected_types = (np.int8, np.int8, np.int16, np.int16, np.int32,
                          np.int32, np.float64)
        for c, t in zip(expected.columns, expected_types):
            expected[c] = expected[c].astype(t)

        with tm.ensure_clean() as path:
            original.to_stata(path, byteorder=byteorder, version=version)
            written_and_read_again = self.read_dta(path)
            written_and_read_again = written_and_read_again.set_index('index')
            tm.assert_frame_equal(written_and_read_again, expected)

    def test_variable_labels(self):
        with StataReader(self.dta16_115) as rdr:
            sr_115 = rdr.variable_labels()
        with StataReader(self.dta16_117) as rdr:
            sr_117 = rdr.variable_labels()
        keys = ('var1', 'var2', 'var3')
        labels = ('label1', 'label2', 'label3')
        for k, v in compat.iteritems(sr_115):
            assert k in sr_117
            assert v == sr_117[k]
            assert k in keys
            assert v in labels

    def test_minimal_size_col(self):
        str_lens = (1, 100, 244)
        s = {}
        for str_len in str_lens:
            s['s' + str(str_len)] = Series(['a' * str_len,
                                            'b' * str_len, 'c' * str_len])
        original = DataFrame(s)
        with tm.ensure_clean() as path:
            original.to_stata(path, write_index=False)

            with StataReader(path) as sr:
                typlist = sr.typlist
                variables = sr.varlist
                formats = sr.fmtlist
                for variable, fmt, typ in zip(variables, formats, typlist):
                    assert int(variable[1:]) == int(fmt[1:-1])
                    assert int(variable[1:]) == typ

    def test_excessively_long_string(self):
        str_lens = (1, 244, 500)
        s = {}
        for str_len in str_lens:
            s['s' + str(str_len)] = Series(['a' * str_len,
                                            'b' * str_len, 'c' * str_len])
        original = DataFrame(s)
        msg = (r"Fixed width strings in Stata \.dta files are limited to 244"
               r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy"
               r" this restriction\. Use the\n'version=117' parameter to"
               r" write the newer \(Stata 13 and later\) format\.")
        with pytest.raises(ValueError, match=msg):
            with tm.ensure_clean() as path:
                original.to_stata(path)
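
    # Stata reserves the 27 values just above each integer type's valid data
    # range for missing data: '.' followed by '.a' through '.z'. The expected
    # strings below are generated from the VALID_RANGE upper bounds
    # accordingly.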
    def test_missing_value_generator(self):
        types = ('b', 'h', 'l')
        df = DataFrame([[0.0]], columns=['float_'])
        with tm.ensure_clean() as path:
            df.to_stata(path)
            with StataReader(path) as rdr:
                valid_range = rdr.VALID_RANGE
        expected_values = ['.' + chr(97 + i) for i in range(26)]
        expected_values.insert(0, '.')
        for t in types:
            offset = valid_range[t][1]
            for i in range(0, 27):
                val = StataMissingValue(offset + 1 + i)
                assert val.string == expected_values[i]

        # Test extremes for floats
        val = StataMissingValue(struct.unpack('<f', b'\x00\x00\x00\x7f')[0])
        assert val.string == '.'
        val = StataMissingValue(struct.unpack('<f', b'\x00\xd0\x00\x7f')[0])
        assert val.string == '.z'

        # Test extremes for doubles
        val = StataMissingValue(struct.unpack(
            '<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
        assert val.string == '.'
        val = StataMissingValue(struct.unpack(
            '<d', b'\x00\x00\x00\x00\x00\x1a\xe0\x7f')[0])
        assert val.string == '.z'

    @pytest.mark.parametrize(
        'file', ['dta17_113', 'dta17_115', 'dta17_117'])
    def test_missing_value_conversion(self, file):
        columns = ['int8_', 'int16_', 'int32_', 'float32_', 'float64_']
        smv = StataMissingValue(101)
        keys = [key for key in iterkeys(smv.MISSING_VALUES)]
        keys.sort()
        data = []
        for i in range(27):
            row = [StataMissingValue(keys[i + (j * 27)]) for j in range(5)]
            data.append(row)
        expected = DataFrame(data, columns=columns)

        parsed = read_stata(getattr(self, file), convert_missing=True)
        tm.assert_frame_equal(parsed, expected)
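
    # Several of these dates fall outside the range representable by
    # datetime64[ns], so the expected frame is built with object dtype and
    # compared with check_datetimelike_compat=True.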
    def test_big_dates(self):
        yr = [1960, 2000, 9999, 100, 2262, 1677]
        mo = [1, 1, 12, 1, 4, 9]
        dd = [1, 1, 31, 1, 22, 23]
        hr = [0, 0, 23, 0, 0, 0]
        mm = [0, 0, 59, 0, 0, 0]
        ss = [0, 0, 59, 0, 0, 0]
        expected = []
        for i in range(len(yr)):
            row = []
            for j in range(7):
                if j == 0:
                    row.append(
                        datetime(yr[i], mo[i], dd[i], hr[i], mm[i], ss[i]))
                elif j == 6:
                    row.append(datetime(yr[i], 1, 1))
                else:
                    row.append(datetime(yr[i], mo[i], dd[i]))
            expected.append(row)
        expected.append([pd.NaT] * 7)
        columns = ['date_tc', 'date_td', 'date_tw', 'date_tm', 'date_tq',
                   'date_th', 'date_ty']

        # Fixes for weekly, quarterly, half, year
        expected[2][2] = datetime(9999, 12, 24)
        expected[2][3] = datetime(9999, 12, 1)
        expected[2][4] = datetime(9999, 10, 1)
        expected[2][5] = datetime(9999, 7, 1)
        expected[4][2] = datetime(2262, 4, 16)
        expected[4][3] = expected[4][4] = datetime(2262, 4, 1)
        expected[4][5] = expected[4][6] = datetime(2262, 1, 1)
        expected[5][2] = expected[5][3] = expected[5][4] = \
            datetime(1677, 10, 1)
        expected[5][5] = expected[5][6] = datetime(1678, 1, 1)

        expected = DataFrame(expected, columns=columns, dtype=np.object)
        parsed_115 = read_stata(self.dta18_115)
        parsed_117 = read_stata(self.dta18_117)
        tm.assert_frame_equal(expected, parsed_115,
                              check_datetimelike_compat=True)
        tm.assert_frame_equal(expected, parsed_117,
                              check_datetimelike_compat=True)

        date_conversion = {c: c[-2:] for c in columns}
        with tm.ensure_clean() as path:
            expected.index.name = 'index'
            expected.to_stata(path, date_conversion)
            written_and_read_again = self.read_dta(path)
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  expected,
                                  check_datetimelike_compat=True)

    def test_dtype_conversion(self):
        expected = self.read_csv(self.csv15)
        expected['byte_'] = expected['byte_'].astype(np.int8)
        expected['int_'] = expected['int_'].astype(np.int16)
        expected['long_'] = expected['long_'].astype(np.int32)
        expected['float_'] = expected['float_'].astype(np.float32)
        expected['double_'] = expected['double_'].astype(np.float64)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=('%Y-%m-%d',))

        no_conversion = read_stata(self.dta15_117,
                                   convert_dates=True)
        tm.assert_frame_equal(expected, no_conversion)

        conversion = read_stata(self.dta15_117,
                                convert_dates=True,
                                preserve_dtypes=False)

        # read_csv types are the same
        expected = self.read_csv(self.csv15)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=('%Y-%m-%d',))

        tm.assert_frame_equal(expected, conversion)

    def test_drop_column(self):
        expected = self.read_csv(self.csv15)
        expected['byte_'] = expected['byte_'].astype(np.int8)
        expected['int_'] = expected['int_'].astype(np.int16)
        expected['long_'] = expected['long_'].astype(np.int32)
        expected['float_'] = expected['float_'].astype(np.float32)
        expected['double_'] = expected['double_'].astype(np.float64)
        expected['date_td'] = expected['date_td'].apply(datetime.strptime,
                                                        args=('%Y-%m-%d',))

        columns = ['byte_', 'int_', 'long_']
        expected = expected[columns]
        dropped = read_stata(self.dta15_117, convert_dates=True,
                             columns=columns)
        tm.assert_frame_equal(expected, dropped)

        # See PR 10757
        columns = ['int_', 'long_', 'byte_']
        expected = expected[columns]
        reordered = read_stata(self.dta15_117, convert_dates=True,
                               columns=columns)
        tm.assert_frame_equal(expected, reordered)

        msg = "columns contains duplicate entries"
        with pytest.raises(ValueError, match=msg):
            columns = ['byte_', 'byte_']
            read_stata(self.dta15_117, convert_dates=True, columns=columns)

        msg = ("The following columns were not found in the Stata data set:"
               " not_found")
        with pytest.raises(ValueError, match=msg):
            columns = ['byte_', 'int_', 'long_', 'not_found']
            read_stata(self.dta15_117, convert_dates=True, columns=columns)

    @pytest.mark.parametrize('version', [114, 117])
    @pytest.mark.filterwarnings(
        "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch"
    )
    def test_categorical_writing(self, version):
        original = DataFrame.from_records(
            [
                ["one", "ten", "one", "one", "one", 1],
                ["two", "nine", "two", "two", "two", 2],
                ["three", "eight", "three", "three", "three", 3],
                ["four", "seven", 4, "four", "four", 4],
                ["five", "six", 5, np.nan, "five", 5],
                ["six", "five", 6, np.nan, "six", 6],
                ["seven", "four", 7, np.nan, "seven", 7],
                ["eight", "three", 8, np.nan, "eight", 8],
                ["nine", "two", 9, np.nan, "nine", 9],
                ["ten", "one", "ten", np.nan, "ten", 10]
            ],
            columns=['fully_labeled', 'fully_labeled2',
                     'incompletely_labeled', 'labeled_with_missings',
                     'float_labelled', 'unlabeled'])
        expected = original.copy()

        # these are all categoricals
        original = pd.concat([original[col].astype('category')
                              for col in original], axis=1)

        expected['incompletely_labeled'] = expected[
            'incompletely_labeled'].apply(str)
        expected['unlabeled'] = expected['unlabeled'].apply(str)
        expected = pd.concat([expected[col].astype('category')
                              for col in expected], axis=1)
        expected.index.name = 'index'

        with tm.ensure_clean() as path:
            original.to_stata(path, version=version)
            written_and_read_again = self.read_dta(path)
            res = written_and_read_again.set_index('index')
            tm.assert_frame_equal(res, expected, check_categorical=False)

    def test_categorical_warnings_and_errors(self):
        # Warning for non-string labels
        # Error for labels too long
        original = pd.DataFrame.from_records(
            [['a' * 10000],
             ['b' * 10000],
             ['c' * 10000],
             ['d' * 10000]],
            columns=['Too_long'])

        original = pd.concat([original[col].astype('category')
                              for col in original], axis=1)
        with tm.ensure_clean() as path:
            msg = ("Stata value labels for a single variable must have"
                   r" a combined length less than 32,000 characters\.")
            with pytest.raises(ValueError, match=msg):
                original.to_stata(path)

            original = pd.DataFrame.from_records(
                [['a'],
                 ['b'],
                 ['c'],
                 ['d'],
                 [1]],
                columns=['Too_long'])
            original = pd.concat([original[col].astype('category')
                                  for col in original], axis=1)

            with tm.assert_produces_warning(
                    pd.io.stata.ValueLabelTypeMismatch):
                original.to_stata(path)
                # should get a warning for mixed content

    @pytest.mark.parametrize('version', [114, 117])
    def test_categorical_with_stata_missing_values(self, version):
        values = [['a' + str(i)] for i in range(120)]
        values.append([np.nan])
        original = pd.DataFrame.from_records(values, columns=['many_labels'])
        original = pd.concat([original[col].astype('category')
                              for col in original], axis=1)
        original.index.name = 'index'
        with tm.ensure_clean() as path:
            original.to_stata(path, version=version)
            written_and_read_again = self.read_dta(path)
            res = written_and_read_again.set_index('index')
            tm.assert_frame_equal(res, original, check_categorical=False)

    @pytest.mark.parametrize(
        'file', ['dta19_115', 'dta19_117'])
    def test_categorical_order(self, file):
        # Directly construct using expected codes
        # Format is is_cat, col_name, labels (in order), underlying data
        expected = [
            (True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)),
            (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]),
            (True, 'noorder', ['a', 'b', 'c', 'd', 'e'],
             np.array([2, 1, 4, 0, 3])),
            (True, 'floating', ['a', 'b', 'c', 'd', 'e'], np.arange(0, 5)),
            (True, 'float_missing', ['a', 'd', 'e'],
             np.array([0, 1, 2, -1, -1])),
            (False, 'nolabel', [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
            (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))]
        cols = []
        for is_cat, col, labels, codes in expected:
            if is_cat:
                cols.append((col, pd.Categorical.from_codes(codes, labels)))
            else:
                cols.append((col, pd.Series(labels, dtype=np.float32)))
        expected = DataFrame.from_dict(OrderedDict(cols))

        # Read with and without categoricals, ensure order is identical
        file = getattr(self, file)
        parsed = read_stata(file)
        tm.assert_frame_equal(expected, parsed, check_categorical=False)

        # Check identity of codes
        for col in expected:
            if is_categorical_dtype(expected[col]):
                tm.assert_series_equal(expected[col].cat.codes,
                                       parsed[col].cat.codes)
                tm.assert_index_equal(expected[col].cat.categories,
                                      parsed[col].cat.categories)

    @pytest.mark.parametrize(
        'file', ['dta20_115', 'dta20_117'])
    def test_categorical_sorting(self, file):
        parsed = read_stata(getattr(self, file))

        # Sort based on codes, not strings
        parsed = parsed.sort_values("srh", na_position='first')

        # Don't sort index
        parsed.index = np.arange(parsed.shape[0])
        codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
        categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
        cat = pd.Categorical.from_codes(codes=codes, categories=categories)
        expected = pd.Series(cat, name='srh')
        tm.assert_series_equal(expected, parsed["srh"],
                               check_categorical=False)

    @pytest.mark.parametrize(
        'file', ['dta19_115', 'dta19_117'])
    def test_categorical_ordering(self, file):
        file = getattr(self, file)
        parsed = read_stata(file)

        parsed_unordered = read_stata(file,
                                      order_categoricals=False)
        for col in parsed:
            if not is_categorical_dtype(parsed[col]):
                continue
            assert parsed[col].cat.ordered
            assert not parsed_unordered[col].cat.ordered
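
    # Reading a file in chunks through the iterator interface should
    # reproduce the frame returned by a single read_stata call, modulo
    # dtype differences in the presence of missing values.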
  889. @pytest.mark.parametrize(
  890. 'file', ['dta1_117', 'dta2_117', 'dta3_117',
  891. 'dta4_117', 'dta14_117', 'dta15_117',
  892. 'dta16_117', 'dta17_117', 'dta18_117',
  893. 'dta19_117', 'dta20_117'])
  894. @pytest.mark.parametrize(
  895. 'chunksize', [1, 2])
  896. @pytest.mark.parametrize(
  897. 'convert_categoricals', [False, True])
  898. @pytest.mark.parametrize(
  899. 'convert_dates', [False, True])
  900. def test_read_chunks_117(self, file, chunksize,
  901. convert_categoricals, convert_dates):
  902. fname = getattr(self, file)
  903. with warnings.catch_warnings(record=True) as w:
  904. warnings.simplefilter("always")
  905. parsed = read_stata(
  906. fname,
  907. convert_categoricals=convert_categoricals,
  908. convert_dates=convert_dates)
  909. itr = read_stata(
  910. fname, iterator=True,
  911. convert_categoricals=convert_categoricals,
  912. convert_dates=convert_dates)
  913. pos = 0
  914. for j in range(5):
  915. with warnings.catch_warnings(record=True) as w: # noqa
  916. warnings.simplefilter("always")
  917. try:
  918. chunk = itr.read(chunksize)
  919. except StopIteration:
  920. break
  921. from_frame = parsed.iloc[pos:pos + chunksize, :]
  922. tm.assert_frame_equal(
  923. from_frame, chunk, check_dtype=False,
  924. check_datetimelike_compat=True,
  925. check_categorical=False)
  926. pos += chunksize
  927. itr.close()
  928. def test_iterator(self):
  929. fname = self.dta3_117
  930. parsed = read_stata(fname)
  931. with read_stata(fname, iterator=True) as itr:
  932. chunk = itr.read(5)
  933. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
  934. with read_stata(fname, chunksize=5) as itr:
  935. chunk = list(itr)
  936. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk[0])
  937. with read_stata(fname, iterator=True) as itr:
  938. chunk = itr.get_chunk(5)
  939. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
  940. with read_stata(fname, chunksize=5) as itr:
  941. chunk = itr.get_chunk()
  942. tm.assert_frame_equal(parsed.iloc[0:5, :], chunk)
  943. # GH12153
  944. with read_stata(fname, chunksize=4) as itr:
  945. from_chunks = pd.concat(itr)
  946. tm.assert_frame_equal(parsed, from_chunks)
  947. @pytest.mark.parametrize(
  948. 'file', ['dta2_115', 'dta3_115', 'dta4_115',
  949. 'dta14_115', 'dta15_115', 'dta16_115',
  950. 'dta17_115', 'dta18_115', 'dta19_115',
  951. 'dta20_115'])
  952. @pytest.mark.parametrize(
  953. 'chunksize', [1, 2])
  954. @pytest.mark.parametrize(
  955. 'convert_categoricals', [False, True])
  956. @pytest.mark.parametrize(
  957. 'convert_dates', [False, True])
  958. def test_read_chunks_115(self, file, chunksize,
  959. convert_categoricals, convert_dates):
  960. fname = getattr(self, file)
  961. # Read the whole file
  962. with warnings.catch_warnings(record=True) as w:
  963. warnings.simplefilter("always")
  964. parsed = read_stata(
  965. fname,
  966. convert_categoricals=convert_categoricals,
  967. convert_dates=convert_dates)
  968. # Compare to what we get when reading by chunk
  969. itr = read_stata(
  970. fname, iterator=True,
  971. convert_dates=convert_dates,
  972. convert_categoricals=convert_categoricals)
  973. pos = 0
  974. for j in range(5):
  975. with warnings.catch_warnings(record=True) as w: # noqa
  976. warnings.simplefilter("always")
  977. try:
  978. chunk = itr.read(chunksize)
  979. except StopIteration:
  980. break
  981. from_frame = parsed.iloc[pos:pos + chunksize, :]
  982. tm.assert_frame_equal(
  983. from_frame, chunk, check_dtype=False,
  984. check_datetimelike_compat=True,
  985. check_categorical=False)
  986. pos += chunksize
  987. itr.close()
  988. def test_read_chunks_columns(self):
  989. fname = self.dta3_117
  990. columns = ['quarter', 'cpi', 'm1']
  991. chunksize = 2
  992. parsed = read_stata(fname, columns=columns)
  993. with read_stata(fname, iterator=True) as itr:
  994. pos = 0
  995. for j in range(5):
  996. chunk = itr.read(chunksize, columns=columns)
  997. if chunk is None:
  998. break
  999. from_frame = parsed.iloc[pos:pos + chunksize, :]
  1000. tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
  1001. pos += chunksize
  1002. @pytest.mark.parametrize('version', [114, 117])
  1003. def test_write_variable_labels(self, version):
  1004. # GH 13631, add support for writing variable labels
  1005. original = pd.DataFrame({'a': [1, 2, 3, 4],
  1006. 'b': [1.0, 3.0, 27.0, 81.0],
  1007. 'c': ['Atlanta', 'Birmingham',
  1008. 'Cincinnati', 'Detroit']})
  1009. original.index.name = 'index'
  1010. variable_labels = {'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'}
  1011. with tm.ensure_clean() as path:
  1012. original.to_stata(path,
  1013. variable_labels=variable_labels,
  1014. version=version)
  1015. with StataReader(path) as sr:
  1016. read_labels = sr.variable_labels()
  1017. expected_labels = {'index': '',
  1018. 'a': 'City Rank',
  1019. 'b': 'City Exponent',
  1020. 'c': 'City'}
  1021. assert read_labels == expected_labels
  1022. variable_labels['index'] = 'The Index'
  1023. with tm.ensure_clean() as path:
  1024. original.to_stata(path,
  1025. variable_labels=variable_labels,
  1026. version=version)
  1027. with StataReader(path) as sr:
  1028. read_labels = sr.variable_labels()
  1029. assert read_labels == variable_labels
  1030. @pytest.mark.parametrize('version', [114, 117])
  1031. def test_invalid_variable_labels(self, version):
  1032. original = pd.DataFrame({'a': [1, 2, 3, 4],
  1033. 'b': [1.0, 3.0, 27.0, 81.0],
  1034. 'c': ['Atlanta', 'Birmingham',
  1035. 'Cincinnati', 'Detroit']})
  1036. original.index.name = 'index'
  1037. variable_labels = {'a': 'very long' * 10,
  1038. 'b': 'City Exponent',
  1039. 'c': 'City'}
  1040. with tm.ensure_clean() as path:
  1041. msg = "Variable labels must be 80 characters or fewer"
  1042. with pytest.raises(ValueError, match=msg):
  1043. original.to_stata(path,
  1044. variable_labels=variable_labels,
  1045. version=version)
  1046. variable_labels['a'] = u'invalid character Œ'
  1047. with tm.ensure_clean() as path:
  1048. msg = ("Variable labels must contain only characters that can be"
  1049. " encoded in Latin-1")
  1050. with pytest.raises(ValueError, match=msg):
  1051. original.to_stata(path,
  1052. variable_labels=variable_labels,
  1053. version=version)
  1054. def test_write_variable_label_errors(self):
  1055. original = pd.DataFrame({'a': [1, 2, 3, 4],
  1056. 'b': [1.0, 3.0, 27.0, 81.0],
  1057. 'c': ['Atlanta', 'Birmingham',
  1058. 'Cincinnati', 'Detroit']})
  1059. values = [u'\u03A1', u'\u0391',
  1060. u'\u039D', u'\u0394',
  1061. u'\u0391', u'\u03A3']
  1062. variable_labels_utf8 = {'a': 'City Rank',
  1063. 'b': 'City Exponent',
  1064. 'c': u''.join(values)}
  1065. msg = ("Variable labels must contain only characters that can be"
  1066. " encoded in Latin-1")
  1067. with pytest.raises(ValueError, match=msg):
  1068. with tm.ensure_clean() as path:
  1069. original.to_stata(path, variable_labels=variable_labels_utf8)
  1070. variable_labels_long = {'a': 'City Rank',
  1071. 'b': 'City Exponent',
  1072. 'c': 'A very, very, very long variable label '
  1073. 'that is too long for Stata which means '
  1074. 'that it has more than 80 characters'}
  1075. msg = "Variable labels must be 80 characters or fewer"
  1076. with pytest.raises(ValueError, match=msg):
  1077. with tm.ensure_clean() as path:
  1078. original.to_stata(path, variable_labels=variable_labels_long)
  1079. def test_default_date_conversion(self):
  1080. # GH 12259
  1081. dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
  1082. dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
  1083. dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
  1084. original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
  1085. 'strs': ['apple', 'banana', 'cherry'],
  1086. 'dates': dates})
  1087. with tm.ensure_clean() as path:
  1088. original.to_stata(path, write_index=False)
  1089. reread = read_stata(path, convert_dates=True)
  1090. tm.assert_frame_equal(original, reread)
  1091. original.to_stata(path,
  1092. write_index=False,
  1093. convert_dates={'dates': 'tc'})
  1094. direct = read_stata(path, convert_dates=True)
  1095. tm.assert_frame_equal(reread, direct)
  1096. dates_idx = original.columns.tolist().index('dates')
  1097. original.to_stata(path,
  1098. write_index=False,
  1099. convert_dates={dates_idx: 'tc'})
  1100. direct = read_stata(path, convert_dates=True)
  1101. tm.assert_frame_equal(reread, direct)
  1102. def test_unsupported_type(self):
  1103. original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]})
  1104. msg = "Data type complex128 not supported"
  1105. with pytest.raises(NotImplementedError, match=msg):
  1106. with tm.ensure_clean() as path:
  1107. original.to_stata(path)
  1108. def test_unsupported_datetype(self):
  1109. dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
  1110. dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
  1111. dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
  1112. original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
  1113. 'strs': ['apple', 'banana', 'cherry'],
  1114. 'dates': dates})
  1115. msg = "Format %tC not implemented"
  1116. with pytest.raises(NotImplementedError, match=msg):
  1117. with tm.ensure_clean() as path:
  1118. original.to_stata(path, convert_dates={'dates': 'tC'})
  1119. dates = pd.date_range('1-1-1990', periods=3, tz='Asia/Hong_Kong')
  1120. original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
  1121. 'strs': ['apple', 'banana', 'cherry'],
  1122. 'dates': dates})
  1123. with pytest.raises(NotImplementedError):
  1124. with tm.ensure_clean() as path:
  1125. original.to_stata(path)
  1126. def test_repeated_column_labels(self):
  1127. # GH 13923
  1128. msg = (r"Value labels for column ethnicsn are not unique\. The"
  1129. r" repeated labels are:\n\n-+wolof")
  1130. with pytest.raises(ValueError, match=msg):
  1131. read_stata(self.dta23, convert_categoricals=True)
  1132. def test_stata_111(self):
  1133. # 111 is an old version but still used by current versions of
  1134. # SAS when exporting to Stata format. We do not know of any
  1135. # on-line documentation for this version.
  1136. df = read_stata(self.dta24_111)
  1137. original = pd.DataFrame({'y': [1, 1, 1, 1, 1, 0, 0, np.NaN, 0, 0],
  1138. 'x': [1, 2, 1, 3, np.NaN, 4, 3, 5, 1, 6],
  1139. 'w': [2, np.NaN, 5, 2, 4, 4, 3, 1, 2, 3],
  1140. 'z': ['a', 'b', 'c', 'd', 'e', '', 'g', 'h',
  1141. 'i', 'j']})
  1142. original = original[['y', 'x', 'w', 'z']]
  1143. tm.assert_frame_equal(original, df)
  1144. def test_out_of_range_double(self):
  1145. # GH 14618
  1146. df = DataFrame({'ColumnOk': [0.0,
  1147. np.finfo(np.double).eps,
  1148. 4.49423283715579e+307],
  1149. 'ColumnTooBig': [0.0,
  1150. np.finfo(np.double).eps,
  1151. np.finfo(np.double).max]})
  1152. msg = (r"Column ColumnTooBig has a maximum value \(.+\)"
  1153. r" outside the range supported by Stata \(.+\)")
  1154. with pytest.raises(ValueError, match=msg):
  1155. with tm.ensure_clean() as path:
  1156. df.to_stata(path)
  1157. df.loc[2, 'ColumnTooBig'] = np.inf
  1158. msg = ("Column ColumnTooBig has a maximum value of infinity which"
  1159. " is outside the range supported by Stata")
  1160. with pytest.raises(ValueError, match=msg):
  1161. with tm.ensure_clean() as path:
  1162. df.to_stata(path)
    def test_out_of_range_float(self):
        original = DataFrame({'ColumnOk': [0.0,
                                           np.finfo(np.float32).eps,
                                           np.finfo(np.float32).max / 10.0],
                              'ColumnTooBig': [0.0,
                                               np.finfo(np.float32).eps,
                                               np.finfo(np.float32).max]})
        original.index.name = 'index'
        for col in original:
            original[col] = original[col].astype(np.float32)

        with tm.ensure_clean() as path:
            original.to_stata(path)
            reread = read_stata(path)
            original['ColumnTooBig'] = original['ColumnTooBig'].astype(
                np.float64)
            tm.assert_frame_equal(original,
                                  reread.set_index('index'))

        original.loc[2, 'ColumnTooBig'] = np.inf
        msg = ("Column ColumnTooBig has a maximum value of infinity which"
               " is outside the range supported by Stata")
        with pytest.raises(ValueError, match=msg):
            with tm.ensure_clean() as path:
                original.to_stata(path)

    def test_path_pathlib(self):
        df = tm.makeDataFrame()
        df.index.name = 'index'
        reader = lambda x: read_stata(x).set_index('index')
        result = tm.round_trip_pathlib(df.to_stata, reader)
        tm.assert_frame_equal(df, result)

    def test_pickle_path_localpath(self):
        df = tm.makeDataFrame()
        df.index.name = 'index'
        reader = lambda x: read_stata(x).set_index('index')
        result = tm.round_trip_localpath(df.to_stata, reader)
        tm.assert_frame_equal(df, result)
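
    # Categorical columns are written as integer codes plus a value-label
    # table; value_labels() on the reader should expose that table even
    # when the file is opened in iterator mode.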
    @pytest.mark.parametrize('write_index', [True, False])
    def test_value_labels_iterator(self, write_index):
        # GH 16923
        d = {'A': ['B', 'E', 'C', 'A', 'E']}
        df = pd.DataFrame(data=d)
        df['A'] = df['A'].astype('category')
        with tm.ensure_clean() as path:
            df.to_stata(path, write_index=write_index)

            with pd.read_stata(path, iterator=True) as dta_iter:
                value_labels = dta_iter.value_labels()
        assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}}

    def test_set_index(self):
        # GH 17328
        df = tm.makeDataFrame()
        df.index.name = 'index'
        with tm.ensure_clean() as path:
            df.to_stata(path)
            reread = pd.read_stata(path, index_col='index')
        tm.assert_frame_equal(df, reread)

    @pytest.mark.parametrize(
        'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr'])
    def test_date_parsing_ignores_format_details(self, column):
        # GH 17797
        #
        # Test that display formats are ignored when determining if a numeric
        # column is a date value.
        #
        # All date types are stored as numbers, and the format associated
        # with the column denotes both the type of the date and the display
        # format.
        #
        # Stata supports 9 date types, each with distinct units. We test 7
        # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
        # accounts for leap seconds, and %tb relies on Stata's business
        # calendar.
        df = read_stata(self.stata_dates)
        unformatted = df.loc[0, column]
        formatted = df.loc[0, column + "_fmt"]
        assert unformatted == formatted
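
    # dta format 117 (Stata 13) added strLs: long strings stored in a
    # separate GSO block rather than in fixed-width str# fields, which
    # cap at 2045 characters. convert_strl forces shorter columns into
    # the strL representation as well.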
    def test_writer_117(self):
        original = DataFrame(data=[['string', 'object', 1, 1, 1, 1.1, 1.1,
                                    np.datetime64('2003-12-25'),
                                    'a', 'a' * 2045, 'a' * 5000, 'a'],
                                   ['string-1', 'object-1', 1, 1, 1, 1.1, 1.1,
                                    np.datetime64('2003-12-26'),
                                    'b', 'b' * 2045, '', '']
                                   ],
                              columns=['string', 'object', 'int8', 'int16',
                                       'int32', 'float32', 'float64',
                                       'datetime',
                                       's1', 's2045', 'srtl', 'forced_strl'])
        original['object'] = Series(original['object'], dtype=object)
        original['int8'] = Series(original['int8'], dtype=np.int8)
        original['int16'] = Series(original['int16'], dtype=np.int16)
        original['int32'] = original['int32'].astype(np.int32)
        original['float32'] = Series(original['float32'], dtype=np.float32)
        original.index.name = 'index'
        original.index = original.index.astype(np.int32)
        copy = original.copy()
        with tm.ensure_clean() as path:
            original.to_stata(path,
                              convert_dates={'datetime': 'tc'},
                              convert_strl=['forced_strl'],
                              version=117)
            written_and_read_again = self.read_dta(path)
            # original.index is np.int32, read index is np.int64
            tm.assert_frame_equal(written_and_read_again.set_index('index'),
                                  original, check_index_type=False)
        tm.assert_frame_equal(original, copy)
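
    # Column names that are not valid Stata variable names (longer than
    # the 32-character limit, or not strings at all) are renamed on
    # export with an InvalidColumnName warning; the convert_strl request
    # must follow the columns through that renaming.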
    def test_convert_strl_name_swap(self):
        original = DataFrame([['a' * 3000, 'A', 'apple'],
                              ['b' * 1000, 'B', 'banana']],
                             columns=['long1' * 10, 'long', 1])
        original.index.name = 'index'

        with tm.assert_produces_warning(pd.io.stata.InvalidColumnName):
            with tm.ensure_clean() as path:
                original.to_stata(path, convert_strl=['long', 1], version=117)
                reread = self.read_dta(path)
                reread = reread.set_index('index')
                reread.columns = original.columns
                tm.assert_frame_equal(reread, original,
                                      check_index_type=False)

    def test_invalid_date_conversion(self):
        # GH 12259
        dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
                 dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
                 dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
        original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
                                 'strs': ['apple', 'banana', 'cherry'],
                                 'dates': dates})

        with tm.ensure_clean() as path:
            msg = "convert_dates key must be a column or an integer"
            with pytest.raises(ValueError, match=msg):
                original.to_stata(path,
                                  convert_dates={'wrong_name': 'tc'})
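
    # to_stata also accepts an already-open binary buffer: the frame is
    # written to a BytesIO, copied to disk by hand, and read back to
    # confirm the buffer holds a valid dta file.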
    @pytest.mark.parametrize('version', [114, 117])
    def test_nonfile_writing(self, version):
        # GH 21041
        bio = io.BytesIO()
        df = tm.makeDataFrame()
        df.index.name = 'index'
        with tm.ensure_clean() as path:
            df.to_stata(bio, version=version)
            bio.seek(0)
            with open(path, 'wb') as dta:
                dta.write(bio.read())
            reread = pd.read_stata(path, index_col='index')
        tm.assert_frame_equal(df, reread)

    def test_gzip_writing(self):
        # writing version 117 requires seek and cannot be used with gzip
        df = tm.makeDataFrame()
        df.index.name = 'index'
        with tm.ensure_clean() as path:
            with gzip.GzipFile(path, 'wb') as gz:
                df.to_stata(gz, version=114)
            with gzip.GzipFile(path, 'rb') as gz:
                reread = pd.read_stata(gz, index_col='index')
        tm.assert_frame_equal(df, reread)
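
    # dta format 118 (Stata 14 and later) stores text as UTF-8; earlier
    # formats are effectively limited to latin-1 (see test_strl_latin1
    # below).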
    def test_unicode_dta_118(self):
        unicode_df = self.read_dta(self.dta25_118)

        columns = ['utf8', 'latin1', 'ascii', 'utf8_strl', 'ascii_strl']
        values = [[u'ραηδας', u'PÄNDÄS', 'p', u'ραηδας', 'p'],
                  [u'ƤĀńĐąŜ', u'Ö', 'a', u'ƤĀńĐąŜ', 'a'],
                  [u'ᴘᴀᴎᴅᴀS', u'Ü', 'n', u'ᴘᴀᴎᴅᴀS', 'n'],
                  [' ', ' ', 'd', ' ', 'd'],
                  [' ', '', 'a', ' ', 'a'],
                  ['', '', 's', '', 's'],
                  ['', '', ' ', '', ' ']]
        expected = pd.DataFrame(values, columns=columns)

        tm.assert_frame_equal(unicode_df, expected)
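
    # None mixed into a string column is exported as an empty string,
    # so the round trip is compared against fillna('').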
    def test_mixed_string_strl(self):
        # GH 23633
        output = [
            {'mixed': 'string' * 500,
             'number': 0},
            {'mixed': None,
             'number': 1}
        ]
        output = pd.DataFrame(output)
        output.number = output.number.astype('int32')

        with tm.ensure_clean() as path:
            output.to_stata(path, write_index=False, version=117)
            reread = read_stata(path)
            expected = output.fillna('')
            tm.assert_frame_equal(reread, expected)

            # Check strl supports all None (null)
            output.loc[:, 'mixed'] = None
            output.to_stata(path, write_index=False, convert_strl=['mixed'],
                            version=117)
            reread = read_stata(path)
            expected = output.fillna('')
            tm.assert_frame_equal(reread, expected)
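
    # Without convert_strl, a column containing only None has no
    # inferable string type and must be rejected. The expected message
    # is matched against the writer's text verbatim, including its
    # missing space in 'typescannot'.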
    @pytest.mark.parametrize('version', [114, 117])
    def test_all_none_exception(self, version):
        output = [
            {'none': 'none',
             'number': 0},
            {'none': None,
             'number': 1}
        ]
        output = pd.DataFrame(output)
        output.loc[:, 'none'] = None
        with tm.ensure_clean() as path:
            msg = (r"Column `none` cannot be exported\.\n\n"
                   "Only string-like object arrays containing all strings or a"
                   r" mix of strings and None can be exported\. Object arrays"
                   r" containing only null values are prohibited\. Other"
                   " object typescannot be exported and must first be"
                   r" converted to one of the supported types\.")
            with pytest.raises(ValueError, match=msg):
                output.to_stata(path, version=version)
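
    # A string that cannot be encoded must fail cleanly rather than
    # leave a partially written file behind. Python 3 fails on the
    # latin-1 encode; Python 2 fails earlier, on an implicit ascii
    # decode, hence the alternation of the two messages.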
    @pytest.mark.parametrize('version', [114, 117])
    def test_invalid_file_not_written(self, version):
        content = 'Here is one __�__ Another one __·__ Another one __½__'
        df = DataFrame([content], columns=['invalid'])
        expected_exc = UnicodeEncodeError if PY3 else UnicodeDecodeError
        with tm.ensure_clean() as path:
            msg1 = (r"'latin-1' codec can't encode character '\\ufffd'"
                    r" in position 14: ordinal not in range\(256\)")
            msg2 = ("'ascii' codec can't decode byte 0xef in position 14:"
                    r" ordinal not in range\(128\)")
            with pytest.raises(expected_exc, match=r'{}|{}'.format(
                    msg1, msg2)):
                with tm.assert_produces_warning(ResourceWarning):
                    df.to_stata(path)
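
    # In dta 117 each strL is stored in the GSO block as
    #     b'GSO' + v + o + type byte + length + payload
    # where type 0x82 (130) marks a null-terminated string and the
    # length counts the terminator -- hence len(val) == size - 1 below.
    # Reading the single byte after b'\x82' as the size works here only
    # because every payload is shorter than 256 bytes.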
    def test_strl_latin1(self):
        # GH 23573, correct GSO data to reflect correct size
        output = DataFrame([[u'pandas'] * 2, [u'þâÑÐŧ'] * 2],
                           columns=['var_str', 'var_strl'])

        with tm.ensure_clean() as path:
            output.to_stata(path, version=117, convert_strl=['var_strl'])
            with open(path, 'rb') as reread:
                content = reread.read()
                expected = u'þâÑÐŧ'
                assert expected.encode('latin-1') in content
                assert expected.encode('utf-8') in content
                gsos = content.split(b'strls')[1][1:-2]
                for gso in gsos.split(b'GSO')[1:]:
                    val = gso.split(b'\x00')[-2]
                    size = gso[gso.find(b'\x82') + 1]
                    if not PY3:
                        size = ord(size)
                    assert len(val) == size - 1