test_arffread.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. from __future__ import division, print_function, absolute_import
  2. import datetime
  3. import os
  4. import sys
  5. from os.path import join as pjoin
  6. if sys.version_info[0] >= 3:
  7. from io import StringIO
  8. else:
  9. from cStringIO import StringIO
  10. import numpy as np
  11. from numpy.testing import (assert_array_almost_equal,
  12. assert_array_equal, assert_equal, assert_)
  13. import pytest
  14. from pytest import raises as assert_raises
  15. from scipy.io.arff.arffread import loadarff
  16. from scipy.io.arff.arffread import read_header, parse_type, ParseArffError
  17. data_path = pjoin(os.path.dirname(__file__), 'data')
  18. test1 = pjoin(data_path, 'test1.arff')
  19. test2 = pjoin(data_path, 'test2.arff')
  20. test3 = pjoin(data_path, 'test3.arff')
  21. test4 = pjoin(data_path, 'test4.arff')
  22. test5 = pjoin(data_path, 'test5.arff')
  23. test6 = pjoin(data_path, 'test6.arff')
  24. test7 = pjoin(data_path, 'test7.arff')
  25. test8 = pjoin(data_path, 'test8.arff')
  26. expect4_data = [(0.1, 0.2, 0.3, 0.4, 'class1'),
  27. (-0.1, -0.2, -0.3, -0.4, 'class2'),
  28. (1, 2, 3, 4, 'class3')]
  29. expected_types = ['numeric', 'numeric', 'numeric', 'numeric', 'nominal']
  30. missing = pjoin(data_path, 'missing.arff')
  31. expect_missing_raw = np.array([[1, 5], [2, 4], [np.nan, np.nan]])
  32. expect_missing = np.empty(3, [('yop', float), ('yap', float)])
  33. expect_missing['yop'] = expect_missing_raw[:, 0]
  34. expect_missing['yap'] = expect_missing_raw[:, 1]
  35. class TestData(object):
  36. def test1(self):
  37. # Parsing trivial file with nothing.
  38. self._test(test4)
  39. def test2(self):
  40. # Parsing trivial file with some comments in the data section.
  41. self._test(test5)
  42. def test3(self):
  43. # Parsing trivial file with nominal attribute of 1 character.
  44. self._test(test6)
  45. def _test(self, test_file):
  46. data, meta = loadarff(test_file)
  47. for i in range(len(data)):
  48. for j in range(4):
  49. assert_array_almost_equal(expect4_data[i][j], data[i][j])
  50. assert_equal(meta.types(), expected_types)
  51. def test_filelike(self):
  52. # Test reading from file-like object (StringIO)
  53. f1 = open(test1)
  54. data1, meta1 = loadarff(f1)
  55. f1.close()
  56. f2 = open(test1)
  57. data2, meta2 = loadarff(StringIO(f2.read()))
  58. f2.close()
  59. assert_(data1 == data2)
  60. assert_(repr(meta1) == repr(meta2))
  61. @pytest.mark.skipif(sys.version_info < (3, 6),
  62. reason='Passing path-like objects to IO functions requires Python >= 3.6')
  63. def test_path(self):
  64. # Test reading from `pathlib.Path` object
  65. from pathlib import Path
  66. with open(test1) as f1:
  67. data1, meta1 = loadarff(f1)
  68. data2, meta2 = loadarff(Path(test1))
  69. assert_(data1 == data2)
  70. assert_(repr(meta1) == repr(meta2))
  71. class TestMissingData(object):
  72. def test_missing(self):
  73. data, meta = loadarff(missing)
  74. for i in ['yop', 'yap']:
  75. assert_array_almost_equal(data[i], expect_missing[i])
  76. class TestNoData(object):
  77. def test_nodata(self):
  78. # The file nodata.arff has no data in the @DATA section.
  79. # Reading it should result in an array with length 0.
  80. nodata_filename = os.path.join(data_path, 'nodata.arff')
  81. data, meta = loadarff(nodata_filename)
  82. expected_dtype = np.dtype([('sepallength', '<f8'),
  83. ('sepalwidth', '<f8'),
  84. ('petallength', '<f8'),
  85. ('petalwidth', '<f8'),
  86. ('class', 'S15')])
  87. assert_equal(data.dtype, expected_dtype)
  88. assert_equal(data.size, 0)
  89. class TestHeader(object):
  90. def test_type_parsing(self):
  91. # Test parsing type of attribute from their value.
  92. ofile = open(test2)
  93. rel, attrs = read_header(ofile)
  94. ofile.close()
  95. expected = ['numeric', 'numeric', 'numeric', 'numeric', 'numeric',
  96. 'numeric', 'string', 'string', 'nominal', 'nominal']
  97. for i in range(len(attrs)):
  98. assert_(parse_type(attrs[i][1]) == expected[i])
  99. def test_badtype_parsing(self):
  100. # Test parsing wrong type of attribute from their value.
  101. ofile = open(test3)
  102. rel, attrs = read_header(ofile)
  103. ofile.close()
  104. for name, value in attrs:
  105. assert_raises(ParseArffError, parse_type, value)
  106. def test_fullheader1(self):
  107. # Parsing trivial header with nothing.
  108. ofile = open(test1)
  109. rel, attrs = read_header(ofile)
  110. ofile.close()
  111. # Test relation
  112. assert_(rel == 'test1')
  113. # Test numerical attributes
  114. assert_(len(attrs) == 5)
  115. for i in range(4):
  116. assert_(attrs[i][0] == 'attr%d' % i)
  117. assert_(attrs[i][1] == 'REAL')
  118. # Test nominal attribute
  119. assert_(attrs[4][0] == 'class')
  120. assert_(attrs[4][1] == '{class0, class1, class2, class3}')
  121. def test_dateheader(self):
  122. ofile = open(test7)
  123. rel, attrs = read_header(ofile)
  124. ofile.close()
  125. assert_(rel == 'test7')
  126. assert_(len(attrs) == 5)
  127. assert_(attrs[0][0] == 'attr_year')
  128. assert_(attrs[0][1] == 'DATE yyyy')
  129. assert_(attrs[1][0] == 'attr_month')
  130. assert_(attrs[1][1] == 'DATE yyyy-MM')
  131. assert_(attrs[2][0] == 'attr_date')
  132. assert_(attrs[2][1] == 'DATE yyyy-MM-dd')
  133. assert_(attrs[3][0] == 'attr_datetime_local')
  134. assert_(attrs[3][1] == 'DATE "yyyy-MM-dd HH:mm"')
  135. assert_(attrs[4][0] == 'attr_datetime_missing')
  136. assert_(attrs[4][1] == 'DATE "yyyy-MM-dd HH:mm"')
  137. def test_dateheader_unsupported(self):
  138. ofile = open(test8)
  139. rel, attrs = read_header(ofile)
  140. ofile.close()
  141. assert_(rel == 'test8')
  142. assert_(len(attrs) == 2)
  143. assert_(attrs[0][0] == 'attr_datetime_utc')
  144. assert_(attrs[0][1] == 'DATE "yyyy-MM-dd HH:mm Z"')
  145. assert_(attrs[1][0] == 'attr_datetime_full')
  146. assert_(attrs[1][1] == 'DATE "yy-MM-dd HH:mm:ss z"')
  147. class TestDateAttribute(object):
  148. def setup_method(self):
  149. self.data, self.meta = loadarff(test7)
  150. def test_year_attribute(self):
  151. expected = np.array([
  152. '1999',
  153. '2004',
  154. '1817',
  155. '2100',
  156. '2013',
  157. '1631'
  158. ], dtype='datetime64[Y]')
  159. assert_array_equal(self.data["attr_year"], expected)
  160. def test_month_attribute(self):
  161. expected = np.array([
  162. '1999-01',
  163. '2004-12',
  164. '1817-04',
  165. '2100-09',
  166. '2013-11',
  167. '1631-10'
  168. ], dtype='datetime64[M]')
  169. assert_array_equal(self.data["attr_month"], expected)
  170. def test_date_attribute(self):
  171. expected = np.array([
  172. '1999-01-31',
  173. '2004-12-01',
  174. '1817-04-28',
  175. '2100-09-10',
  176. '2013-11-30',
  177. '1631-10-15'
  178. ], dtype='datetime64[D]')
  179. assert_array_equal(self.data["attr_date"], expected)
  180. def test_datetime_local_attribute(self):
  181. expected = np.array([
  182. datetime.datetime(year=1999, month=1, day=31, hour=0, minute=1),
  183. datetime.datetime(year=2004, month=12, day=1, hour=23, minute=59),
  184. datetime.datetime(year=1817, month=4, day=28, hour=13, minute=0),
  185. datetime.datetime(year=2100, month=9, day=10, hour=12, minute=0),
  186. datetime.datetime(year=2013, month=11, day=30, hour=4, minute=55),
  187. datetime.datetime(year=1631, month=10, day=15, hour=20, minute=4)
  188. ], dtype='datetime64[m]')
  189. assert_array_equal(self.data["attr_datetime_local"], expected)
  190. def test_datetime_missing(self):
  191. expected = np.array([
  192. 'nat',
  193. '2004-12-01T23:59',
  194. 'nat',
  195. 'nat',
  196. '2013-11-30T04:55',
  197. '1631-10-15T20:04'
  198. ], dtype='datetime64[m]')
  199. assert_array_equal(self.data["attr_datetime_missing"], expected)
  200. def test_datetime_timezone(self):
  201. assert_raises(ValueError, loadarff, test8)