arffread.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670
  1. # Last Change: Mon Aug 20 08:00 PM 2007 J
  2. from __future__ import division, print_function, absolute_import
  3. import re
  4. import itertools
  5. import datetime
  6. from functools import partial
  7. import numpy as np
  8. from scipy._lib.six import next
  9. """A module to read arff files."""
  10. __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
  11. # An Arff file is basically two parts:
  12. # - header
  13. # - data
  14. #
  15. # A header has each of its components starting with @META, where META is one
  16. # of the keywords (attribute or relation, for now).
  17. # TODO:
  18. # - both integer and reals are treated as numeric -> the integer info
  19. # is lost!
  20. # - Replace ValueError by ParseError or something
  21. # We can now handle the following:
  22. # - numeric and nominal attributes
  23. # - missing values for numeric attributes
  24. r_meta = re.compile(r'^\s*@')
  25. # Match a comment
  26. r_comment = re.compile(r'^%')
  27. # Match an empty line
  28. r_empty = re.compile(r'^\s+$')
  29. # Match a header line, that is a line which starts by @ + a word
  30. r_headerline = re.compile(r'^@\S*')
  31. r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
  32. r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
  33. r_attribute = re.compile(r'^@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
  34. # To get attributes name enclosed with ''
  35. r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
  36. # To get normal attributes
  37. r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
  38. #-------------------------
  39. # Module defined exception
  40. #-------------------------
  41. class ArffError(IOError):
  42. pass
  43. class ParseArffError(ArffError):
  44. pass
  45. #------------------
  46. # Various utilities
  47. #------------------
  48. # An attribute is defined as @attribute name value
  49. def parse_type(attrtype):
  50. """Given an arff attribute value (meta data), returns its type.
  51. Expect the value to be a name."""
  52. uattribute = attrtype.lower().strip()
  53. if uattribute[0] == '{':
  54. return 'nominal'
  55. elif uattribute[:len('real')] == 'real':
  56. return 'numeric'
  57. elif uattribute[:len('integer')] == 'integer':
  58. return 'numeric'
  59. elif uattribute[:len('numeric')] == 'numeric':
  60. return 'numeric'
  61. elif uattribute[:len('string')] == 'string':
  62. return 'string'
  63. elif uattribute[:len('relational')] == 'relational':
  64. return 'relational'
  65. elif uattribute[:len('date')] == 'date':
  66. return 'date'
  67. else:
  68. raise ParseArffError("unknown attribute %s" % uattribute)
  69. def get_nominal(attribute):
  70. """If attribute is nominal, returns a list of the values"""
  71. return attribute.split(',')
  72. def read_data_list(ofile):
  73. """Read each line of the iterable and put it in a list."""
  74. data = [next(ofile)]
  75. if data[0].strip()[0] == '{':
  76. raise ValueError("This looks like a sparse ARFF: not supported yet")
  77. data.extend([i for i in ofile])
  78. return data
  79. def get_ndata(ofile):
  80. """Read the whole file to get number of data attributes."""
  81. data = [next(ofile)]
  82. loc = 1
  83. if data[0].strip()[0] == '{':
  84. raise ValueError("This looks like a sparse ARFF: not supported yet")
  85. for i in ofile:
  86. loc += 1
  87. return loc
  88. def maxnomlen(atrv):
  89. """Given a string containing a nominal type definition, returns the
  90. string len of the biggest component.
  91. A nominal type is defined as seomthing framed between brace ({}).
  92. Parameters
  93. ----------
  94. atrv : str
  95. Nominal type definition
  96. Returns
  97. -------
  98. slen : int
  99. length of longest component
  100. Examples
  101. --------
  102. maxnomlen("{floup, bouga, fl, ratata}") returns 6 (the size of
  103. ratata, the longest nominal value).
  104. >>> maxnomlen("{floup, bouga, fl, ratata}")
  105. 6
  106. """
  107. nomtp = get_nom_val(atrv)
  108. return max(len(i) for i in nomtp)
  109. def get_nom_val(atrv):
  110. """Given a string containing a nominal type, returns a tuple of the
  111. possible values.
  112. A nominal type is defined as something framed between braces ({}).
  113. Parameters
  114. ----------
  115. atrv : str
  116. Nominal type definition
  117. Returns
  118. -------
  119. poss_vals : tuple
  120. possible values
  121. Examples
  122. --------
  123. >>> get_nom_val("{floup, bouga, fl, ratata}")
  124. ('floup', 'bouga', 'fl', 'ratata')
  125. """
  126. r_nominal = re.compile('{(.+)}')
  127. m = r_nominal.match(atrv)
  128. if m:
  129. return tuple(i.strip() for i in m.group(1).split(','))
  130. else:
  131. raise ValueError("This does not look like a nominal string")
  132. def get_date_format(atrv):
  133. r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
  134. m = r_date.match(atrv)
  135. if m:
  136. pattern = m.group(1).strip()
  137. # convert time pattern from Java's SimpleDateFormat to C's format
  138. datetime_unit = None
  139. if "yyyy" in pattern:
  140. pattern = pattern.replace("yyyy", "%Y")
  141. datetime_unit = "Y"
  142. elif "yy":
  143. pattern = pattern.replace("yy", "%y")
  144. datetime_unit = "Y"
  145. if "MM" in pattern:
  146. pattern = pattern.replace("MM", "%m")
  147. datetime_unit = "M"
  148. if "dd" in pattern:
  149. pattern = pattern.replace("dd", "%d")
  150. datetime_unit = "D"
  151. if "HH" in pattern:
  152. pattern = pattern.replace("HH", "%H")
  153. datetime_unit = "h"
  154. if "mm" in pattern:
  155. pattern = pattern.replace("mm", "%M")
  156. datetime_unit = "m"
  157. if "ss" in pattern:
  158. pattern = pattern.replace("ss", "%S")
  159. datetime_unit = "s"
  160. if "z" in pattern or "Z" in pattern:
  161. raise ValueError("Date type attributes with time zone not "
  162. "supported, yet")
  163. if datetime_unit is None:
  164. raise ValueError("Invalid or unsupported date format")
  165. return pattern, datetime_unit
  166. else:
  167. raise ValueError("Invalid or no date format")
  168. def go_data(ofile):
  169. """Skip header.
  170. the first next() call of the returned iterator will be the @data line"""
  171. return itertools.dropwhile(lambda x: not r_datameta.match(x), ofile)
  172. #----------------
  173. # Parsing header
  174. #----------------
  175. def tokenize_attribute(iterable, attribute):
  176. """Parse a raw string in header (eg starts by @attribute).
  177. Given a raw string attribute, try to get the name and type of the
  178. attribute. Constraints:
  179. * The first line must start with @attribute (case insensitive, and
  180. space like characters before @attribute are allowed)
  181. * Works also if the attribute is spread on multilines.
  182. * Works if empty lines or comments are in between
  183. Parameters
  184. ----------
  185. attribute : str
  186. the attribute string.
  187. Returns
  188. -------
  189. name : str
  190. name of the attribute
  191. value : str
  192. value of the attribute
  193. next : str
  194. next line to be parsed
  195. Examples
  196. --------
  197. If attribute is a string defined in python as r"floupi real", will
  198. return floupi as name, and real as value.
  199. >>> iterable = iter([0] * 10) # dummy iterator
  200. >>> tokenize_attribute(iterable, r"@attribute floupi real")
  201. ('floupi', 'real', 0)
  202. If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
  203. and real as value.
  204. >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ")
  205. ('floupi 2', 'real', 0)
  206. """
  207. sattr = attribute.strip()
  208. mattr = r_attribute.match(sattr)
  209. if mattr:
  210. # atrv is everything after @attribute
  211. atrv = mattr.group(1)
  212. if r_comattrval.match(atrv):
  213. name, type = tokenize_single_comma(atrv)
  214. next_item = next(iterable)
  215. elif r_wcomattrval.match(atrv):
  216. name, type = tokenize_single_wcomma(atrv)
  217. next_item = next(iterable)
  218. else:
  219. # Not sure we should support this, as it does not seem supported by
  220. # weka.
  221. raise ValueError("multi line not supported yet")
  222. #name, type, next_item = tokenize_multilines(iterable, atrv)
  223. else:
  224. raise ValueError("First line unparsable: %s" % sattr)
  225. if type == 'relational':
  226. raise ValueError("relational attributes not supported yet")
  227. return name, type, next_item
  228. def tokenize_single_comma(val):
  229. # XXX we match twice the same string (here and at the caller level). It is
  230. # stupid, but it is easier for now...
  231. m = r_comattrval.match(val)
  232. if m:
  233. try:
  234. name = m.group(1).strip()
  235. type = m.group(2).strip()
  236. except IndexError:
  237. raise ValueError("Error while tokenizing attribute")
  238. else:
  239. raise ValueError("Error while tokenizing single %s" % val)
  240. return name, type
  241. def tokenize_single_wcomma(val):
  242. # XXX we match twice the same string (here and at the caller level). It is
  243. # stupid, but it is easier for now...
  244. m = r_wcomattrval.match(val)
  245. if m:
  246. try:
  247. name = m.group(1).strip()
  248. type = m.group(2).strip()
  249. except IndexError:
  250. raise ValueError("Error while tokenizing attribute")
  251. else:
  252. raise ValueError("Error while tokenizing single %s" % val)
  253. return name, type
  254. def read_header(ofile):
  255. """Read the header of the iterable ofile."""
  256. i = next(ofile)
  257. # Pass first comments
  258. while r_comment.match(i):
  259. i = next(ofile)
  260. # Header is everything up to DATA attribute ?
  261. relation = None
  262. attributes = []
  263. while not r_datameta.match(i):
  264. m = r_headerline.match(i)
  265. if m:
  266. isattr = r_attribute.match(i)
  267. if isattr:
  268. name, type, i = tokenize_attribute(ofile, i)
  269. attributes.append((name, type))
  270. else:
  271. isrel = r_relation.match(i)
  272. if isrel:
  273. relation = isrel.group(1)
  274. else:
  275. raise ValueError("Error parsing line %s" % i)
  276. i = next(ofile)
  277. else:
  278. i = next(ofile)
  279. return relation, attributes
  280. #--------------------
  281. # Parsing actual data
  282. #--------------------
  283. def safe_float(x):
  284. """given a string x, convert it to a float. If the stripped string is a ?,
  285. return a Nan (missing value).
  286. Parameters
  287. ----------
  288. x : str
  289. string to convert
  290. Returns
  291. -------
  292. f : float
  293. where float can be nan
  294. Examples
  295. --------
  296. >>> safe_float('1')
  297. 1.0
  298. >>> safe_float('1\\n')
  299. 1.0
  300. >>> safe_float('?\\n')
  301. nan
  302. """
  303. if '?' in x:
  304. return np.nan
  305. else:
  306. return float(x)
  307. def safe_nominal(value, pvalue):
  308. svalue = value.strip()
  309. if svalue in pvalue:
  310. return svalue
  311. elif svalue == '?':
  312. return svalue
  313. else:
  314. raise ValueError("%s value not in %s" % (str(svalue), str(pvalue)))
  315. def safe_date(value, date_format, datetime_unit):
  316. date_str = value.strip().strip("'").strip('"')
  317. if date_str == '?':
  318. return np.datetime64('NaT', datetime_unit)
  319. else:
  320. dt = datetime.datetime.strptime(date_str, date_format)
  321. return np.datetime64(dt).astype("datetime64[%s]" % datetime_unit)
  322. class MetaData(object):
  323. """Small container to keep useful information on a ARFF dataset.
  324. Knows about attributes names and types.
  325. Examples
  326. --------
  327. ::
  328. data, meta = loadarff('iris.arff')
  329. # This will print the attributes names of the iris.arff dataset
  330. for i in meta:
  331. print(i)
  332. # This works too
  333. meta.names()
  334. # Getting attribute type
  335. types = meta.types()
  336. Notes
  337. -----
  338. Also maintains the list of attributes in order, i.e. doing for i in
  339. meta, where meta is an instance of MetaData, will return the
  340. different attribute names in the order they were defined.
  341. """
  342. def __init__(self, rel, attr):
  343. self.name = rel
  344. # We need the dictionary to be ordered
  345. # XXX: may be better to implement an ordered dictionary
  346. self._attributes = {}
  347. self._attrnames = []
  348. for name, value in attr:
  349. tp = parse_type(value)
  350. self._attrnames.append(name)
  351. if tp == 'nominal':
  352. self._attributes[name] = (tp, get_nom_val(value))
  353. elif tp == 'date':
  354. self._attributes[name] = (tp, get_date_format(value)[0])
  355. else:
  356. self._attributes[name] = (tp, None)
  357. def __repr__(self):
  358. msg = ""
  359. msg += "Dataset: %s\n" % self.name
  360. for i in self._attrnames:
  361. msg += "\t%s's type is %s" % (i, self._attributes[i][0])
  362. if self._attributes[i][1]:
  363. msg += ", range is %s" % str(self._attributes[i][1])
  364. msg += '\n'
  365. return msg
  366. def __iter__(self):
  367. return iter(self._attrnames)
  368. def __getitem__(self, key):
  369. return self._attributes[key]
  370. def names(self):
  371. """Return the list of attribute names."""
  372. return self._attrnames
  373. def types(self):
  374. """Return the list of attribute types."""
  375. attr_types = [self._attributes[name][0] for name in self._attrnames]
  376. return attr_types
  377. def loadarff(f):
  378. """
  379. Read an arff file.
  380. The data is returned as a record array, which can be accessed much like
  381. a dictionary of numpy arrays. For example, if one of the attributes is
  382. called 'pressure', then its first 10 data points can be accessed from the
  383. ``data`` record array like so: ``data['pressure'][0:10]``
  384. Parameters
  385. ----------
  386. f : file-like or str
  387. File-like object to read from, or filename to open.
  388. Returns
  389. -------
  390. data : record array
  391. The data of the arff file, accessible by attribute names.
  392. meta : `MetaData`
  393. Contains information about the arff file such as name and
  394. type of attributes, the relation (name of the dataset), etc...
  395. Raises
  396. ------
  397. ParseArffError
  398. This is raised if the given file is not ARFF-formatted.
  399. NotImplementedError
  400. The ARFF file has an attribute which is not supported yet.
  401. Notes
  402. -----
  403. This function should be able to read most arff files. Not
  404. implemented functionality include:
  405. * date type attributes
  406. * string type attributes
  407. It can read files with numeric and nominal attributes. It cannot read
  408. files with sparse data ({} in the file). However, this function can
  409. read files with missing data (? in the file), representing the data
  410. points as NaNs.
  411. Examples
  412. --------
  413. >>> from scipy.io import arff
  414. >>> from io import StringIO
  415. >>> content = \"\"\"
  416. ... @relation foo
  417. ... @attribute width numeric
  418. ... @attribute height numeric
  419. ... @attribute color {red,green,blue,yellow,black}
  420. ... @data
  421. ... 5.0,3.25,blue
  422. ... 4.5,3.75,green
  423. ... 3.0,4.00,red
  424. ... \"\"\"
  425. >>> f = StringIO(content)
  426. >>> data, meta = arff.loadarff(f)
  427. >>> data
  428. array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
  429. dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
  430. >>> meta
  431. Dataset: foo
  432. \twidth's type is numeric
  433. \theight's type is numeric
  434. \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')
  435. """
  436. if hasattr(f, 'read'):
  437. ofile = f
  438. else:
  439. ofile = open(f, 'rt')
  440. try:
  441. return _loadarff(ofile)
  442. finally:
  443. if ofile is not f: # only close what we opened
  444. ofile.close()
  445. def _loadarff(ofile):
  446. # Parse the header file
  447. try:
  448. rel, attr = read_header(ofile)
  449. except ValueError as e:
  450. msg = "Error while parsing header, error was: " + str(e)
  451. raise ParseArffError(msg)
  452. # Check whether we have a string attribute (not supported yet)
  453. hasstr = False
  454. for name, value in attr:
  455. type = parse_type(value)
  456. if type == 'string':
  457. hasstr = True
  458. meta = MetaData(rel, attr)
  459. # XXX The following code is not great
  460. # Build the type descriptor descr and the list of convertors to convert
  461. # each attribute to the suitable type (which should match the one in
  462. # descr).
  463. # This can be used once we want to support integer as integer values and
  464. # not as numeric anymore (using masked arrays ?).
  465. acls2dtype = {'real': float, 'integer': float, 'numeric': float}
  466. acls2conv = {'real': safe_float,
  467. 'integer': safe_float,
  468. 'numeric': safe_float}
  469. descr = []
  470. convertors = []
  471. if not hasstr:
  472. for name, value in attr:
  473. type = parse_type(value)
  474. if type == 'date':
  475. date_format, datetime_unit = get_date_format(value)
  476. descr.append((name, "datetime64[%s]" % datetime_unit))
  477. convertors.append(partial(safe_date, date_format=date_format,
  478. datetime_unit=datetime_unit))
  479. elif type == 'nominal':
  480. n = maxnomlen(value)
  481. descr.append((name, 'S%d' % n))
  482. pvalue = get_nom_val(value)
  483. convertors.append(partial(safe_nominal, pvalue=pvalue))
  484. else:
  485. descr.append((name, acls2dtype[type]))
  486. convertors.append(safe_float)
  487. #dc.append(acls2conv[type])
  488. #sdescr.append((name, acls2sdtype[type]))
  489. else:
  490. # How to support string efficiently ? Ideally, we should know the max
  491. # size of the string before allocating the numpy array.
  492. raise NotImplementedError("String attributes not supported yet, sorry")
  493. ni = len(convertors)
  494. def generator(row_iter, delim=','):
  495. # TODO: this is where we are spending times (~80%). I think things
  496. # could be made more efficiently:
  497. # - We could for example "compile" the function, because some values
  498. # do not change here.
  499. # - The function to convert a line to dtyped values could also be
  500. # generated on the fly from a string and be executed instead of
  501. # looping.
  502. # - The regex are overkill: for comments, checking that a line starts
  503. # by % should be enough and faster, and for empty lines, same thing
  504. # --> this does not seem to change anything.
  505. # 'compiling' the range since it does not change
  506. # Note, I have already tried zipping the converters and
  507. # row elements and got slightly worse performance.
  508. elems = list(range(ni))
  509. for raw in row_iter:
  510. # We do not abstract skipping comments and empty lines for
  511. # performance reasons.
  512. if r_comment.match(raw) or r_empty.match(raw):
  513. continue
  514. row = raw.split(delim)
  515. yield tuple([convertors[i](row[i]) for i in elems])
  516. a = generator(ofile)
  517. # No error should happen here: it is a bug otherwise
  518. data = np.fromiter(a, descr)
  519. return data, meta
  520. #-----
  521. # Misc
  522. #-----
  523. def basic_stats(data):
  524. nbfac = data.size * 1. / (data.size - 1)
  525. return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac
  526. def print_attribute(name, tp, data):
  527. type = tp[0]
  528. if type == 'numeric' or type == 'real' or type == 'integer':
  529. min, max, mean, std = basic_stats(data)
  530. print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std))
  531. else:
  532. msg = name + ",{"
  533. for i in range(len(tp[1])-1):
  534. msg += tp[1][i] + ","
  535. msg += tp[1][-1]
  536. msg += "}"
  537. print(msg)
  538. def test_weka(filename):
  539. data, meta = loadarff(filename)
  540. print(len(data.dtype))
  541. print(data.size)
  542. for i in meta:
  543. print_attribute(i, meta[i], data[i])
  544. # make sure nose does not find this as a test
  545. test_weka.__test__ = False
  546. if __name__ == '__main__':
  547. import sys
  548. filename = sys.argv[1]
  549. test_weka(filename)