- """A module to read arff files."""
- # Last Change: Mon Aug 20 08:00 PM 2007 J
- from __future__ import division, print_function, absolute_import
- import re
- import itertools
- import datetime
- from functools import partial
- import numpy as np
- from scipy._lib.six import next
- __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
- # An Arff file is basically two parts:
- # - header
- # - data
- #
- # A header has each of its components starting with @META where META is one
- # of the keywords (attribute or relation, for now).
- # TODO:
- # - both integer and reals are treated as numeric -> the integer info
- # is lost!
- # - Replace ValueError by ParseError or something
- # We can now handle the following:
- # - numeric, nominal and date attributes
- # - missing values for numeric attributes
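- #
- # For illustration, a minimal (hypothetical) ARFF file looks like this;
- # everything before @data is the header, the rest is the data section:
- #
- #   % a comment
- #   @relation foo
- #   @attribute width numeric
- #   @attribute color {red,green,blue}
- #   @data
- #   5.0,blue
- #   4.5,green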
- r_meta = re.compile(r'^\s*@')
- # Match a comment
- r_comment = re.compile(r'^%')
- # Match an empty line
- r_empty = re.compile(r'^\s+$')
- # Match a header line, that is, a line which starts with @ followed by a word
- r_headerline = re.compile(r'^@\S*')
- r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
- r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
- r_attribute = re.compile(r'^@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
- # To get attribute names enclosed in ''
- r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
- # To get normal attributes
- r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
- #-------------------------
- # Module defined exception
- #-------------------------
- class ArffError(IOError):
- pass
- class ParseArffError(ArffError):
- pass
- #------------------
- # Various utilities
- #------------------
- # An attribute is defined as @attribute name value
- def parse_type(attrtype):
- """Given an arff attribute value (meta data), returns its type.
- Expect the value to be a name."""
- uattribute = attrtype.lower().strip()
- if uattribute[0] == '{':
- return 'nominal'
- elif uattribute.startswith('real'):
- return 'numeric'
- elif uattribute.startswith('integer'):
- return 'numeric'
- elif uattribute.startswith('numeric'):
- return 'numeric'
- elif uattribute.startswith('string'):
- return 'string'
- elif uattribute.startswith('relational'):
- return 'relational'
- elif uattribute.startswith('date'):
- return 'date'
- else:
- raise ParseArffError("unknown attribute %s" % uattribute)
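- # A quick sketch of the mapping performed by parse_type (doctest-style;
- # the outputs follow directly from the branches above):
- # >>> parse_type('real')
- # 'numeric'
- # >>> parse_type('{red,green,blue}')
- # 'nominal'
- # >>> parse_type("date 'yyyy-MM-dd'")
- # 'date'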
- def get_nominal(attribute):
- """If attribute is nominal, returns a list of the values"""
- return attribute.split(',')
- def read_data_list(ofile):
- """Read each line of the iterable and put it in a list."""
- data = [next(ofile)]
- if data[0].strip()[0] == '{':
- raise ValueError("This looks like a sparse ARFF: not supported yet")
- data.extend([i for i in ofile])
- return data
- def get_ndata(ofile):
- """Read the whole file to get number of data attributes."""
- data = [next(ofile)]
- loc = 1
- if data[0].strip()[0] == '{':
- raise ValueError("This looks like a sparse ARFF: not supported yet")
- for i in ofile:
- loc += 1
- return loc
- def maxnomlen(atrv):
- """Given a string containing a nominal type definition, returns the
- string len of the biggest component.
- A nominal type is defined as seomthing framed between brace ({}).
- Parameters
- ----------
- atrv : str
- Nominal type definition
- Returns
- -------
- slen : int
- length of longest component
- Examples
- --------
- maxnomlen("{floup, bouga, fl, ratata}") returns 6 (the size of
- ratata, the longest nominal value).
- >>> maxnomlen("{floup, bouga, fl, ratata}")
- 6
- """
- nomtp = get_nom_val(atrv)
- return max(len(i) for i in nomtp)
- def get_nom_val(atrv):
- """Given a string containing a nominal type, returns a tuple of the
- possible values.
- A nominal type is defined as something framed between braces ({}).
- Parameters
- ----------
- atrv : str
- Nominal type definition
- Returns
- -------
- poss_vals : tuple
- possible values
- Examples
- --------
- >>> get_nom_val("{floup, bouga, fl, ratata}")
- ('floup', 'bouga', 'fl', 'ratata')
- """
- r_nominal = re.compile('{(.+)}')
- m = r_nominal.match(atrv)
- if m:
- return tuple(i.strip() for i in m.group(1).split(','))
- else:
- raise ValueError("This does not look like a nominal string")
- def get_date_format(atrv):
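- """Given a date attribute declaration (e.g. "date 'yyyy-MM-dd HH:mm:ss'"),
- return the corresponding C/strptime format string and the numpy
- datetime64 unit matching its precision.
- This is a best-effort translation of Weka's Java SimpleDateFormat
- patterns; only the substitutions handled below are supported.
- Examples
- --------
- >>> get_date_format("date 'yyyy-MM-dd HH:mm:ss'")
- ('%Y-%m-%d %H:%M:%S', 's')
- """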
- r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
- m = r_date.match(atrv)
- if m:
- pattern = m.group(1).strip()
- # convert time pattern from Java's SimpleDateFormat to C's format
- datetime_unit = None
- if "yyyy" in pattern:
- pattern = pattern.replace("yyyy", "%Y")
- datetime_unit = "Y"
- elif "yy":
- pattern = pattern.replace("yy", "%y")
- datetime_unit = "Y"
- if "MM" in pattern:
- pattern = pattern.replace("MM", "%m")
- datetime_unit = "M"
- if "dd" in pattern:
- pattern = pattern.replace("dd", "%d")
- datetime_unit = "D"
- if "HH" in pattern:
- pattern = pattern.replace("HH", "%H")
- datetime_unit = "h"
- if "mm" in pattern:
- pattern = pattern.replace("mm", "%M")
- datetime_unit = "m"
- if "ss" in pattern:
- pattern = pattern.replace("ss", "%S")
- datetime_unit = "s"
- if "z" in pattern or "Z" in pattern:
- raise ValueError("Date type attributes with time zone not "
- "supported, yet")
- if datetime_unit is None:
- raise ValueError("Invalid or unsupported date format")
- return pattern, datetime_unit
- else:
- raise ValueError("Invalid or no date format")
- def go_data(ofile):
- """Skip header.
- the first next() call of the returned iterator will be the @data line"""
- return itertools.dropwhile(lambda x: not r_datameta.match(x), ofile)
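- # A small sketch of go_data on a hypothetical stream (everything before
- # the @data line is dropped):
- # >>> lines = iter(["% comment\n", "@relation foo\n", "@data\n", "1,2\n"])
- # >>> list(go_data(lines))
- # ['@data\n', '1,2\n']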
- #----------------
- # Parsing header
- #----------------
- def tokenize_attribute(iterable, attribute):
- """Parse a raw string in header (eg starts by @attribute).
- Given a raw string attribute, try to get the name and type of the
- attribute. Constraints:
- * The first line must start with @attribute (case insensitive, and
- space like characters before @attribute are allowed)
- * Works also if the attribute is spread on multilines.
- * Works if empty lines or comments are in between
- Parameters
- ----------
- iterable : iterator
- the iterator from which the header lines are read (used to fetch the
- line that follows the attribute)
- attribute : str
- the attribute string.
- Returns
- -------
- name : str
- name of the attribute
- value : str
- value of the attribute
- next : str
- next line to be parsed
- Examples
- --------
- For an attribute defined in Python as r"@attribute floupi real", this
- returns "floupi" as the name and "real" as the value.
- >>> iterable = iter([0] * 10) # dummy iterator
- >>> tokenize_attribute(iterable, r"@attribute floupi real")
- ('floupi', 'real', 0)
- For r"@attribute 'floupi 2' real", this returns "floupi 2" as the name
- and "real" as the value.
- >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ")
- ('floupi 2', 'real', 0)
- """
- sattr = attribute.strip()
- mattr = r_attribute.match(sattr)
- if mattr:
- # atrv is everything after @attribute
- atrv = mattr.group(1)
- if r_comattrval.match(atrv):
- name, type = tokenize_single_comma(atrv)
- next_item = next(iterable)
- elif r_wcomattrval.match(atrv):
- name, type = tokenize_single_wcomma(atrv)
- next_item = next(iterable)
- else:
- # Not sure we should support this, as it does not seem supported by
- # weka.
- raise ValueError("multi line not supported yet")
- #name, type, next_item = tokenize_multilines(iterable, atrv)
- else:
- raise ValueError("First line unparsable: %s" % sattr)
- if type == 'relational':
- raise ValueError("relational attributes not supported yet")
- return name, type, next_item
- def tokenize_single_comma(val):
- # XXX we match the same string twice (here and at the caller level). It is
- # redundant, but it is easier for now...
- m = r_comattrval.match(val)
- if m:
- try:
- name = m.group(1).strip()
- type = m.group(2).strip()
- except IndexError:
- raise ValueError("Error while tokenizing attribute")
- else:
- raise ValueError("Error while tokenizing single %s" % val)
- return name, type
- def tokenize_single_wcomma(val):
- # XXX we match the same string twice (here and at the caller level). It is
- # redundant, but it is easier for now...
- m = r_wcomattrval.match(val)
- if m:
- try:
- name = m.group(1).strip()
- type = m.group(2).strip()
- except IndexError:
- raise ValueError("Error while tokenizing attribute")
- else:
- raise ValueError("Error while tokenizing single %s" % val)
- return name, type
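- # A sketch of the two tokenizers on typical declarations (quoted names go
- # through tokenize_single_comma, plain names through tokenize_single_wcomma):
- # >>> tokenize_single_comma("'floupi 2' real")
- # ('floupi 2', 'real')
- # >>> tokenize_single_wcomma("floupi real")
- # ('floupi', 'real')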
- def read_header(ofile):
- """Read the header of the iterable ofile."""
- i = next(ofile)
- # Pass first comments
- while r_comment.match(i):
- i = next(ofile)
- # Header is everything up to the @data attribute.
- relation = None
- attributes = []
- while not r_datameta.match(i):
- m = r_headerline.match(i)
- if m:
- isattr = r_attribute.match(i)
- if isattr:
- name, type, i = tokenize_attribute(ofile, i)
- attributes.append((name, type))
- else:
- isrel = r_relation.match(i)
- if isrel:
- relation = isrel.group(1)
- else:
- raise ValueError("Error parsing line %s" % i)
- i = next(ofile)
- else:
- i = next(ofile)
- return relation, attributes
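- # A sketch of read_header on a tiny, hypothetical header stream:
- # >>> read_header(iter(["@relation foo\n", "@attribute width numeric\n", "@data\n"]))
- # ('foo', [('width', 'numeric')])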
- #--------------------
- # Parsing actual data
- #--------------------
- def safe_float(x):
- """given a string x, convert it to a float. If the stripped string is a ?,
- return a Nan (missing value).
- Parameters
- ----------
- x : str
- string to convert
- Returns
- -------
- f : float
- the converted value (NaN if the value is missing)
- Examples
- --------
- >>> safe_float('1')
- 1.0
- >>> safe_float('1\\n')
- 1.0
- >>> safe_float('?\\n')
- nan
- """
- if '?' in x:
- return np.nan
- else:
- return float(x)
- def safe_nominal(value, pvalue):
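- """Convert a raw nominal value: return the stripped value if it is one of
- the declared possible values (pvalue), keep the missing marker '?' as-is,
- and raise ValueError otherwise."""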
- svalue = value.strip()
- if svalue in pvalue:
- return svalue
- elif svalue == '?':
- return svalue
- else:
- raise ValueError("%s value not in %s" % (str(svalue), str(pvalue)))
- def safe_date(value, date_format, datetime_unit):
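- """Convert a raw date string into a numpy datetime64 scalar with the
- given unit, mapping the missing marker '?' to NaT."""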
- date_str = value.strip().strip("'").strip('"')
- if date_str == '?':
- return np.datetime64('NaT', datetime_unit)
- else:
- dt = datetime.datetime.strptime(date_str, date_format)
- return np.datetime64(dt).astype("datetime64[%s]" % datetime_unit)
- class MetaData(object):
- """Small container to keep useful information on a ARFF dataset.
- Knows about attributes names and types.
- Examples
- --------
- ::
- data, meta = loadarff('iris.arff')
- # This will print the attributes names of the iris.arff dataset
- for i in meta:
- print(i)
- # This works too
- meta.names()
- # Getting attribute type
- types = meta.types()
- Notes
- -----
- Also maintains the list of attributes in order, i.e. iterating over a
- MetaData instance (``for i in meta``) yields the attribute names in the
- order they were defined.
- """
- def __init__(self, rel, attr):
- self.name = rel
- # We need the dictionary to be ordered
- # XXX: may be better to implement an ordered dictionary
- self._attributes = {}
- self._attrnames = []
- for name, value in attr:
- tp = parse_type(value)
- self._attrnames.append(name)
- if tp == 'nominal':
- self._attributes[name] = (tp, get_nom_val(value))
- elif tp == 'date':
- self._attributes[name] = (tp, get_date_format(value)[0])
- else:
- self._attributes[name] = (tp, None)
- def __repr__(self):
- msg = ""
- msg += "Dataset: %s\n" % self.name
- for i in self._attrnames:
- msg += "\t%s's type is %s" % (i, self._attributes[i][0])
- if self._attributes[i][1]:
- msg += ", range is %s" % str(self._attributes[i][1])
- msg += '\n'
- return msg
- def __iter__(self):
- return iter(self._attrnames)
- def __getitem__(self, key):
- return self._attributes[key]
- def names(self):
- """Return the list of attribute names."""
- return self._attrnames
- def types(self):
- """Return the list of attribute types."""
- attr_types = [self._attributes[name][0] for name in self._attrnames]
- return attr_types
- def loadarff(f):
- """
- Read an arff file.
- The data is returned as a record array, which can be accessed much like
- a dictionary of numpy arrays. For example, if one of the attributes is
- called 'pressure', then its first 10 data points can be accessed from the
- ``data`` record array like so: ``data['pressure'][0:10]``
- Parameters
- ----------
- f : file-like or str
- File-like object to read from, or filename to open.
- Returns
- -------
- data : record array
- The data of the arff file, accessible by attribute names.
- meta : `MetaData`
- Contains information about the arff file such as name and
- type of attributes, the relation (name of the dataset), etc...
- Raises
- ------
- ParseArffError
- This is raised if the given file is not ARFF-formatted.
- NotImplementedError
- The ARFF file has an attribute which is not supported yet.
- Notes
- -----
- This function should be able to read most arff files. Not
- implemented functionality includes:
- * string type attributes
- * relational type attributes
- * sparse data ({} in the file)
- It can read files with numeric, nominal and date attributes (time
- zones in date attributes are not supported). Missing values (? in the
- file) are supported and are represented as NaN for numeric attributes.
- Examples
- --------
- >>> from scipy.io import arff
- >>> from io import StringIO
- >>> content = \"\"\"
- ... @relation foo
- ... @attribute width numeric
- ... @attribute height numeric
- ... @attribute color {red,green,blue,yellow,black}
- ... @data
- ... 5.0,3.25,blue
- ... 4.5,3.75,green
- ... 3.0,4.00,red
- ... \"\"\"
- >>> f = StringIO(content)
- >>> data, meta = arff.loadarff(f)
- >>> data
- array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
- dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
- >>> meta
- Dataset: foo
- \twidth's type is numeric
- \theight's type is numeric
- \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')
- """
- if hasattr(f, 'read'):
- ofile = f
- else:
- ofile = open(f, 'rt')
- try:
- return _loadarff(ofile)
- finally:
- if ofile is not f: # only close what we opened
- ofile.close()
- def _loadarff(ofile):
- # Parse the header file
- try:
- rel, attr = read_header(ofile)
- except ValueError as e:
- msg = "Error while parsing header, error was: " + str(e)
- raise ParseArffError(msg)
- # Check whether we have a string attribute (not supported yet)
- hasstr = False
- for name, value in attr:
- type = parse_type(value)
- if type == 'string':
- hasstr = True
- meta = MetaData(rel, attr)
- # XXX The following code is not great
- # Build the type descriptor descr and the list of convertors to convert
- # each attribute to the suitable type (which should match the one in
- # descr).
- # This can be used once we want to support integers as integer values and
- # not as numeric anymore (using masked arrays?).
- acls2dtype = {'real': float, 'integer': float, 'numeric': float}
- acls2conv = {'real': safe_float,
- 'integer': safe_float,
- 'numeric': safe_float}
- descr = []
- convertors = []
- if not hasstr:
- for name, value in attr:
- type = parse_type(value)
- if type == 'date':
- date_format, datetime_unit = get_date_format(value)
- descr.append((name, "datetime64[%s]" % datetime_unit))
- convertors.append(partial(safe_date, date_format=date_format,
- datetime_unit=datetime_unit))
- elif type == 'nominal':
- n = maxnomlen(value)
- descr.append((name, 'S%d' % n))
- pvalue = get_nom_val(value)
- convertors.append(partial(safe_nominal, pvalue=pvalue))
- else:
- descr.append((name, acls2dtype[type]))
- convertors.append(safe_float)
- #dc.append(acls2conv[type])
- #sdescr.append((name, acls2sdtype[type]))
- else:
- # How to support strings efficiently? Ideally, we should know the max
- # size of the string before allocating the numpy array.
- raise NotImplementedError("String attributes not supported yet, sorry")
- ni = len(convertors)
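- # For illustration (hypothetical attributes): for
- # [('width', 'numeric'), ('color', '{red,blue}')] the loop above builds
- # descr == [('width', float), ('color', 'S4')] and convertors ==
- # [safe_float, partial(safe_nominal, pvalue=('red', 'blue'))].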
- def generator(row_iter, delim=','):
- # TODO: this is where most of the time is spent (~80%). I think things
- # could be made more efficient:
- # - We could for example "compile" the function, because some values
- # do not change here.
- # - The function to convert a line to dtyped values could also be
- # generated on the fly from a string and be executed instead of
- # looping.
- # - The regexes are overkill: for comments, checking that a line starts
- # with % should be enough and faster, and the same goes for empty lines
- # --> this does not seem to change anything, though.
- # 'compiling' the range since it does not change
- # Note: I have already tried zipping the convertors and
- # row elements and got slightly worse performance.
- elems = list(range(ni))
- for raw in row_iter:
- # We do not abstract skipping comments and empty lines for
- # performance reasons.
- if r_comment.match(raw) or r_empty.match(raw):
- continue
- row = raw.split(delim)
- yield tuple([convertors[i](row[i]) for i in elems])
- a = generator(ofile)
- # No error should happen here: it is a bug otherwise
- data = np.fromiter(a, descr)
- return data, meta
- #-----
- # Misc
- #-----
- def basic_stats(data):
- # ddof=1 gives the unbiased (sample) standard deviation directly,
- # instead of scaling the population std by n / (n - 1).
- return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data, ddof=1)
- def print_attribute(name, tp, data):
- type = tp[0]
- if type == 'numeric' or type == 'real' or type == 'integer':
- min, max, mean, std = basic_stats(data)
- print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std))
- else:
- # join the nominal values directly instead of looping
- print(name + ",{" + ",".join(tp[1]) + "}")
- def test_weka(filename):
- data, meta = loadarff(filename)
- print(len(data.dtype))
- print(data.size)
- for i in meta:
- print_attribute(i, meta[i], data[i])
- # make sure nose does not find this as a test
- test_weka.__test__ = False
- if __name__ == '__main__':
- import sys
- filename = sys.argv[1]
- test_weka(filename)