# Source code for alex.components.nlg.tectotpl.tool.ml.dataset

#!/usr/bin/env python
# coding=utf-8

"""
Data set representation with ARFF input possibility.
"""

from __future__ import unicode_literals
import re
import numpy as np
import scipy.sparse as sp
import copy
from sklearn.datasets.base import Bunch
import math
from alex.components.nlg.tectotpl.core.util import file_stream

__author__ = "Ondřej Dušek"
__date__ = "2012"


class Attribute(object):
    """
    This represents an attribute of the data set.

    Values are handled numerically: for string and nominal attributes,
    `labels` lists the known values and `values` maps each of them to its
    (float) position in `labels`. Both are None for numeric attributes.
    """

    def __init__(self, name, type_spec):
        """
        Initialize an attribute, given its ARFF specification.
        Sets the attribute type, list of labels and list of possible values.

        Raises a TypeError for unsupported ARFF types (anything other than
        numeric/real/integer, string, or a list of nominal values in braces).
        """
        self.name = name
        spec_lower = type_spec.lower()
        # numeric attributes
        if spec_lower in ('numeric', 'real', 'integer'):
            self.type = 'numeric'
            self.labels = None
            self.values = None
        # string attributes
        elif spec_lower == 'string':
            self.type = 'string'
            self.labels = []
            self.values = {}
        # nominal attributes
        elif type_spec.startswith('{'):
            # strip '{', '}', append comma to match last value
            type_spec = type_spec[1:-1] + ','
            self.type = 'nominal'
            self.values = {}
            self.labels = []
            for match in re.finditer(DataSet.DENSE_FIELD, type_spec):
                val = match.group(1)
                # quoted value: strip the quotes, unescape special characters
                if re.match(r'^[\'"].*[\'"]$', val):
                    val = val[1:-1]
                    # the replacement must be a raw string -- a plain '\1'
                    # is the control character U+0001, not a back-reference
                    val = re.sub(r'\\(' + DataSet.SPEC_CHARS + ')', r'\1', val)
                # plain value
                else:
                    val = val.strip()
                self.values[val] = float(len(self.labels))
                self.labels.append(val)
        # other attribute types are not supported
        else:
            raise TypeError('Unsupported attribute type: ' + type_spec)

    def numeric_value(self, value):
        """
        Return a numeric representation of the given value.
        Raise a ValueError if the given value does not conform to the
        attribute type.

        For string attributes, previously unseen values are added to the
        list of known values.
        """
        # parse number for numeric values
        if self.type == 'numeric':
            try:
                return float(value)
            # float() raises TypeError for None and other non-parseable types
            except (TypeError, ValueError):
                raise ValueError('Invalid numeric value "%s" of attribute %s'
                                 % (value, self.name))
        # return value numbers for nominal values
        elif self.type == 'nominal':
            if value not in self.values:
                # %-formatting keeps the message working for non-string values
                raise ValueError('Invalid nominal value "%s" of attribute %s'
                                 % (value, self.name))
            return self.values[value]
        # return values for string attributes, adding new ones is possible
        else:
            if value not in self.values:
                self.values[value] = float(len(self.labels))
                self.labels.append(value)
            return self.values[value]

    def soft_numeric_value(self, value, add_values):
        """
        Same as numeric_value(), but will not raise exceptions for unknown
        nominal/string values. Will either add the value to the list or
        return a NaN (depending on the add_values setting).

        None and NaN are treated as missing values and yield NaN. Values
        of numeric attributes that cannot be converted to a float still
        raise a ValueError.
        """
        # None = NaN
        if value is None or (isinstance(value, float) and math.isnan(value)):
            return float('NaN')
        # return directly or convert for numeric values
        if self.type == 'numeric':
            if isinstance(value, float):
                return value
            try:
                return float(value)
            except (TypeError, ValueError):
                raise ValueError('Invalid numeric value "%s" of attribute %s'
                                 % (value, self.name))
        # return value numbers for nominal/string values,
        # add unseen values to list if add_values == True.
        if value not in self.values:
            if not add_values:
                return float('NaN')
            self.values[value] = float(len(self.labels))
            self.labels.append(value)
        return self.values[value]

    def value(self, numeric_val):
        """
        Given a numeric (int/float) value, returns the corresponding string
        value for string or nominal attributes, or the identical value for
        numeric attributes.
        Returns None for missing nominal/string values, NaN for missing numeric
        values.
        """
        if self.type == 'numeric':
            return numeric_val
        if math.isnan(numeric_val):
            return None
        return self.labels[int(numeric_val)]

    def get_arff_type(self):
        """
        Return the ARFF type of the given attribute (numeric, string or
        list of values for nominal attributes).

        Nominal values are single-quoted and ARFF special characters in them
        are escaped with a backslash.
        """
        if self.type == 'nominal':
            return "{'" + "','".join([re.sub('(' + DataSet.SPEC_CHARS + ')',
                                             r'\\\1', label)
                                      for label in self.labels]) + "'}"
        else:
            return self.type

    def values_set(self):
        """
        Return a set of all possible values for this attribute.
        """
        return set(self.labels)

    @property
    def num_values(self):
        """
        Return the number of distinct values found in this attribute.
        Returns -1 for numeric attributes where the number of values is
        not known.
        """
        if self.type == 'numeric':
            return -1
        else:
            return len(self.labels)

    def __repr__(self):
        """
        This is the same as __str__.
        """
        return self.__str__()

    def __str__(self):
        """
        String representation returns the attribute name and type.
        """
        return self.__class__.__name__ + ': ' + \
                self.name + ' (' + self.type + ')'


[docs]class DataSet(object):
    """
    ARFF relation data representation.
    """

    # Regex matching an ARFF sparse instance field
    SPARSE_FIELD = r'([0-9]+)\s+' + \
                   r'([^"\'\s][^,]*|' + \
                   r'\'[^\']*(\\\'[^\']*)*\'|' + \
                   r'"[^"]*(\\"[^"]*)*"),'
    # Regex matching an ARFF dense instance field
    DENSE_FIELD = r'([^"\'][^,]*|' + \
                  r'\'[^\']*(\\\'[^\']*)*(?<!\\)\'|' + \
                  r'"[^"]*(\\"[^"]*)*(?<!\\)"),'
    # ARFF special characters for regexps
    SPEC_CHARS = r'[\n\r\'"\\\t%]'

    def __init__(self):
        """
        Just initialize the internal data structures (as empty).
        """
        self.relation_name = ''
        self.data = []
        self.inst_weights = []
        self.attribs = []
        self.attribs_by_name = {}
        self.is_sparse = False

    @property
    def is_empty(self):
        """
        Return true if the data structures are empty.
        """
        return not self.relation_name and not self.data and not self.attribs

[docs]    def as_dict(self, mask_attrib=[], select_attrib=[]):
        """
        Return the data as a list of dictionaries, which is useful
        as an input to DictVectorizer.

        Attributes (numbers or indexes) listed in mask_attrib are not
        added to the dictionary. Missing values are also not added to the
        dictionary.
        If mask_attrib is not set but select_attrib is set, only attributes
        listed in select_attrib are added to the dictionary.
        """
        ret = []
        mask_set = self.__get_mask_set(select_attrib, mask_attrib)
        for inst in self.data:
            # find relevant data (different for sparse and dense)
            if self.is_sparse:
                num_vals = zip(inst.rows[0], inst.data[0])
            else:
                num_vals = enumerate(inst)
            # add the data to a dictionary which is appended to the list
            ret.append({self.attribs[attr_num].name:
                        self.attribs[attr_num].value(val)
                        for attr_num, val in num_vals
                        if attr_num not in mask_set and not math.isnan(val)})
        # return the list of all collected dictionaries
        return ret

[docs]    def as_bunch(self, target, mask_attrib=[], select_attrib=[]):
        """
        Return the data as a scikit-learn Bunch object. The target parameter
        specifies the class attribute.
        """
        mask_set = self.__get_mask_set(select_attrib, mask_attrib + [target])
        # prepare the data matrixes
        X = np.empty(shape=(len(self.attribs) - len(mask_set), 0))
        y = np.empty(shape=(1, 0))
        # identify the target attribute
        target = self.attrib_index(target)
        # divide and convert the data to X, y
        if self.data:
            # dense matrix
            if not self.is_sparse:
                y = np.array([inst[target] for inst in self.data])
                X = np.matrix([[val for idx, val in enumerate(inst)
                                if idx not in mask_set]
                               for inst in self.data])
            # sparse matrix
            else:
                y = np.array([inst[0, target] for inst in self.data])
                data_buf = []
                for inst in self.data:
                    filt_inst = sp.csr_matrix([val for idx, val
                                               in enumerate(inst.toarray()[0])
                                               if idx not in mask_set])
                    data_buf.append(filt_inst)
                X = sp.vstack(tuple(data_buf), 'csr')
        # return as Bunch
        return Bunch(data=X,
                     DESCR=self.relation_name,
                     target=y,
                     target_names=self.attribs[target].labels)

[docs]    def load_from_arff(self, filename, encoding='UTF-8'):
        """
        Load an ARFF file/stream, filling the data structures.
        """
        # initialize
        if not self.is_empty:
            raise IOError('Cannot store second data set into the same object.')
        status = 'header'  # we first assume to read the header
        line_num = 1  # line counter
        instances = []
        weights = []
        # open the file
        fh = file_stream(filename, encoding=encoding)
        # parse the file
        for line in fh:
            line = line.strip()
            # skip comments
            if line.startswith('%'):
                continue
            # relation name
            elif line.lower().startswith('@relation'):
                self.relation_name = line.split(None, 1)[1]
            # attribute definition
            elif line.lower().startswith('@attribute'):
                attr_name, attr_type = line.split(None, 2)[1:]
                self.attribs.append(Attribute(attr_name, attr_type))
            # data section start
            elif line.lower().startswith('@data'):
                status = 'data'
            # data lines
            elif status == 'data' and line != '':
                inst, weight = self.__parse_line(line, line_num)
                instances.append(inst)
                weights.append(weight)
            line_num += 1
        fh.close()
        # store the resulting matrix
        self.data = instances
        self.inst_weights = weights
        # remember attribute names
        self.attribs_by_name = {attr.name: idx
                                for idx, attr in enumerate(self.attribs)}

[docs]    def save_to_arff(self, filename, encoding='UTF-8'):
        """
        Save the data set to an ARFF file
        """
        # open the file
        fh = file_stream(filename, 'w', encoding)
        # print the relation name
        print >> fh, '@relation ' + (self.relation_name
                                     if self.relation_name is not None
                                     else '<noname>')
        # print the list of attributes
        for attrib in self.attribs:
            print >> fh, '@attribute ' + attrib.name + ' ' + \
                    attrib.get_arff_type()
        # print instances
        print >> fh, '@data'
        for inst, weight in zip(self.data, self.inst_weights):
            print >> fh, self.__get_arff_line(inst, weight)

[docs]    def load_from_matrix(self, attr_list, matrix):
        """
        Fill in values from a matrix.
        """
        # initialize
        if not self.is_empty:
            raise IOError('Cannot store second data set into the same object.')
        if len(attr_list) != matrix.shape[1]:
            raise ValueError('Number of attributes must' +
                             'correspond to matrix width.')
        # store attribute lists
        self.attribs = copy.deepcopy(attr_list)
        self.attribs_by_name = {attr.name: idx
                                for idx, attr in enumerate(self.attribs)}
        self.is_sparse = sp.issparse(matrix)
        # store data
        if self.is_sparse:
            self.data = [matrix[line, :].tolil()
                         for line in xrange(matrix.shape[0])]
        else:
            self.data = [matrix[line] for line in xrange(matrix.shape[0])]

[docs]    def load_from_vect(self, attrib, vect):
        """
        Fill in values from a vector of values and an attribute (allow adding
        values for nominal attributes).
        """
        # store attribute information
        attrib = copy.deepcopy(attrib)
        self.attribs = [attrib]
        self.attribs_by_name = {attrib.name: 0}
        self.is_sparse = False
        # store the data
        self.data = [[attrib.soft_numeric_value(val, True)] for val in vect]

[docs]    def load_from_dict(self, data, attrib_types={}):
        """
        Fill in values from a list of dictionaries (=instances).
        Attributes are assumed to be of string type unless specified
        otherwise in the attrib_types variable.
        Currently only capable of creating dense data sets.
        """
        if not self.is_empty:
            raise IOError('Cannot store second data set into the same object.')
        self.attribs = []
        self.attribs_by_name = {}
        buf = []
        # prepare 'instances' with stringy values, prepare attributes
        for dict_inst in data:
            inst = [None] * len(self.attribs)
            for attr_name, val in dict_inst.iteritems():
                try:
                    attr = self.get_attrib(attr_name)
                # attribute does not exist, create it
                except:
                    attr = Attribute(attr_name,
                                     attrib_types.get(attr_name, 'string'))
                    self.attribs_by_name[attr_name] = len(self.attribs)
                    self.attribs.append(attr)
                    inst.append(None)
                # add the stringy value to the instance
                idx = self.attrib_index(attr_name)
                inst[idx] = val
            buf.append(inst)
        # convert instances to numeric representation and add to my list
        for str_inst in buf:
            if len(str_inst) < len(self.attribs):
                str_inst += [None] * (len(self.attribs) - len(str_inst))
            inst = [self.get_attrib(idx).soft_numeric_value(val, True)
                    for idx, val in enumerate(str_inst)]
            self.data.append(inst)

[docs]    def attrib_index(self, attrib_name):
        """
        Given an attribute name, return its number. Given a number, return
        precisely that number. Return -1 on failure.
        """
        if isinstance(attrib_name, int):
            return attrib_name
        return self.attribs_by_name.get(attrib_name, -1)

[docs]    def get_attrib(self, attrib):
        """
        Given an attribute name or index, return the Attribute object.
        """
        if isinstance(attrib, basestring):
            attrib = self.attribs_by_name[attrib]
        return self.attribs[attrib]

[docs]    def get_headers(self):
        """
        Return a copy of the headers of this data set (just attributes list,
        relation name and sparse/dense setting)
        """
        ret = DataSet()
        ret.attribs = copy.deepcopy(self.attribs)
        ret.attribs_by_name = copy.deepcopy(self.attribs_by_name)
        ret.data = []
        ret.is_sparse = copy.deepcopy(self.is_sparse)
        ret.relation_name = copy.deepcopy(self.relation_name)
        return ret

[docs]    def attrib_as_vect(self, attrib, dtype=None):
        """
        Return the specified attribute (by index or name) as a list
        of values.
        If the data type parameter is left as default, the type of the returned
        values depends on the attribute type (strings for nominal or string
        attributes, floats for numeric ones). Set the data type parameter to
        int or float to override the data type.
        """
        # convert attribute name to index
        if isinstance(attrib, basestring):
            attrib = self.attrib_index(attrib)
        # default data type: according to the attribute type
        if dtype is None:
            dtype = lambda x: self.attribs[attrib].value(x)
        elif dtype == int:
            dtype = lambda x: int(x) if not math.isnan(x) else None
        # return the values
        if self.is_sparse:
            return [dtype(line[0, attrib]) for line in self.data]
        else:
            return [dtype(line[attrib]) for line in self.data]

[docs]    def rename_attrib(self, old_name, new_name):
        """
        Rename an attribute of this data set (find it by original name or
        by index).
        """
        attr = self.get_attrib(old_name)
        attr.name = new_name

[docs]    def separate_attrib(self, attribs):
        """
        Given a list of attributes, delete them from the data set
        and return them as a new separate data set.
        Accepts a list of names or indexes, or one name, or one index.

        The returned data set has the same sparse/dense setting, a deep copy
        of the instance weights and a relation name extended by '-sep-' and
        the list of separated attributes. This data set is modified in place.
        """
        # NOTE(review): __get_attrib_list is defined elsewhere; the code below
        # uses its results as a sequence of column indexes and a set of the
        # same indexes -- confirm
        attribs, attribs_set = self.__get_attrib_list(attribs)
        # initialize the second data set
        separ = DataSet()
        separ.is_sparse = self.is_sparse
        separ.relation_name = self.relation_name + \
                 '-sep-' + ",".join([str(attrib) for attrib in attribs])
        separ.inst_weights = copy.deepcopy(self.inst_weights)
        # separate columns in sparse matrixes
        if self.is_sparse:
            # cache column shifting (i.e. number of deleted to the left)
            # and new indexes for the separated data
            shifts = {idx: len([a for a in attribs if a < idx])
                      for idx in xrange(len(self.attribs))}
            # for separated columns, the shift is chosen so that subtracting
            # it from the old index gives the position in the new data set
            for sep_idx, old_idx in enumerate(attribs):
                shifts[old_idx] = old_idx - sep_idx
            # separate data in individual instances
            for inst in self.data:
                # find sparse indexes to split-out
                sep_inst = sp.lil_matrix((1, len(attribs)))
                # flags must be computed before the indexes are shifted
                sep_cols = [col in attribs_set for col in inst.rows[0]]
                # shift sparse column indexes
                lshift = np.array([shifts[col] for col in inst.rows[0]])
                inst.rows[0] -= lshift
                # split out the desired columns
                sep_inst.rows[0] = [col for col, sep
                                    in zip(inst.rows[0], sep_cols) if sep]
                inst.rows[0] = [col for col, sep
                                in zip(inst.rows[0], sep_cols) if not sep]
                sep_inst.data[0] = [val for val, sep
                                    in zip(inst.data[0], sep_cols) if sep]
                inst.data[0] = [val for val, sep
                                in zip(inst.data[0], sep_cols) if not sep]
                # update the original instance shape
                # (sets the matrix's private _shape attribute directly)
                inst._shape = (1, len(self.attribs) - len(attribs))
                # add the separated data to the other data set
                separ.data.append(sep_inst)
        # separate columns in dense matrixes
        else:
            for idx, inst in enumerate(self.data):
                # `inst` still refers to the original row after the
                # reassignment, so both halves are read from the same values
                self.data[idx] = [val for col, val in enumerate(inst)
                                  if col not in attribs_set]
                sep_inst = [val for col, val in enumerate(inst)
                            if col in attribs_set]
                separ.data.append(sep_inst)
        # separate metadata
        separ.attribs = [attr for idx, attr in enumerate(self.attribs)
                         if idx in attribs_set]
        self.attribs = [attr for idx, attr in enumerate(self.attribs)
                        if not idx in attribs_set]
        # rebuild the name lookups of both data sets
        separ.attribs_by_name = {attr.name: idx
                                 for idx, attr in enumerate(separ.attribs)}
        self.attribs_by_name = {attr.name: idx
                                for idx, attr in enumerate(self.attribs)}
        return separ

[docs]    def delete_attrib(self, attribs):
        """
        Given a list of attributes, delete them from the data set.
        Accepts a list of names or indexes, or one name, or one index.

        Both the instances and the attribute metadata are updated in place.
        """
        # NOTE(review): __get_attrib_list is defined elsewhere; the code below
        # uses its results as a sequence of column indexes and a set of the
        # same indexes -- confirm
        attribs, attribs_set = self.__get_attrib_list(attribs)
        # delete columns in sparse matrixes
        if self.is_sparse:
            # cache column shifting (i.e. number of deleted to the left)
            lshifts = {idx: len([a for a in attribs if a < idx])
                       for idx in xrange(len(self.attribs))}
            for inst in self.data:
                # find sparse indexes to remove
                # (positions within the row's stored entries, found before
                # the column indexes are shifted)
                rem = [idx for idx, col in enumerate(inst.rows[0])
                       if col in attribs_set]
                # shift sparse column indexes
                lshift = np.array([lshifts[col] for col in inst.rows[0]])
                inst.rows[0] -= lshift
                # remove the desired columns and update the shape
                inst.rows[0] = np.delete(inst.rows[0], rem)
                inst.data[0] = np.delete(inst.data[0], rem)
                # (sets the matrix's private _shape attribute directly)
                inst._shape = (1, len(self.attribs) - len(attribs))
        # delete columns in dense matrixes
        else:
            # NOTE(review): np.delete returns a NumPy array, so dense rows
            # stop being plain lists here -- confirm that later operations
            # (e.g. merge(), which calls extend() on dense rows) cope
            self.data = [np.delete(inst, attribs) for inst in self.data]
        # delete the attributes from metadata
        self.attribs = [attr for idx, attr in enumerate(self.attribs)
                        if not idx in attribs_set]
        self.attribs_by_name = {attr.name: idx
                                for idx, attr in enumerate(self.attribs)}

[docs]    def merge(self, other):
        """
        Merge two DataSet objects. The list of attributes will be concatenated.
        The two data sets must have the same number of instances and
        be either both sparse or both non-sparse.

        Instance weights are left unchanged (from this data set).
        """
        # check compatibility
        if self.is_sparse != other.is_sparse or \
                len(self) != len(other):
            raise ValueError('Data sets are not compatible!')
        # merge instances
        if self.is_sparse:
            for my_inst, other_inst in zip(self.data, other.data):
                my_inst.rows[0].extend([col + len(self.attribs)
                                        for col in other_inst.rows[0]])
                my_inst.data[0].extend(other_inst.data[0])
                my_inst._shape = (1, len(self.attribs) + len(other.attribs))
        else:
            for my_inst, other_inst in zip(self.data, other.data):
                my_inst.extend(other_inst)
        # merge meta data
        self.attribs.extend(other.attribs)
        self.attribs_by_name = {attr.name: idx
                                for idx, attr in enumerate(self.attribs)}
        self.relation_name += '_' + other.relation_name

[docs]    def append(self, other):
        """
        Append instances from one data set to another. Their attributes must
        be compatible (of the same types).
        """
        # sanity checks
        self.__check_headers(other)
        # append the instances
        # update possible values for string and nominal using loose_nominal
        for inst in other.data:
            self.data.append(other.__convert_to_headers(inst, self, True))
            self.inst_weights.extend(copy.deepcopy(other.inst_weights))

[docs]    def add_attrib(self, attrib, values=None):
        """
        Add a new attribute to the data set, with pre-filled values
        (or missing, if not set).
        """
        # create a vector of missing values, if none are given
        if values is None:
            values = [None] * len(self)
        # if values are given, check vector size
        elif len(values) != len(self):
            raise ValueError('The size of the attribute vector must match!')
        # create a temporary data set and merge
        temp = DataSet()
        temp.load_from_vect(attrib, values)
        self.merge(temp)

[docs]    def match_headers(self, other, add_values=False):
        """
        Force this data set to have equal headers as the other data set.
        This cares for different values of nominal/numeric attributes --
        (numeric values will be the same, values unknown in the other data
        set will be set to NaNs).
        In other cases, such as a different number or type of attributes,
        an exception is thrown.
        """
        # sanity checks
        self.__check_headers(other)
        # go through nominal and string attribute values
        for idx, inst in enumerate(self.data):
            self.data[idx] = self.__convert_to_headers(inst, other, add_values)
        # copy the headers from other
        self.attribs = [copy.deepcopy(attr) for attr in other.attribs]

[docs]    def value(self, instance, attr_idx):
        """
        Return the value of the given instance and attribute.
        """
        if isinstance(attr_idx, basestring):
            attr_idx = self.attrib_index(attr_idx)
        attr = self.attribs[attr_idx]
        if self.is_sparse:
            return attr.value(self.data[instance][0, attr_idx])
        return attr.value(self.data[instance][attr_idx])

[docs]    def instance(self, index, dtype='dict', do_copy=True):
        """
        Return the given instance as a dictionary (or a list, if specified).

        If do_copy is set to False, do not create a copy of the list for
        dense instances (other types must be copied anyway).
        """
        inst = self.data[index]
        if dtype == 'list':
            if self.is_sparse:
                return inst.toarray()[0].tolist()
            return copy.deepcopy(inst) if do_copy else inst
        elif dtype == 'dict':
            if self.is_sparse:
                return {self.attribs[attr].name: self.attribs[attr].value(val)
                        for attr, val in zip(inst.rows[0], inst.data[0])}
            return {self.attribs[attr].name: self.attribs[attr].value(val)
                    for attr, val in enumerate(inst)}
        raise ValueError('Unsupported data type')

[docs]    def subset(self, *args, **kwargs):
        """
        Return a data set representing a subset of this data set's values.

        Args can be a slice or [start, ] stop [, stride] to create a slice.
        No arguments result in a complete copy of the original.

        Kwargs may contain just one value -- if copy is set to false,
        the sliced values are removed from the original data set.
        """
        # obtain the real arguments
        if len(args) > 3:
            raise TypeError('Too many arguments')
        elif len(args) == 0:
            indexes = slice(len(self))
        elif len(args) == 1 and isinstance(args[0], slice):
            indexes = args[0]
        else:
            indexes = slice(*args)
        if kwargs.keys() not in [[], ['copy']]:
            raise TypeError('Unsupported keyword arguments')
        keep_copy = kwargs.get('copy', True)
        # copy metadata
        subset = self.__metadata_copy('_slice_' + str(indexes.start) +
                                      '-' + str(indexes.stop) +
                                      '-' + str(indexes.step))
        # copy/move instances
        if keep_copy:
            subset.data = [copy.deepcopy(self.data[idx])
                           for idx in xrange(*indexes.indices(len(self)))]
            subset.inst_weights = [self.inst_weights[idx] for idx
                                   in xrange(*indexes.indices(len(self)))]
        else:
            idxs = range(*indexes.indices(len(self)))
            subset.data = [self.data[idx] for idx in idxs]
            subset.inst_weights = [self.inst_weights[idx] for idx in idxs]
            idxs_set = set(idxs)
            self.data = [self.data[idx] for idx in xrange(len(self))
                         if not idx in idxs_set]
            self.inst_weights = [self.inst_weights[idx] for idx
                                 in xrange(len(self)) if not idx in idxs_set]
        return subset

[docs]    def filter(self, filter_func, keep_copy=True):
        """
        Filter the data set using a filtering function and return a
        filtered data set.

        The filtering function must take two arguments - current instance
        index and the instance itself in an attribute-value dictionary
        form - and return a boolean.

        If keep_copy is set to False, filtered instances will be removed from
        the original data set.
        """
        filtered = self.__metadata_copy('_filtered')
        filt_res = [filter_func(idx, self.instance(idx))
                    for idx in xrange(len(self))]
        true_idxs = [idx for idx, res in enumerate(filt_res) if res]
        if keep_copy:
            filtered.data = [copy.deepcopy(self.data[idx])
                             for idx in true_idxs]
            filtered.inst_weights = [self.inst_weights[idx]
                                     for idx in true_idxs]
        else:
            false_idxs = [idx for idx, res in enumerate(filt_res) if not res]
            data_true = [self.data[idx] for idx in true_idxs]
            weights_true = [self.inst_weights[idx] for idx in true_idxs]
            data_false = [self.data[idx] for idx in false_idxs]
            weights_false = [self.inst_weights[idx] for idx in false_idxs]
            self.data = data_false
            self.inst_weights = weights_false
            filtered.data = data_true
            filtered.inst_weights = weights_true
        return filtered

[docs]    def split(self, split_func, keep_copy=True):
        """
        Split the data set using a splitting function and return a dictionary
        where keys are different return values of the splitting function and
        values are data sets containing instances which yield the respective
        splitting function return values.

        The splitting function takes two arguments - the current instance index
        and the instance itself as an attribute-value dictionary. Its return
        value determines the split.

        If keep_copy is set to False, ALL instances will be removed from
        the original data set.
        """
        ret = {}
        for idx in xrange(len(self)):
            key = split_func(idx, self.instance(idx))
            if not key in ret:
                ret[key] = self.__metadata_copy('_split_' + key)
            if keep_copy:
                ret[key].data.append(self.data[idx])
            else:
                ret[key].data.append(copy.deepcopy(self.data[idx]))
        if not keep_copy:
            self.data = []
        return ret

    def __parse_line(self, line, line_num):
        """"
        Parse one ARFF data line (dense or sparse, return appropriate
        array).
        """
        # check weight, if needed
        weight = 1.0
        match_weight = re.search(r',\s*\{([0-9]+(\.[0-9]*)?|\.[0-9]+)\}$',
                                 line)
        if match_weight:
            weight = float(match_weight.group(1))
            line = re.sub(r',\s*\{[^\{\}]+\}$', '', line)
        # sparse instance
        if line.startswith('{'):
            self.is_sparse = True  # trigger sparseness
            line = line.strip('{}') + ','  # append comma to match last value
            values = np.zeros(len(self.attribs))
            # capture all fields
            for match in re.finditer(self.SPARSE_FIELD, line):
                # extract index and value
                idx, val = match.group(1, 2)
                idx = int(idx)
                # undefined value
                if val == '?':
                    values[idx] = float('NaN')
                # quoted value
                elif re.match(r'^[\'"].*[\'"]$', val):
                    val = val[1:-1]
                    val = re.sub(r'\\(' + self.SPEC_CHARS + ')', r'\1', val)
                    values[idx] = self.__get_numeric_value(idx, val, line_num)
                # plain value
                else:
                    val = val.strip()
                    values[idx] = self.__get_numeric_value(idx, val, line_num)
            # return in sparse format
            return sp.lil_matrix(values), weight
        # dense instance
        else:
            values = []
            line += ','  # append comma to match last value
            for match in re.finditer(self.DENSE_FIELD, line):
                val = match.group(1)
                # undefined value
                if val == '?':
                    values.append(float('NaN'))
                # quoted value
                elif re.match(r'^[\'"].*[\'"]$', val):
                    val = val[1:-1]
                    val = re.sub(r'\\(' + self.SPEC_CHARS + ')', r'\1', val)
                    values.append(self.__get_numeric_value(len(values),
                                                           val, line_num))
                # plain value
                else:
                    val = val.strip()
                    values.append(self.__get_numeric_value(len(values),
                                                           val, line_num))
            return values, weight

    def __get_attrib_list(self, attribs):
        """
        Convert the given list of names or indexes, or one name, or one index
        to a list and a set of indexes.
        """
        if isinstance(attribs, list):
            attribs = [self.attrib_index(a) if isinstance(a, basestring) else a
                       for a in attribs]
        elif isinstance(attribs, basestring):
            attribs = [self.attrib_index(attribs)]
        elif isinstance(attribs, int):
            attribs = [attribs]
        # cache set of attributes to be deleted
        attribs_set = set(attribs)
        return attribs, attribs_set

    def __check_headers(self, other):
        """
        Sanity check for appending / headers matching. Checks if the data sets
        have the same number of attributes and if the attributes are of the
        same type. Same values for numeric/string attributes are not required.
        """
        if len(self.attribs) != len(other.attribs):
            raise ValueError('Data sets have different numbers of attributes!')
        for my_attr, other_attr in zip(self.attribs, other.attribs):
            if my_attr.type != other_attr.type:
                raise ValueError('Attributes ' + my_attr + ' and ' +
                                 other_attr + ' must be of the same type!')

    def __convert_to_headers(self, inst, other, add_values):
        """
        Convert numeric values for an instance to match the string/nominal
        headers of the given data set. Returns a new instance (dense or
        sparse).
        """
        if other.is_sparse:
            # convert through dense as 0 may have a different meaning
            vals = [self.attribs[col].value(val)
                    for col, val in enumerate(inst.toarray()[0])]
            vals = [other.attribs[col].soft_numeric_value(val, add_values)
                    for col, val in enumerate(vals)]
            new_inst = sp.lil_matrix((1, len(other.attribs)))
            new_inst.rows[0] = [col for col, val in enumerate(vals)
                                if val != 0]
            new_inst.data[0] = [val for col, val in enumerate(vals)
                                if val != 0]
            return new_inst
        # dense data sets
        else:
            vals = [self.attribs[col].value(val)
                    for col, val in enumerate(inst)]
            return [other.attribs[col].soft_numeric_value(val, add_values)
                              for col, val in enumerate(vals)]

    def __get_numeric_value(self, attr_num, value, line_num):
        """
        Return the attribute value as a float,
        i.e. convert the string value to number for numeric attributes,
        look up the value number for nominal ones and keep track of possible
        values for string attributes.
        """
        if attr_num >= len(self.attribs):
            raise TypeError('Attribute number ' + str(attr_num) +
                            ' out of range on line ' + str(line_num))
        attr = self.attribs[attr_num]
        try:
            return attr.numeric_value(value)
        except ValueError as e:
            raise ValueError(e.message + ' on line ' + str(line_num))

    def __get_arff_line(self, inst, weight=1.0):
        """
        Return a sparse or a dense ARFF data line
        """
        if self.is_sparse:
            ret = "{" + ",".join([str(int(idx)) + ' ' +
                                  self.__get_arff_val(idx, attr)
                                  for idx, attr in zip(inst.rows[0],
                                                       inst.data[0])]) + '}'
        else:
            ret = ",".join([self.__get_arff_val(idx, attr)
                            for idx, attr in enumerate(inst)])
        if weight != 1.0:
            ret += ', {' + str(weight) + '}'
        return ret

    def __get_arff_val(self, attr_num, value):
        """
        Return an ARFF-output safe value.
        """
        # missing values
        if math.isnan(value):
            return '?'
        # numeric values
        if self.attribs[attr_num].type == 'numeric':
            return str(value)
        # stringy values
        else:
            value = self.attribs[attr_num].value(value)
            # decide if it needs to be quoted
            quote = False
            if value == '' or \
                    re.search('(' + self.SPEC_CHARS + '|[{}?, ])', value):
                quote = True
            # backslash for special chars
            value = re.sub('(' + self.SPEC_CHARS + ')', r'\\\1', value)
            # return the result (quoted or not)
            return value if not quote else "'" + value + "'"

    def __metadata_copy(self, add_to_name=''):
        """
        Returns a copy of this data set with no instances.
        Adds the specified string to the name if required.
        """
        my_copy = DataSet()
        my_copy.is_sparse = self.is_sparse
        my_copy.attribs = copy.deepcopy(self.attribs)
        my_copy.attribs_by_name = copy.deepcopy(self.attribs_by_name)
        my_copy.relation_name = self.relation_name + add_to_name
        my_copy.data = []
        return my_copy

    def __get_mask_set(self, select_attrib, mask_attrib):
        """
        Given a list of specifically selected or specifically masked
        attributes, this returns the set of attributes to avoid.
        """
        deselect_set = set()
        mask_set = set()
        if select_attrib:
            select_attrib, select_set = self.__get_attrib_list(select_attrib)
            deselect_set = set(range(len(self.attribs))) - select_set
        if mask_attrib:
            mask_attrib, mask_set = self.__get_attrib_list(mask_attrib)
        return mask_set | deselect_set

    def __len__(self):
        """
        Return the number of instances in this data set.
        """
        return len(self.data)

    def __getitem__(self, key):
        """
        This supports access to individual instances by index (will
        be returned as a dict), to individual attributes (returned as
        vector of values) or slicing and filtering (see subset() and
        filter()).
        """
        # tuple: return the value given by the coordinates
        if isinstance(key, tuple) and len(key) == 2 and \
                isinstance(key[0], int) and (isinstance(key[1], int) or
                                             isinstance(key[1], basestring)):
            return self.value(*key)
        # one number: return one element
        elif isinstance(key, int):
            return self.instance(key)
        # string: return attribute as vector
        elif isinstance(key, basestring):
            return self.attrib_as_vect(key)
        # slicing
        elif isinstance(key, slice):
            return self.subset(key)
        # filtering
        elif hasattr(key, '__call__'):
            return self.filter(key)
        raise ValueError('Unsupported index type!')

    def __repr__(self):
        """
        This is the same as __str__.
        """
        return self.__str__()

    def __str__(self):
        """
        String representation returns the relation name, number of
        attributes and instances.
        """
        ret = self.__class__.__name__ + ': '
        if self.is_empty:
            return ret + 'empty'
        ret += 'relation ' + (self.relation_name
                              if self.relation_name is not None
                              else '<noname>') + ': '
        ret += ('sparse' if self.is_sparse else 'dense') + ', ' + \
                str(len(self.attribs)) + ' attributes, ' + \
                str(len(self)) + ' instances.'
        return ret

    def __iter__(self):
        """
        Iterating over a data set goes through its instances; a new
        DataSetIterator is created for each iteration.
        """
        cursor = DataSetIterator(self)
        return cursor


class DataSetIterator(object):
    """
    An iterator over the instances of a data set.

    The instances are obtained one by one using the instance() method of
    the data set, starting at index 0; the iteration ends as soon as
    instance() raises an IndexError.
    """

    def __init__(self, dataset):
        """
        Initialize pointing at the beginning.
        """
        self.dataset = dataset
        # index of the instance to be returned next
        self.pos = 0

    def __iter__(self):
        """
        An iterator is its own iterator.
        """
        return self

    def next(self):
        """
        Return the current instance and move to the next one.
        Raises StopIteration after the last instance.
        """
        try:
            res = self.dataset.instance(self.pos)
        except IndexError:
            raise StopIteration
        # advance only after a successful read
        self.pos += 1
        return res

    # the same method under the name used by the Python 3 iterator protocol
    __next__ = next