"""
Module for creating datasets from distinct sources of data.
"""
from skimage import io
import os
import numpy as np
import pandas as pd
try:
    import cPickle as pickle
except ImportError:
    import pickle
import random
import h5py
import logging
import datetime
import uuid
from ml.processing import Transforms
from ml.utils.config import get_settings
settings = get_settings("ml")
logging.basicConfig()
console = logging.StreamHandler()
console.setLevel(logging.WARNING)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
log.addHandler(console)
def save_metadata(file_path, data):
with open(file_path, 'wb') as f:
pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
def load_metadata(path):
try:
with open(path, 'rb') as f:
data = pickle.load(f)
return data
except IOError:
return {}
def calc_nshape(data, value):
    if data is None:
        return data
    if value is None or not (0 < value <= 1):
        value = 1
    limit = int(round(data.shape[0] * value, 0))
    return data[:limit]
class ReadWriteData(object):
    def auto_dtype(self, data, ttype):
        if ttype == "auto":
            return data.dtype if data is not None else "float64"
        else:
            return np.dtype(ttype)
def _set_space_shape(self, f, name, shape, label=False):
dtype = self.auto_dtype(None, self.dtype) if label is False else self.auto_dtype(None, self.ltype)
f['data'].create_dataset(name, shape, dtype=dtype, chunks=True, **self.zip_params)
def _set_space_data(self, f, name, data, label=False):
dtype = self.auto_dtype(data, self.dtype) if label is False else self.auto_dtype(data, self.ltype)
f['data'].create_dataset(name, data.shape, dtype=dtype, data=data, chunks=True, **self.zip_params)
def _set_data(self, f, name, data):
key = '/data/' + name
f[key] = data
def _get_data(self, name):
if not hasattr(self, 'f'):
self.f = h5py.File(self.url(), 'r')
key = '/data/' + name
return self.f[key]
def _set_attr(self, name, value):
while True:
try:
with h5py.File(self.url(), 'r+') as f:
f.attrs[name] = value
break
except IOError:
self.f.close()
del self.f
def _get_attr(self, name):
try:
with h5py.File(self.url(), 'r') as f:
return f.attrs[name]
except KeyError:
return None
except IOError:
log.debug("Error found in file {}".format(self.url()))
return None
def chunks_writer(self, f, name, data, chunks=128, init=0):
from ml.utils.seq import grouper_chunk
end = init
for row in grouper_chunk(chunks, data):
seq = np.asarray(list(row))
end += seq.shape[0]
#print("init:{}, end:{}, shape:{}, chunks:{}".format(init, end, seq.shape, chunks))
f[name][init:end] = seq
init = end
return end
class DataLabel(ReadWriteData):
"""
    Base class for dataset construction. Gets data from memory and
    creates the initial values for the dataset.
    :type name: string
    :param name: dataset's name
    :type dataset_path: string
    :param dataset_path: path where the dataset is saved. This param is automatically set by the settings.cfg file.
    :type transforms: transform instance
    :param transforms: list of transforms
    :type apply_transforms: bool
    :param apply_transforms: apply transformations to the data
    :type dtype: string
    :param dtype: the type of the data to save
    :type ltype: string
    :param ltype: the type of the labels to save
    :type description: string
    :param description: a brief description of the dataset
    :type author: string
    :param author: dataset author's name
    :type compression_level: int
    :param compression_level: number in the 0-9 range. If 0 is passed, no compression is applied.
    :type chunks: int
    :param chunks: number of chunks to use when the dataset is copied or defragmented
    :type rewrite: bool
    :param rewrite: if True, the saved data can be cleaned and a new dataset added
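
    Example (a minimal sketch; the name, data, and labels are illustrative)::

        import numpy as np

        dl = DataLabel(name="example_dl", dtype='float64', ltype='|S1')
        dl.build_dataset(np.random.rand(10, 2), np.asarray(["a"] * 10))
        print(dl.shape)
        dl.close_reader()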
"""
def __init__(self, name=None,
dataset_path=None,
transforms=None,
apply_transforms=True,
dtype='float64',
ltype='|S1',
description='',
author='',
compression_level=0,
chunks=100,
rewrite=True):
self.name = name
self._applied_transforms = False
self.chunks = chunks
self.rewrite = rewrite
if dataset_path is None:
self.dataset_path = settings["dataset_path"]
else:
self.dataset_path = dataset_path
if transforms is None:
transforms = Transforms()
if not self._preload_attrs() or self.rewrite is True:
self.apply_transforms = apply_transforms
self.author = author
self.description = description
self.compression_level = compression_level
self.dtype = dtype
self.ltype = ltype
self.transforms = transforms
self.mode = "w"
else:
self.mode = "r"
@property
def data(self):
"""
        return the data in the dataset
"""
return self._get_data('data')
@property
def labels(self):
"""
return the labels in the dataset
"""
return self._get_data('labels')
def url(self):
"""
        return the path where the dataset is saved
"""
return os.path.join(self.dataset_path, self.name)
def num_features(self):
"""
return the number of features of the dataset
"""
return self.data.shape[1]
@property
def shape(self):
"return the shape of the dataset"
return self.data.shape
def labels_info(self):
"""
return a counter of labels
"""
from collections import Counter
counter = Counter(self.labels)
return counter
def only_labels(self, labels):
"""
:type labels: list
:param labels: list of labels
        return a tuple of arrays with data and labels; the returned data contains only the selected labels.
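
        Example (illustrative; assumes the labels "1" and "2" exist in the dataset)::

            data, labels = dl.only_labels(["1", "2"])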
"""
try:
dl = self.desfragment()
s_labels = set(labels)
dataset, n_labels = zip(*filter(lambda x: x[1] in s_labels, zip(dl.data, dl.labels)))
dl.destroy()
except ValueError:
label = labels[0] if len(labels) > 0 else None
log.warning("label {} is not found in the labels set".format(label))
return np.asarray([]), np.asarray([])
return np.asarray(dataset), np.asarray(n_labels)
def desfragment(self):
"""
Concatenate the train, valid and test data in a data array.
Concatenate the train, valid, and test labels in another array.
return DataLabel
"""
return self.copy()
def type_t(self, ttype, data):
"""
        :type ttype: string
        :param ttype: name of the type to convert the data to. If ttype is 'auto'
        the data is returned without being converted.
        :type data: array
        :param data: data to be converted
        convert the data to the specified ttype.
"""
if ttype == 'auto':
return data
ttype = np.dtype(ttype)
        if data.dtype != ttype and data.dtype != np.object:
return data.astype(ttype)
else:
return data
def dtype_t(self, data):
"""
        :type data: ndarray
        :param data: ndarray to cast
cast the data to the predefined dataset dtype
"""
return self.type_t(self.dtype, data)
def ltype_t(self, labels):
"""
        :type labels: ndarray
        :param labels: ndarray to cast
cast the labels to the predefined dataset ltype
"""
return self.type_t(self.ltype, labels)
def _open_attrs(self):
self.create_route()
f = h5py.File(self.url(), 'w')
f.attrs['path'] = self.url()
f.attrs['timestamp'] = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M UTC")
f.attrs['author'] = self.author
f.attrs['transforms'] = self.transforms.to_json()
f.attrs['description'] = self.description
f.attrs['applied_transforms'] = self.apply_transforms
f.attrs['dtype'] = self.dtype
f.attrs['ltype'] = self.ltype
f.attrs['compression_level'] = self.compression_level
if 0 < self.compression_level <= 9:
self.zip_params = {"compression": "gzip", "compression_opts": self.compression_level}
else:
self.zip_params = {}
f.create_group("data")
return f
def _preload_attrs(self):
try:
with h5py.File(self.url(), 'r') as f:
self.author = f.attrs['author']
self.transforms = Transforms.from_json(f.attrs['transforms'])
self.description = f.attrs['description']
self.apply_transforms = f.attrs['applied_transforms']
self.dtype = f.attrs['dtype']
self.ltype = f.attrs['ltype']
self.compression_level = f.attrs['compression_level']
if self.md5() is None:
return False
except KeyError:
return False
except IOError:
return False
else:
return True
def info(self, classes=False):
"""
:type classes: bool
:param classes: if true, print the detail of the labels
This function print the details of the dataset.
"""
from ml.utils.order import order_table_print
print(' ')
print('DATASET NAME: {}'.format(self.name))
print('Author: {}'.format(self.author))
print('Transforms: {}'.format(self.transforms.to_json()))
print('Applied transforms: {}'.format(self.apply_transforms))
print('MD5: {}'.format(self.md5()))
print('Description: {}'.format(self.description))
print(' ')
headers = ["Dataset", "Mean", "Std", "Shape", "dType", "Labels"]
table = []
table.append(["dataset", self.data[:].mean(), self.data[:].std(),
self.data.shape, self.data.dtype, self.labels.size])
def is_binary(self):
"""
        return True if the labels have exactly two classes
"""
return len(self.labels_info()) == 2
def calc_md5(self):
"""
calculate the md5 from the data.
"""
import hashlib
dl = self.desfragment()
h = hashlib.md5(dl.data[:])
dl.destroy()
return h.hexdigest()
def md5(self):
"""
return the signature of the dataset in hex md5
"""
return self._get_attr("md5")
def distinct_data(self):
"""
        return the ratio of distinct elements in the training data,
        e.g.
        [1,2,3,4,5] returns 5/5
        [2,2,2,2,2] returns 1/5
"""
        if self.data.dtype != np.object:
data = self.data[:].reshape(self.data.shape[0], -1)
else:
data = np.asarray([row.reshape(1, -1)[0] for row in self.data])
y = set((elem for row in data for elem in row))
return float(len(y)) / data.size
def sparcity(self):
"""
        return a value between [0, 1] measuring the sparsity of the dataset:
        0 means no zeros exist, 1 means all the data is zero.
"""
        if self.data.dtype != np.object:
data = self.data[:].reshape(self.data.shape[0], -1)
else:
data = np.asarray([row.reshape(1, -1)[0] for row in self.data])
zero_counter = 0
total = 0
for row in data:
for elem in row:
if elem == 0:
zero_counter += 1
total += 1
return float(zero_counter) / total
def build_dataset_from_dsb(self, dsb):
"""
        Transform a dataset with train, test, and validation sets into a DataLabel dataset.
"""
if self.mode == "r":
return
f = self._open_attrs()
labels_shape = tuple(dsb.shape[0:1] + dsb.train_labels.shape[1:])
self._set_space_shape(f, "data", dsb.shape)
self._set_space_shape(f, "labels", labels_shape, label=True)
end = self.chunks_writer(f, "/data/data", dsb.train_data, chunks=self.chunks)
end = self.chunks_writer(f, "/data/data", dsb.test_data, chunks=self.chunks,
init=end)
self.chunks_writer(f, "/data/data", dsb.validation_data, chunks=self.chunks,
init=end)
end = self.chunks_writer(f, "/data/labels", dsb.train_labels, chunks=self.chunks)
end = self.chunks_writer(f, "/data/labels", dsb.test_labels, chunks=self.chunks,
init=end)
self.chunks_writer(f, "/data/labels", dsb.validation_labels, chunks=self.chunks,
init=end)
f.close()
def build_dataset(self, data, labels):
"""
build a datalabel dataset from data and labels
"""
f = self._open_attrs()
data = self.processing(data, initial=True)
self._set_space_data(f, 'data', self.dtype_t(data))
self._set_space_data(f, 'labels', self.ltype_t(labels), label=True)
f.close()
self._set_attr("md5", self.calc_md5())
def create_route(self):
"""
create directories if the dataset_path does not exist
"""
if self.dataset_path is not None:
if not os.path.exists(self.dataset_path):
os.makedirs(self.dataset_path)
def destroy(self):
"""
        delete the corresponding hdf5 file
"""
from ml.utils.files import rm
self.close_reader()
rm(self.url())
log.debug("rm {}".format(self.url()))
def convert(self, name, dtype='float64', ltype='|S1', apply_transforms=False,
percentaje=1):
"""
        :type name: string
        :param name: converted dataset's name
        :type dtype: string
        :param dtype: cast the data to the defined type
        :type ltype: string
        :param ltype: cast the labels to the defined type
        :type apply_transforms: bool
        :param apply_transforms: apply the transforms to the data
        :type percentaje: float
        :param percentaje: value between [0, 1]; fraction of the data to convert
        dataset_path does not need to be specified; it is obtained from the settings.cfg file.
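
        Example (illustrative; the name is arbitrary)::

            dl32 = dl.convert("dl_float32", dtype='float32', ltype='|S1')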
"""
dl = DataLabel(name=name,
dataset_path=self.dataset_path,
transforms=self.transforms,
apply_transforms=apply_transforms,
dtype=dtype,
ltype=ltype,
description=self.description,
author=self.author,
compression_level=self.compression_level,
chunks=self.chunks,
rewrite=self.rewrite)
dl._applied_transforms = self.apply_transforms
dl.build_dataset(calc_nshape(self.data, percentaje), calc_nshape(self.labels, percentaje))
dl.close_reader()
return dl
def copy(self, percentaje=1):
"""
        :type percentaje: float
        :param percentaje: value between [0, 1] representing the fraction of the dataset to copy
        copy the dataset; percentaje determines the size of the copy
"""
name = self.name + "_copy_" + str(percentaje)
dl = self.convert(name, dtype=self.dtype, ltype=self.ltype,
apply_transforms=self.apply_transforms,
percentaje=percentaje)
return dl
def processing(self, data, initial=True):
"""
:type data: array
:param data: data to transform
:type initial: bool
        :param initial: if multirow transforms are added, this parameter
        indicates the initial data fit
        execute the transformations on the data.
"""
data = self.processing_rows(data)
#if init is True:
# return self.processing_global(data, base_data=data)
#elif init is False and not self.transforms.empty('global'):
# base_data, _ = self.desfragment()
# return self.processing_global(data, base_data=base_data)
#else:
# return data
return data
def close_reader(self):
"""
        close the hdf5 file. Once closed, no more data can be retrieved.
"""
if hasattr(self, 'f'):
self.f.close()
del self.f
def processing_rows(self, data):
"""
:type data: array
:param data: data to be transformed
each row is transformed with the transformations defined.
"""
if not self.transforms.empty() and self.transforms_to_apply and data is not None:
log.debug("Apply transforms")
return np.asarray([self.transforms.apply(row) for row in data])
else:
log.debug("No transforms applied")
return data if isinstance(data, np.ndarray) else np.asarray(data)
@property
def transforms_to_apply(self):
return self.apply_transforms and self._applied_transforms is False
    @classmethod
    def to_DF(cls, dataset, labels):
        if len(dataset.shape) > 2:
            dataset = dataset.reshape(dataset.shape[0], -1)
        columns_name = ["c" + str(i) for i in range(dataset.shape[-1])] + ["target"]
        return pd.DataFrame(data=np.column_stack((dataset, labels)), columns=columns_name)
def to_df(self):
"""
        convert the dataset to a pandas DataFrame
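
        Example (illustrative)::

            df = dl.to_df()
            print(df.columns)  # c0, c1, ..., target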
"""
dl = self.desfragment()
df = self.to_DF(dl.data[:], dl.labels[:])
dl.destroy()
return df
def add_transforms(self, name, transforms):
"""
:type name: string
:param name: result dataset's name
:type transforms: Transform
:param transforms: transforms to apply in the new dataset
"""
if self.apply_transforms is True:
dsb_c = self.copy()
dsb_c.apply_transforms = False
dsb_c.transforms = transforms
dsb = dsb_c.convert(name, dtype=self.dtype, ltype=self.ltype,
apply_transforms=True, percentaje=1)
dsb_c.destroy()
dsb.transforms = self.transforms + transforms
else:
dsb = self.copy()
dsb.transforms += transforms
return dsb
class DataSetBuilder(DataLabel):
"""
    Base class for dataset construction with train, test, and validation splits.
    Gets data from memory and creates the initial values for the dataset.
    :type name: string
    :param name: dataset's name
    :type dataset_path: string
    :param dataset_path: path where the dataset is saved. This param is automatically set by the settings.cfg file.
    :type apply_transforms: bool
    :param apply_transforms: apply transformations to the data
    :type transforms: transform instance
    :param transforms: list of transforms
    :type train_size: float
    :param train_size: value between [0, 1] that determines the size of the train data
    :type valid_size: float
    :param valid_size: value between [0, 1] that determines the size of the validation data
    :type validator: string
    :param validator: name of the method used to split the data into train, test, and validation sets
    :type dtype: string
    :param dtype: the type of the data to save
    :type ltype: string
    :param ltype: the type of the labels to save
    :type description: string
    :param description: a brief description of the dataset
    :type author: string
    :param author: dataset author's name
    :type compression_level: int
    :param compression_level: number in the 0-9 range. If 0 is passed, no compression is applied.
    :type rewrite: bool
    :param rewrite: if True, the saved data can be cleaned and a new dataset added
    :type chunks: int
    :param chunks: number of chunks to use when the dataset is copied or defragmented
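
    Example (a minimal sketch; the name, data, and labels are illustrative)::

        import numpy as np

        dsb = DataSetBuilder(name="example_dsb", validator='cross',
                             train_size=.7, valid_size=.1)
        dsb.build_dataset(np.random.rand(100, 10), np.asarray(["0", "1"] * 50))
        dsb.info()
        dsb.close_reader()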
"""
def __init__(self, name=None,
dataset_path=None,
apply_transforms=True,
transforms=None,
train_size=.7,
valid_size=.1,
validator='cross',
dtype='float64',
ltype='|S1',
description='',
author='',
compression_level=0,
chunks=100,
rewrite=False):
self.name = name
self._applied_transforms = False
self.chunks = chunks
self.rewrite = rewrite
if dataset_path is None:
self.dataset_path = settings["dataset_path"]
else:
self.dataset_path = dataset_path
if transforms is None:
transforms = Transforms()
if not self._preload_attrs() or self.rewrite is True:
self.dtype = dtype
self.ltype = ltype
self.transforms = transforms
self.valid_size = valid_size
self.train_size = train_size
self.test_size = round(1 - (train_size + valid_size), 2)
self.apply_transforms = apply_transforms
self.validator = validator
self.author = author
self.description = description
self.compression_level = compression_level
self.mode = "w"
else:
self.mode = "r"
@property
def train_data(self):
return self._get_data('train_data')
@property
def train_labels(self):
return self._get_data('train_labels')
@property
def test_data(self):
return self._get_data('test_data')
@property
def test_labels(self):
return self._get_data('test_labels')
@property
def validation_data(self):
return self._get_data('validation_data')
@property
def validation_labels(self):
return self._get_data('validation_labels')
@property
def data(self):
return self.train_data
@property
def labels(self):
return self.train_labels
@property
def shape(self):
"return the shape of the dataset"
rows = self.train_data.shape[0] + self.test_data.shape[0] +\
self.validation_data.shape[0]
if self.train_data.dtype != np.object:
return tuple([rows] + list(self.train_data.shape[1:]))
else:
return (rows,)
def _open_attrs(self):
f = super(DataSetBuilder, self)._open_attrs()
f.attrs["validator"] = self.validator
f.attrs["train_size"] = self.train_size
f.attrs["valid_size"] = self.valid_size
return f
def _preload_attrs(self):
try:
with h5py.File(self.url(), 'r') as f:
self.author = f.attrs['author']
self.transforms = Transforms.from_json(f.attrs['transforms'])
self.description = f.attrs['description']
self.apply_transforms = f.attrs['applied_transforms']
self.dtype = f.attrs['dtype']
self.ltype = f.attrs['ltype']
self.compression_level = f.attrs['compression_level']
self.validator = f.attrs["validator"]
self.train_size = f.attrs["train_size"]
self.valid_size = f.attrs["valid_size"]
if self.md5() is None:
return False
except KeyError:
return False
except IOError:
return False
else:
return True
    def desfragment(self):
"""
Concatenate the train, valid and test data in a data array.
Concatenate the train, valid, and test labels in another array.
        return a DataLabel
"""
id_ = uuid.uuid4().hex
dl = DataLabel(
name=self.name+id_,
dataset_path=self.dataset_path,
transforms=self.transforms,
apply_transforms=self.apply_transforms,
dtype=self.dtype,
ltype=self.ltype,
description=self.description,
author=self.author,
compression_level=self.compression_level)
dl.build_dataset_from_dsb(self)
return dl
    def info(self, classes=False):
"""
:type classes: bool
:param classes: if true, print the detail of the labels
This function print the details of the dataset.
"""
from ml.utils.order import order_table_print
print(' ')
print('DATASET NAME: {}'.format(self.name))
print('Author: {}'.format(self.author))
print('Transforms: {}'.format(self.transforms.to_json()))
print('Applied transforms: {}'.format(self.apply_transforms))
print('MD5: {}'.format(self.md5()))
print('Description: {}'.format(self.description))
print(' ')
if self.train_data.dtype != np.object:
headers = ["Dataset", "Mean", "Std", "Shape", "dType", "Labels"]
table = []
table.append(["train set", self.train_data[:].mean(), self.train_data[:].std(),
self.train_data.shape, self.train_data.dtype, self.train_labels.size])
if self.validation_data is not None:
table.append(["valid set", self.validation_data[:].mean(), self.validation_data[:].std(),
self.validation_data.shape, self.validation_data.dtype, self.validation_labels.size])
table.append(["test set", self.test_data[:].mean(), self.test_data[:].std(),
self.test_data.shape, self.test_data.dtype, self.test_labels.size])
order_table_print(headers, table, "shape")
else:
headers = ["Dataset", "Shape", "dType", "Labels"]
table = []
table.append(["train set", self.train_data.shape, self.train_data.dtype,
self.train_labels.size])
            if self.validation_data is not None:
                table.append(["valid set", self.validation_data.shape, self.validation_data.dtype,
                    self.validation_labels.size])
table.append(["test set", self.test_data.shape, self.test_data.dtype,
self.test_labels.size])
order_table_print(headers, table, "shape")
if classes is True:
headers = ["class", "# items"]
order_table_print(headers, self.labels_info().items(), "# items")
def cross_validators(self, data, labels):
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
data, labels, train_size=round(self.train_size+self.valid_size, 2), random_state=0)
if isinstance(data, list):
size = len(data)
else:
size = data.shape[0]
valid_size_index = int(round(size * self.valid_size))
X_validation = X_train[:valid_size_index]
y_validation = y_train[:valid_size_index]
X_train = X_train[valid_size_index:]
y_train = y_train[valid_size_index:]
return X_train, X_validation, X_test, y_train, y_validation, y_test
    def adversarial_validator(self, train_data, train_labels, test_data, test_labels):
        # the train class is labeled as 1, the test class as 0
        train_test_data_clf = self._clf(train_data, train_labels, test_data, test_labels)
        train_test_data_clf.train()
        class_train = np.argmax(train_test_data_clf.model.classes_)
        predictions = [p[class_train]
            for p in train_test_data_clf.predict(train_data, raw=True, transform=False)]
        predictions = sorted(enumerate(predictions), key=lambda x: x[1], reverse=False)
        # because all the targets are ones (train data), no comparison is necessary:
        # rows predicted below .5 look like test data and become validation candidates
        false_test = [index for index, pred in predictions if pred < .5]
        ok_train = [index for index, pred in predictions if pred >= .5]
        valid_data = train_data[false_test]
        valid_labels = train_labels[false_test]
        valid_size = int(round(train_data.shape[0] * self.valid_size))
        validation_data = valid_data[:valid_size]
        validation_labels = valid_labels[:valid_size]
        train_data = np.concatenate((
            valid_data[valid_size:], train_data[ok_train]), axis=0)
        train_labels = np.concatenate((
            valid_labels[valid_size:], train_labels[ok_train]), axis=0)
        return (train_data, validation_data, test_data,
            train_labels, validation_labels, test_labels)
#def processing_global(self, data, base_data=None):
# if not self.transforms.empty('global') and self.apply_transforms and data is not None:
# from pydoc import locate
# fiter, params = self.transforms.get_transforms('global')[0]
# fiter = locate(fiter)
# if isinstance(params, dict):
# self.fit = fiter(**params)
# else:
# self.fit = fiter()
# print(base_data)
# self.fit.fit(base_data)
# return self.fit.transform(data)
#else:
# return self.fit.transform(data)
# else:
# return data
    def build_dataset(self, data, labels, test_data=None, test_labels=None,
validation_data=None, validation_labels=None):
"""
        :type data: ndarray
        :param data: array of values to save in the dataset
        :type labels: ndarray
        :param labels: array of labels to save in the dataset
        :type test_data: ndarray
        :param test_data: explicit test data; used, along with the other explicit splits, when validator is ''
        :type test_labels: ndarray
        :param test_labels: explicit test labels
        :type validation_data: ndarray
        :param validation_data: explicit validation data
        :type validation_labels: ndarray
        :param validation_labels: explicit validation labels
"""
if self.mode == "r":
return
f = self._open_attrs()
if self.validator == '' and test_data is not None and test_labels is not None \
and validation_data is not None and validation_labels is not None:
data_labels = [
data, validation_data, test_data,
labels, validation_labels, test_labels]
else:
if self.validator == 'cross':
data_labels = self.cross_validators(data, labels)
elif self.validator == 'adversarial':
data_labels = self.adversarial_validator(data, labels, test_data, test_labels)
train_data = self.processing(data_labels[0], initial=True)
validation_data = self.processing(data_labels[1])
test_data = self.processing(data_labels[2])
#print("-------", train_data.dtype)
self._set_space_data(f, 'train_data', self.dtype_t(train_data))
self._set_space_data(f, 'test_data', self.dtype_t(test_data))
self._set_space_data(f, 'validation_data', self.dtype_t(validation_data))
self._set_space_data(f, 'train_labels', self.ltype_t(data_labels[3]), label=True)
self._set_space_data(f, 'test_labels', self.ltype_t(data_labels[5]), label=True)
self._set_space_data(f, 'validation_labels', self.ltype_t(data_labels[4]), label=True)
f.close()
self._set_attr("md5", self.calc_md5())
    def _clf(self, train_data=None, train_labels=None, test_data=None, test_labels=None):
        from ml.clf.extended.w_sklearn import RandomForest
        if train_data is None:
            train_data, train_labels = self.train_data, self.train_labels
            test_data, test_labels = self.test_data, self.test_labels
        # relabel the rows: train rows as 1, test rows as 0
        train_labels = np.ones(train_labels.shape[0], dtype=int)
        test_labels = np.zeros(test_labels.shape[0], dtype=int)
        data = np.concatenate((train_data, test_data), axis=0)
        labels = np.concatenate((train_labels, test_labels), axis=0)
        dataset = DataSetBuilder("test_train_separability", apply_transforms=False)
        dataset.build_dataset(data, labels)
        return RandomForest(dataset=dataset)
    def score_train_test(self):
"""
return the score of separability between the train data and the test data.
"""
classif = self._clf()
classif.train()
measure = "auc"
        return classif.load_meta().get("score", {measure: None}).get(measure, None)
def plot(self):
        import matplotlib.pyplot as plt
        import ml.processing
last_transform = self.transforms.get_transforms("row")[-1]
        dl = self.desfragment()
        data, labels = dl.data[:], dl.labels[:]
if last_transform[0] == "tsne":
if last_transform[1]["action"] == "concatenate":
dim = 2
features_tsne = data[:,-dim:]
else:
features_tsne = data
else:
features_tsne = ml.processing.Preprocessing(data, [("tsne",
{"perplexity": 50, "action": "replace"})])
classes = self.labels_info().keys()
colors = ['b', 'r', 'y', 'm', 'c']
classes_colors = dict(zip(classes, colors))
fig, ax = plt.subplots(1, 1, figsize=(17.5, 17.5))
r_indexes = {}
for index, target in enumerate(labels):
r_indexes.setdefault(target, [])
r_indexes[target].append(index)
for target, indexes in r_indexes.items():
features_index = features_tsne[indexes]
ax.scatter(
features_index[:,0],
features_index[:,1],
color=classes_colors[target],
marker='o',
alpha=.4,
label=target)
ax.set(xlabel='X',
ylabel='Y',
title=self.name)
ax.legend(loc=2)
plt.show()
    def convert(self, name, dtype='float64', ltype='|S1', apply_transforms=False,
percentaje=1):
"""
:type name: string
:param name: converted dataset's name
:type dtype: string
:param dtype: cast the data to the defined type
:type ltype: string
:param ltype: cast the labels to the defined type
:type apply_transforms: bool
:param apply_transforms: apply the transforms to the data
        :type percentaje: float
        :param percentaje: value between [0, 1]; the transforms and cast are applied to this fraction of the data, and a subset of that size is returned
"""
dsb = DataSetBuilder(name=name,
dataset_path=self.dataset_path,
transforms=self.transforms,
apply_transforms=apply_transforms,
train_size=self.train_size,
valid_size=self.valid_size,
validator=self.validator,
dtype=dtype,
ltype=ltype,
description=self.description,
author=self.author,
compression_level=self.compression_level,
rewrite=self.rewrite)
dsb._applied_transforms = self.apply_transforms
dsb.build_dataset(
calc_nshape(self.train_data, percentaje),
calc_nshape(self.train_labels, percentaje),
test_data=calc_nshape(self.test_data, percentaje),
test_labels=calc_nshape(self.test_labels, percentaje),
validation_data=calc_nshape(self.validation_data, percentaje),
validation_labels=calc_nshape(self.validation_labels, percentaje))
dsb.close_reader()
return dsb
class DataSetBuilderImage(DataSetBuilder):
"""
    Class for building image datasets. Gets the data from a directory where each subdirectory's name is the label.
    :type image_size: int
    :param image_size: define the image size to save in the dataset
    :type train_folder_path: string
    :param train_folder_path: path to the images to add to the dataset; the data is split into train, test, and validation sets
    kwargs are the same as DataSetBuilder's options
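
    Example (a minimal sketch; the name, image size, and path are illustrative)::

        dsbi = DataSetBuilderImage(name="digits_img", image_size=28,
                                   train_folder_path="/tmp/images/train")
        dsbi.build_dataset()
        dsbi.info()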
"""
def __init__(self, name=None, image_size=None, train_folder_path=None, **kwargs):
super(DataSetBuilderImage, self).__init__(name, **kwargs)
self.image_size = image_size
self.train_folder_path = train_folder_path
def images_from_directories(self, directories):
        if isinstance(directories, str):
            directories = [directories]
        elif not isinstance(directories, list):
            raise TypeError("directories must be a string or a list of strings")
images = []
for root_directory in directories:
for directory in os.listdir(root_directory):
files = os.path.join(root_directory, directory)
if os.path.isdir(files):
number_id = directory
for image_file in os.listdir(files):
images.append((number_id, os.path.join(files, image_file)))
return images
    def images_to_dataset(self, folder_base):
"""
:type folder_base: string path
        :param folder_base: path where the images to convert live
extract the images from folder_base, where folder_base has the structure folder_base/label/
"""
images = self.images_from_directories(folder_base)
labels = np.ndarray(shape=(len(images),), dtype='|S1')
data = []
for image_index, (number_id, image_file) in enumerate(images):
img = io.imread(image_file)
data.append(img)
labels[image_index] = number_id
return data, labels
    @classmethod
    def save_images(cls, url, number_id, images, rewrite=False):
if not os.path.exists(url):
os.makedirs(url)
n_url = os.path.join(url, number_id)
if not os.path.exists(n_url):
os.makedirs(n_url)
initial = 0 if rewrite else len(os.listdir(n_url))
for i, image in enumerate(images, initial):
try:
image_path = "img-{}-{}.png".format(number_id, i)
io.imsave(os.path.join(n_url, image_path), image)
except IndexError:
print("Index error", n_url, number_id)
def clean_directory(self, path):
import shutil
shutil.rmtree(path)
    def build_dataset(self):
"""
the data is extracted from the train_folder_path, and then saved.
"""
data, labels = self.images_to_dataset(self.train_folder_path)
super(DataSetBuilderImage, self).build_dataset(data, labels)
def labels_images(self, urls):
images_data = []
labels = []
if not isinstance(urls, list):
urls = [urls]
for url in urls:
for number_id, path in self.images_from_directories(url):
images_data.append(io.imread(path))
labels.append(number_id)
return images_data, labels
    def copy(self, percentaje=1):
        dataset = super(DataSetBuilderImage, self).copy(percentaje=percentaje)
        dataset.image_size = self.image_size
        return dataset
    def info(self, classes=False):
        super(DataSetBuilderImage, self).info(classes=classes)
        print('Image Size {}x{}'.format(self.image_size, self.image_size))
class DataSetBuilderFile(DataSetBuilder):
"""
    Class for building datasets from a csv file.
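
    Example (a minimal sketch; the name, path, and column are illustrative)::

        dsbf = DataSetBuilderFile(name="prices",
                                  train_folder_path="/tmp/data.csv")
        dsbf.build_dataset(label_column="target")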
"""
def __init__(self, name=None, train_folder_path=None, **kwargs):
super(DataSetBuilderFile, self).__init__(name, **kwargs)
self.train_folder_path = train_folder_path
    def from_csv(self, folder_path, label_column):
"""
:type folder_path: string
:param folder_path: path to the csv.
:type label_column: string
        :param label_column: name of the column that holds the labels
"""
data, labels = self.csv2dataset(folder_path, label_column)
return data, labels
    @classmethod
    def csv2dataset(cls, path, label_column):
        df = pd.read_csv(path)
        dataset = df.drop([label_column], axis=1).values
        labels = df[label_column].values
return dataset, labels
    def build_dataset(self, label_column=None):
"""
:type label_column: string
        :param label_column: name of the column that holds the labels
"""
data, labels = self.from_csv(self.train_folder_path, label_column)
super(DataSetBuilderFile, self).build_dataset(data, labels)
@classmethod
    def merge_data_labels(cls, data_path, labels_path, column_id):
data_df = pd.read_csv(data_path)
labels_df = pd.read_csv(labels_path)
return pd.merge(data_df, labels_df, on=column_id)
class DataSetBuilderFold(object):
"""
    Class for creating dataset folds from other datasets.
    :type n_splits: int
    :param n_splits: number of splits to apply to the dataset
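
    Example (a minimal sketch; ``dl`` is assumed to be an existing DataLabel or DataSetBuilder)::

        folds = DataSetBuilderFold(n_splits=5)
        folds.build_dataset(dataset=dl)
        for dsb in folds.get_splits():
            dsb.info()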
"""
def __init__(self, n_splits=2):
self.name = uuid.uuid4().hex
self.splits = []
self.n_splits = n_splits
    def create_folds(self, dl):
"""
:type dl: DataLabel
:param dl: datalabel to split
        return an iterator that yields the datalabel split into n_splits DataSetBuilder datasets
"""
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=self.n_splits)
for i, (train, test) in enumerate(skf.split(dl.data, dl.labels)):
validation_index = int(train.shape[0] * .1)
validation = train[:validation_index]
train = train[validation_index:]
dsb = DataSetBuilder(name=self.name+"_"+str(i),
dataset_path=settings["dataset_folds_path"],
transforms=None,
apply_transforms=False,
dtype=dl.dtype,
ltype=dl.ltype,
description="",
author="",
compression_level=9,
rewrite=True)
data = dl.data[:]
labels = dl.labels[:]
dsb.build_dataset(data[train], labels[train], test_data=data[test],
test_labels=labels[test], validation_data=data[validation],
validation_labels=labels[validation])
dsb.close_reader()
yield dsb
    def build_dataset(self, dataset=None):
"""
:type dataset: DataLabel
:param dataset: dataset to fold
        construct the dataset folds from a DataSet class
"""
dl = dataset.desfragment()
for dsb in self.create_folds(dl):
self.splits.append(dsb.name)
dl.destroy()
    def get_splits(self):
"""
        return an iterator of datasets with the splits of the original data
"""
for split in self.splits:
yield DataSetBuilder(name=split, dataset_path=settings["dataset_folds_path"])
def destroy(self):
for split in self.get_splits():
split.destroy()