Source code for qlearnkit.datasets.dataset_helper

from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
from sklearn.decomposition import PCA

from typing import Optional, Union, Tuple, List
import numpy as np


def features_labels_from_data(X: Union[np.ndarray, list],
                              y: Union[np.ndarray, list],
                              train_size: Optional[Union[float, int]] = None,
                              test_size: Optional[Union[float, int]] = None,
                              n_features: Optional[int] = None,
                              *,
                              use_pca: bool = False,
                              return_bunch: bool = False):
    """
    Splits a dataset according to the required train size, test size
    and number of features.

    Args:
        X: raw data from the dataset
        y: labels from the dataset
        train_size: float or int, default=None.
            If float, should be between 0.0 and 1.0 and represent the
            proportion of the dataset to include in the train split.
            If int, represents the absolute number of train samples.
            If None, the value is automatically set to the complement
            of the test size.
        test_size: float or int, default=None.
            If float, should be between 0.0 and 1.0 and represent the
            proportion of the dataset to include in the test split.
            If int, represents the absolute number of test samples.
            If None, the value is set to the complement of the train
            size. If ``train_size`` is also None, it will be set to 0.25.
        n_features: number of desired features
        use_pca: whether to use PCA for dimensionality reduction
            (default False)
        return_bunch: whether to return a :class:`sklearn.utils.Bunch`
            (similar to a dictionary) or a plain tuple

    Returns:
        The preprocessed dataset, in the same form made available
        by sklearn.
    """
    # split the dataset according to the required train and test sizes;
    # invalid sizes are already handled by train_test_split
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y,
                         test_size=test_size,
                         train_size=train_size,
                         random_state=42)

    if n_features is not None:
        if use_pca:
            X_train, X_test = pca_reduce(X_train, X_test,
                                         n_components=n_features)
        else:
            # simply keep the first n_features columns
            X_train = X_train[:, 0:n_features]
            X_test = X_test[:, 0:n_features]

    if return_bunch:
        # a Bunch is similar to a dictionary
        # (it actually inherits from it)
        return Bunch(
            training_data=X_train,
            test_data=X_test,
            training_labels=y_train,
            test_labels=y_test
        )

    return (
        X_train,
        X_test,
        y_train,
        y_test
    )
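A minimal usage sketch (the iris loading and the printed shapes below are illustrative, not part of this module):

from sklearn.datasets import load_iris

# split iris into 20 training and 10 test samples,
# keeping only the first 2 features
iris = load_iris()
X_train, X_test, y_train, y_test = features_labels_from_data(
    iris.data, iris.target,
    train_size=20, test_size=10,
    n_features=2
)
print(X_train.shape, X_test.shape)  # (20, 2) (10, 2)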
def pca_reduce(X_train: np.ndarray,
               X_test: np.ndarray,
               n_components: int = 2) -> Tuple[np.ndarray, np.ndarray]:
    """
    Reduces train and test data to ``n_components`` features via PCA.
    """
    pca = PCA(n_components=n_components)
    X_train = pca.fit_transform(X_train)
    # fit PCA on the training data only, then project the test data
    # with the already-fitted model: refitting on the test set would
    # yield inconsistent components and leak test information
    X_test = pca.transform(X_test)
    return (
        X_train,
        X_test
    )
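A quick sketch of the intended use, on random placeholder data:

import numpy as np

rng = np.random.default_rng(0)
X_train = rng.normal(size=(80, 4))
X_test = rng.normal(size=(20, 4))

# both splits end up in the 2-dimensional space fitted on X_train
X_train_2d, X_test_2d = pca_reduce(X_train, X_test, n_components=2)
print(X_train_2d.shape, X_test_2d.shape)  # (80, 2) (20, 2)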
def label_to_class_name(predicted_labels, classes) -> List[str]:
    """
    Helper that converts numeric labels to class names (strings).

    Args:
        predicted_labels (numpy.ndarray): Nx1 array
        classes (dict or list): a mapping from label (numeric)
            to class name (str)

    Returns:
        list of predicted class names, one per datum

    Example:
        classes = ['sepal length (cm)',
                   'sepal width (cm)',
                   'petal length (cm)',
                   'petal width (cm)']
        predicted_labels = [0, 2, 1, 2, 0]
        print(label_to_class_name(predicted_labels, classes))
    """
    # np.atleast_1d accepts scalars, lists and arrays alike, so the
    # comprehension below always iterates over individual labels
    predicted_labels = np.atleast_1d(predicted_labels)

    predicted_class_names = [
        classes[predicted_label] for predicted_label in predicted_labels
    ]
    return predicted_class_names
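Running the docstring example produces one class name per predicted label; the output line below is inferred from plain list indexing:

classes = ['sepal length (cm)', 'sepal width (cm)',
           'petal length (cm)', 'petal width (cm)']
print(label_to_class_name([0, 2, 1, 2, 0], classes))
# ['sepal length (cm)', 'petal length (cm)', 'sepal width (cm)',
#  'petal length (cm)', 'sepal length (cm)']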