import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from bcselector.filter_methods.cost_based_filter_methods import difference_find_best_feature, fraction_find_best_feature
from bcselector.filter_methods.no_cost_based_filter_methods import no_cost_find_best_feature
from bcselector.information_theory.j_criterion_approximations import mim, mifs, mrmr, jmi, cife
__all__ = [
'_VariableSelector',
'DiffVariableSelector',
'FractionVariableSelector'
]
class _VariableSelector():
"""
    Parent class providing the basic functions and attributes common to all selectors.
    This class does not implement a complete `fit` method.
"""
def __init__(self):
self.data = None
self.target_variable = None
self.costs = None
self.normalized_costs = None
self.budget = None
self.criterion_values = []
self.filter_values = []
self.colnames = None
self.stop_budget = False
self.X_train = None
self.X_test = None
self.y_train = None
self.y_test = None
        self.variables_selected_order = []
        self.cost_variables_selected_order = []
        self.no_cost_variables_selected_order = []
        self.no_cost_cost_variables_selected_order = []
self.j_criterion_func = None
self.total_scores = None
self.total_costs = None
self.no_cost_total_scores = None
self.no_cost_total_costs = None
self.model = None
self.scoring_function = None
self.beta = None
self.number_of_features = None
self.fig = None
self.ax = None
def fit(self, data, target_variable, costs, j_criterion_func='cife', seed=42, budget=None, test_size=0.2, **kwargs):
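        """Shared setup for all selectors: validates `data`, `target_variable`, and `costs`,
        normalizes the costs, resolves `j_criterion_func`, and creates the train/test split.
        Does not perform feature selection itself.
        """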
self.variables_selected_order = []
self.cost_variables_selected_order = []
        if 'beta' in kwargs:
self.beta = kwargs['beta']
# data & costs
        assert isinstance(data, (np.ndarray, pd.DataFrame)), "Argument `data` must be numpy.ndarray or pandas.DataFrame"
if isinstance(data, np.ndarray):
assert isinstance(costs, list), "When using `data` as np.array, provide `costs` as list of floats or integers"
else:
assert isinstance(costs, (list, dict)), "When using `data` as pd.DataFrame, provide `costs` as list of floats or integers or dict {'col_1':cost_1,...}"
if isinstance(data, pd.DataFrame):
self.data = data.values
self.colnames = data.columns
if isinstance(costs, dict):
self.costs = [costs[x] for x in data.columns]
else:
self.costs = costs
else:
self.data = data
self.colnames = ['var_' + str(i) for i in np.arange(1, self.data.shape[1]+1)]
self.costs = costs
# normalized costs
if (min(self.costs) >= 0) and (max(self.costs) <= 1):
self.normalized_costs = self.costs
else:
            # Min-max normalize; the 0.0001 offset keeps every cost strictly positive
            self.normalized_costs = list((np.array(self.costs) - min(self.costs) + 0.0001)/(max(self.costs)-min(self.costs)+0.0001))
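            # Illustrative example (not from the library docs): costs [2, 5, 10]
            # map to roughly [1.25e-05, 0.375, 1.0], so the cheapest feature keeps
            # a tiny positive cost instead of exactly 0.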
assert len(self.data.shape) == 2, "For `data` argument use numpy array of shape (n,p) or pandas DataFrame"
        assert data.shape[1] == len(costs), "Length of `costs` must equal the number of columns in `data`"
# target_variable
        assert isinstance(target_variable, (np.ndarray, pd.Series)), "Use np.array or pd.Series for argument `target_variable`"
        if isinstance(target_variable, pd.Series):
self.target_variable = target_variable.values
else:
self.target_variable = target_variable
assert self.data.shape[0] == len(self.target_variable), "Number of rows in 'data' must equal target_variable length"
# j_criterion_func
j_criterion_dict = {'mim': mim, 'mifs': mifs, 'mrmr': mrmr, 'jmi': jmi, 'cife': cife}
        assert j_criterion_func in j_criterion_dict, "Argument `j_criterion_func` must be one of ['mim','mifs','mrmr','jmi','cife']"
self.j_criterion_func = j_criterion_dict[j_criterion_func]
if budget is not None:
assert isinstance(budget, (int, float)), "Argument `budget` must be float or int."
            assert budget >= 0, "Argument `budget` must be greater than or equal to 0."
self.budget = budget
# Train test split
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.data, self.target_variable, test_size=test_size, random_state=seed)
def get_cost_results(self):
"""Getter to obtain cost-sensitive results.
Returns
-------
        variables_selected_order: list
            Indices of features selected.
        cost_variables_selected_order: list
            Costs of features selected. In the same order as `variables_selected_order`.
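        Examples
        --------
        A minimal usage sketch (assumes `X`, `y`, and `costs` are already defined):

        >>> dvs = DiffVariableSelector()
        >>> dvs.fit(X, y, costs, lamb=1, j_criterion_func='mim')
        >>> order, order_costs = dvs.get_cost_results()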
"""
assert len(self.variables_selected_order) > 0, "Run fit method first."
return self.variables_selected_order, self.cost_variables_selected_order
def get_no_cost_results(self):
"""Getter to obtain NO-cost-sensitive results.
Returns
-------
        no_cost_variables_selected_order: list
            Indices of features selected, ignoring costs.
        no_cost_cost_variables_selected_order: list
            Costs of features selected. In the same order as `no_cost_variables_selected_order`.
"""
assert len(self.no_cost_variables_selected_order) > 0, "Run fit_no_cost or plot_scores method first."
return self.no_cost_variables_selected_order, self.no_cost_cost_variables_selected_order
def score(self, model, scoring_function):
"""Method scores selected features step by step by `scoring_function`. In each step one more feature is added.
Of course user can do that on his own, but using `score` function we are sure that feature selection is performed on the same train set and it is much easier to use,
than writing a loop on our own.
Parameters
----------
model: sklearn.base.ClassifierMixin
Any classifier from sklearn API.
scoring_function: function
            Classification metric function from sklearn. Currently must be `roc_auc_score`; to request more scoring functions, open a GH issue.
Returns
-------
        total_scores: list
            List of `scoring_function` scores for each step. One step is one feature in algorithm ranking order.
        total_costs: list
            List of accumulated costs for each step. One step is one feature in algorithm ranking order.
Examples
--------
>>> from bcselector.variable_selection import FractionVariableSelector
>>> from sklearn.metrics import roc_auc_score
>>> from sklearn.linear_model import LogisticRegression
>>> fvs = FractionVariableSelector()
        >>> fvs.fit(X, y, costs, r=1, j_criterion_func='mim')
        >>> model = LogisticRegression()
        >>> fvs.score(model, roc_auc_score)
"""
assert isinstance(model, sklearn.base.ClassifierMixin), "Model must be sklearn.base.ClassifierMixin."
self.total_scores = []
self.total_costs = []
self.model = model
self.scoring_function = scoring_function
assert len(self.variables_selected_order) > 0, "Run fit method first."
current_cost = 0
for i, var_id in enumerate(tqdm(self.variables_selected_order, desc='Scoring')):
cur_vars = self.variables_selected_order[0:i+1]
self.model = self.model.fit(X=self.X_train[:, cur_vars], y=self.y_train)
y_hat = self.model.predict_proba(self.X_test[:, cur_vars])[:, 1]
score = self.scoring_function(self.y_test, y_hat)
current_cost += self.costs[var_id]
self.total_scores.append(score)
self.total_costs.append(current_cost)
return self.total_scores, self.total_costs
def fit_no_cost(self):
"""Ranks all features in dataset with the same method as previously with `fit` method but costs are not considered at all.
Returns
-------
        no_cost_variables_selected_order: list
            Indices of features selected, ignoring costs.
        no_cost_cost_variables_selected_order: list
            Costs of features selected. In the same order as `no_cost_variables_selected_order`.
Examples
--------
>>> from bcselector.variable_selection import FractionVariableSelector
>>> fvs = FractionVariableSelector()
        >>> fvs.fit(X, y, costs, r=1, j_criterion_func='mim')
>>> fvs.fit_no_cost()
"""
assert self.j_criterion_func, "Must run `fit` method first."
self._fit_no_cost(stop_budget=self.stop_budget)
return self.no_cost_variables_selected_order, self.no_cost_cost_variables_selected_order
def plot_scores(self, budget=None, compare_no_cost_method=False, savefig=False, annotate=False, annotate_box=False,
figsize=(12, 8),
bbox_pos=(0.72, 0.60),
plot_title=None, x_axis_title=None, y_axis_title=None, **kwargs):
"""Plots scores of each iteration of the algorithm.
Parameters
----------
        budget: int or float
            Budget to be plotted on the figure as a vertical line.
        compare_no_cost_method: bool = False
            Plot the no-cost curve on the same figure.
        savefig: bool
            Save the figure with scores; `plt.savefig` arguments are passed via **kwargs.
        annotate: bool
            Annotate the plotted points with feature indices.
        annotate_box: bool
            Draw a text box with feature data: index, name, and cost.
        figsize: tuple
            Figure size passed to `plt.subplots`.
        bbox_pos: tuple
            Position of the feature-data box, in axes coordinates.
        plot_title: str
        x_axis_title: str
        y_axis_title: str
        **kwargs
            Arguments passed to `plt.savefig()`; when `savefig` is True, `fig_name` must be provided here.
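        Examples
        --------
        A minimal sketch (assumes `fit` and `score` have already been run):

        >>> fvs.plot_scores(budget=10, compare_no_cost_method=True, annotate=True)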
"""
assert self.total_scores, "Run `score` method first."
self.fig, self.ax = plt.subplots(figsize=figsize)
if budget is not None:
assert isinstance(budget, (int, float)), "Argument `budget` must be float or int."
self.ax.axvline(x=budget, linewidth=3, label=f'budget={budget:.2f}')
elif self.budget is not None:
self.ax.axvline(x=self.budget, linewidth=3, label=f'budget={self.budget:.2f}')
move_horizontal = max(self.total_costs)/100
move_vertical = max(self.total_scores)/100
if compare_no_cost_method is True:
self._fit_no_cost(stop_budget=self.stop_budget)
self._score_no_cost()
self.ax.plot(self.no_cost_total_costs, self.no_cost_total_scores, linestyle='--', marker='o', color='r', label='no regard to cost')
self.ax.plot(self.total_costs, self.total_scores, linestyle='--', marker='o', color='b', label='with regard to costs')
self.ax.legend(prop={"size": 16}, loc='lower right')
if annotate:
move_horizontal = max(self.no_cost_total_costs + self.total_costs)/100
move_vertical = max(self.no_cost_total_scores + self.total_scores)/100
costs_normalized_to_alpha = list(
(np.array(self.no_cost_cost_variables_selected_order) - min(self.costs) + 0.7) /
(max(self.costs) - min(self.costs)+0.7)
)
for i, txt in enumerate(self.no_cost_variables_selected_order):
self.ax.annotate(
txt,
(self.no_cost_total_costs[i], self.no_cost_total_scores[i]),
bbox=dict(boxstyle="round", alpha=costs_normalized_to_alpha[i], color='red'),
xytext=(self.no_cost_total_costs[i]+move_horizontal, self.no_cost_total_scores[i]+move_vertical*0.5),
size=10,
color='white')
else:
self.ax.plot(self.total_costs, self.total_scores, linestyle='--', marker='o', color='b')
if annotate:
                costs_normalized_to_alpha = list((
                    np.array(self.cost_variables_selected_order) - min(self.costs) + 0.7) /
                    (max(self.costs) - min(self.costs) + 0.7))
for i, txt in enumerate(self.variables_selected_order):
self.ax.annotate(
txt,
(self.total_costs[i], self.total_scores[i]),
bbox=dict(boxstyle="round", alpha=costs_normalized_to_alpha[i], color='blue'),
xytext=(self.total_costs[i]+move_horizontal, self.total_scores[i]-move_vertical),
size=10,
color='white')
self.ax.tick_params(axis='both', which='major', labelsize=16)
if plot_title is None:
self.ax.set_title('Model ' + self.scoring_function.__name__ + ' vs cost', fontsize=18)
else:
self.ax.set_title(plot_title, fontsize=18)
if x_axis_title is None:
self.ax.set_xlabel('Cost', fontsize=16)
else:
self.ax.set_xlabel(x_axis_title, fontsize=16)
if y_axis_title is None:
self.ax.set_ylabel(self.scoring_function.__name__, fontsize=16)
else:
self.ax.set_ylabel(y_axis_title, fontsize=16)
# BBox with feature names
if annotate_box:
            variables_idx = sorted(set(self.variables_selected_order).union(self.no_cost_variables_selected_order))
variables_names = [self.colnames[i] for i in variables_idx]
variables_costs = [self.costs[i] for i in variables_idx]
textstr = '\n'.join([str(idx) + ': ' + name + f' C={cost:.2f}' for idx, name, cost in zip(variables_idx, variables_names, variables_costs)])
props = dict(boxstyle='round', facecolor='gray', alpha=0.1)
self.ax.text(bbox_pos[0], bbox_pos[1], textstr, transform=self.ax.transAxes, fontsize=14, verticalalignment='top', bbox=props, size=12, color='gray')
        plt.tight_layout()
        if savefig:
            assert kwargs.get('fig_name'), "Must specify `fig_name` as a keyword argument"
            name = kwargs.pop('fig_name')
            plt.savefig(name, **kwargs)
plt.show()
def _fit_no_cost(self, stop_budget=False, **kwargs):
S = set()
        U = set(range(self.data.shape[1]))
self.no_cost_variables_selected_order = []
self.no_cost_cost_variables_selected_order = []
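        # Greedy forward selection, ignoring costs: S holds the indices selected
        # so far, U the remaining candidates; each iteration moves the single best
        # feature according to `j_criterion_func` from U to S.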
for _ in tqdm(range(self.number_of_features), desc='Selecting No-cost Features'):
k, _, cost = no_cost_find_best_feature(
j_criterion_func=self.j_criterion_func,
                data=self.X_train,
                target_variable=self.y_train,
prev_variables_index=list(S),
possible_variables_index=list(U),
costs=self.costs,
beta=self.beta)
S.add(k)
            if stop_budget and (sum(self.no_cost_cost_variables_selected_order) + cost) >= (self.budget if self.budget is not None else np.inf):
break
self.no_cost_variables_selected_order.append(k)
self.no_cost_cost_variables_selected_order.append(cost)
            U.remove(k)
if len(S) == self.number_of_features:
break
def _score_no_cost(self):
current_cost = 0
self.no_cost_total_scores = []
self.no_cost_total_costs = []
for i, var_id in enumerate(self.no_cost_variables_selected_order):
cur_vars = self.no_cost_variables_selected_order[0:i+1]
self.model = self.model.fit(X=self.X_train[:, cur_vars], y=self.y_train)
y_hat = self.model.predict_proba(self.X_test[:, cur_vars])[:, 1]
            score = self.scoring_function(self.y_test, y_hat)
current_cost += self.costs[var_id]
self.no_cost_total_scores.append(score)
self.no_cost_total_costs.append(current_cost)
class DiffVariableSelector(_VariableSelector):
"""
Ranks all features in dataset with difference cost filter method.
"""
    def fit(self, data, target_variable, costs, lamb, j_criterion_func='cife', number_of_features=None, budget=None, stop_budget=False, **kwargs):
"""Ranks all features in dataset with difference cost filter method.
Parameters
----------
        data: np.ndarray or pd.DataFrame
            Matrix or data frame whose features we want to rank.
        target_variable: np.ndarray or pd.core.series.Series
            Vector or series of the target variable. Its length must equal the number of rows in `data`.
costs: list or dict
Costs of features. Must be the same size as columns in `data`.
When using `data` as np.array, provide `costs` as list of floats or integers.
When using `data` as pd.DataFrame, provide `costs` as list of floats or integers or dict {'col_1':cost_1,...}.
lamb: int or float
            Cost scaling parameter. The higher `lamb` is, the higher the impact of the cost on selection.
j_criterion_func: str
Method of approximation of the conditional mutual information
Must be one of ['mim','mifs','mrmr','jmi','cife'].
            All methods can be listed by running:
            >>> from bcselector.information_theory.j_criterion_approximations import __all__
number_of_features: int
Optional argument, constraint to selected number of features.
budget: int or float
Optional argument, constraint to selected total cost of features.
        stop_budget: bool
            Optional argument; when True, feature selection stops once the accumulated cost would reach or exceed `budget`. (TODO: slated for removal.)
**kwargs
Arguments passed to `difference_find_best_feature()` function and then to `j_criterion_func`.
Examples
--------
>>> from bcselector.variable_selection import DiffVariableSelector
>>> dvs = DiffVariableSelector()
>>> dvs.fit(X, y, costs, lamb=1, j_criterion_func='mim')
"""
# lamb
        assert isinstance(lamb, (int, float)), "Argument `lamb` must be integer or float"
self.lamb = lamb
self.stop_budget = stop_budget
super().fit(data=data, target_variable=target_variable, costs=costs, j_criterion_func=j_criterion_func, budget=budget, **kwargs)
if number_of_features is None:
self.number_of_features = self.data.shape[1]
else:
self.number_of_features = number_of_features
if self.budget is None and stop_budget:
warnings.warn("Unused argument `stop_budget`. Works only with `budget` argument.")
S = set()
        U = set(range(self.data.shape[1]))
self.variables_selected_order = []
self.cost_variables_selected_order = []
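        # Greedy forward selection: each iteration moves the feature with the
        # best difference filter value (roughly the J-criterion minus `lamb`
        # times the normalized cost; see `difference_find_best_feature` for the
        # exact form) from the candidate set U to the selected set S.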
        for _ in tqdm(range(self.number_of_features), desc=f'Selecting Features for lambda = {self.lamb:0.3f}'):
k, filter_value, criterion_value, cost = difference_find_best_feature(
j_criterion_func=self.j_criterion_func,
data=self.X_train,
target_variable=self.y_train,
prev_variables_index=list(S),
possible_variables_index=list(U),
costs=self.costs,
normalized_costs=self.normalized_costs,
lamb=self.lamb,
**kwargs)
S.add(k)
            if stop_budget and (sum(self.cost_variables_selected_order) + cost) >= (self.budget if self.budget is not None else np.inf):
break
self.variables_selected_order.append(k)
self.cost_variables_selected_order.append(cost)
self.criterion_values.append(criterion_value)
self.filter_values.append(filter_value)
            U.remove(k)
if len(S) == self.number_of_features:
break
class FractionVariableSelector(_VariableSelector):
    """
    Ranks all features in dataset with fraction cost filter method.
    """
    def fit(self, data, target_variable, costs, r, j_criterion_func='cife', number_of_features=None, budget=None, stop_budget=False, **kwargs):
"""Ranks all features in dataset with fraction cost filter method.
Parameters
----------
        data: np.ndarray or pd.DataFrame
            Matrix or data frame whose features we want to rank.
        target_variable: np.ndarray or pd.core.series.Series
            Vector or series of the target variable. Its length must equal the number of rows in `data`.
costs: list or dict
Costs of features. Must be the same size as columns in `data`.
When using `data` as np.array, provide `costs` as list of floats or integers.
When using `data` as pd.DataFrame, provide `costs` as list of floats or integers or dict {'col_1':cost_1,...}.
r: int or float
            Cost scaling parameter. The higher `r` is, the higher the impact of the cost on selection.
j_criterion_func: str
Method of approximation of the conditional mutual information
Must be one of ['mim','mifs','mrmr','jmi','cife'].
            All methods can be listed by running:
            >>> from bcselector.information_theory.j_criterion_approximations import __all__
number_of_features: int
Optional argument, constraint to selected number of features.
budget: int or float
Optional argument, constraint to selected total cost of features.
        stop_budget: bool
            Optional argument; when True, feature selection stops once the accumulated cost would reach or exceed `budget`. (TODO: slated for removal.)
**kwargs
Arguments passed to `fraction_find_best_feature()` function and then to `j_criterion_func`.
Examples
--------
>>> from bcselector.variable_selection import FractionVariableSelector
>>> fvs = FractionVariableSelector()
        >>> fvs.fit(X, y, costs, r=1, j_criterion_func='mim')
"""
# r
        assert isinstance(r, (int, float)), "Argument `r` must be integer or float"
self.r = r
self.stop_budget = stop_budget
super().fit(data=data, target_variable=target_variable, costs=costs, j_criterion_func=j_criterion_func, budget=budget, **kwargs)
if number_of_features is None:
self.number_of_features = self.data.shape[1]
else:
self.number_of_features = number_of_features
if self.budget is None and stop_budget:
warnings.warn("Unused argument `stop_budget`. Works only with `budget` argument.")
S = set()
        U = set(range(self.data.shape[1]))
self.variables_selected_order = []
self.cost_variables_selected_order = []
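        # Same greedy loop as in DiffVariableSelector, but candidates are ranked
        # by the fraction filter value (roughly the J-criterion divided by the
        # cost scaled by `r`; see `fraction_find_best_feature` for the exact form).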
for _ in tqdm(range(self.number_of_features), desc=f'Selecting Features for r = {self.r:0.3f}'):
k, filter_value, criterion_value, cost = fraction_find_best_feature(
j_criterion_func=self.j_criterion_func,
                data=self.X_train,
                target_variable=self.y_train,
prev_variables_index=list(S),
possible_variables_index=list(U),
costs=self.costs,
normalized_costs=self.normalized_costs,
r=self.r,
**kwargs)
S.add(k)
            if stop_budget and (sum(self.cost_variables_selected_order) + cost) >= (self.budget if self.budget is not None else np.inf):
break
self.variables_selected_order.append(k)
self.cost_variables_selected_order.append(cost)
self.criterion_values.append(criterion_value)
self.filter_values.append(filter_value)
            U.remove(k)
if len(S) == self.number_of_features:
break
class NoCostVariableSelector(_VariableSelector):
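    """
    Ranks all features with the plain `j_criterion_func`, without considering costs.
    """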
def fit(self, data, target_variable, costs, j_criterion_func='cife', **kwargs):
        super().fit(data, target_variable, costs, j_criterion_func, **kwargs)
        self.number_of_features = self.data.shape[1]
S = set()
        U = set(range(self.data.shape[1]))
self.variables_selected_order = []
self.cost_variables_selected_order = []
        for _ in tqdm(range(len(U)), desc='Selecting No-cost Features'):
k, _, cost = no_cost_find_best_feature(
j_criterion_func=self.j_criterion_func,
data=self.data,
target_variable=self.target_variable,
prev_variables_index=list(S),
possible_variables_index=list(U),
costs=self.costs)
S.add(k)
self.variables_selected_order.append(k)
self.cost_variables_selected_order.append(cost)
            U.remove(k)
if len(S) == self.number_of_features:
break
    def plot_scores(self, **kwargs):
        super().plot_scores(**kwargs)
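# Minimal usage sketch (illustrative only, not part of the library API): rank
# discrete synthetic features with the fraction filter, score them with a
# logistic regression, and plot score against accumulated cost. The feature
# costs below are assumed values chosen for the demonstration.
if __name__ == "__main__":
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(42)
    X = rng.integers(0, 3, size=(200, 5))          # discrete features suit the MI estimators
    y = (X[:, 0] + rng.integers(0, 2, size=200) > 1).astype(int)
    costs = [1.0, 2.0, 3.0, 4.0, 5.0]              # assumed per-feature acquisition costs

    fvs = FractionVariableSelector()
    fvs.fit(X, y, costs, r=1, j_criterion_func='mim')
    fvs.score(LogisticRegression(), roc_auc_score)
    fvs.plot_scores(compare_no_cost_method=True)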