# -*- coding: utf-8 -*-
# Author: Stephen Szwiec
# Date: 2023-02-19
# Description: Multi-Processing Feature Selection Module
#
# Copyright (C) 2023 Stephen Szwiec
#
# This file is part of qsarify.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Multi-Processing Feature Selection Module
This module contains the functions for performing feature selection using
the clustering module's output as a guide for feature selection, and implements
a genetic algorithm for feature selection using reflection.
"""
import datetime
import itertools
import multiprocessing as mp
import random

import numpy as np
from sklearn import linear_model as lm
from sklearn.svm import SVC
"""
Reflector class for the evolve function; allows for the use of a pool of workers.
"""
class Evolution:
    """
    Initializes the evolution class with the learning algorithm to be used
    """
    def __init__(self, evolve, e_mlr=None):
        # accept the scoring estimator from the caller so that classification
        # runs use their SVC; default to a linear model if none is supplied
        self.e_mlr = e_mlr if e_mlr is not None else lm.LinearRegression()
        self.evolve = evolve

    """
    Function call for the evolution function
    """
    def __call__(self, i, cluster_info, cluster, X_data, y_data):
        return self.evolve(i, cluster_info, cluster, X_data, y_data, self.e_mlr)
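
    # Note: using a callable class instead of a bare function lets each call
    # carry its scoring estimator along with it, and instances of this class
    # can be pickled and dispatched by multiprocessing.Pool.starmap below.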
"""
Evolution of descriptors for learning algorithm, implemented as a function map
Parameters
----------
i: list, descriptor set
cluster_info: dict, descriptor cluster information
cluster: list, descriptor cluster
X_data: DataFrame, descriptor data
y_data: DataFrame, target data
"""
    def evolve(i, cluster_info, cluster, X_data, y_data, e_mlr):
        # get the descriptors in the model (drop the score)
        i = i[1]
        # get the zero-indexed cluster groups already represented in the model
        group_n = [cluster_info[x] - 1 for x in i]
        # randomly select one descriptor position to swap out
        sw_index = random.randrange(0, len(i))
        # randomly select a new cluster group to swap with,
        # ensuring it is not a group already in the model
        sw_group = random.randrange(0, len(cluster))
        while sw_group in group_n:
            sw_group = random.randrange(0, len(cluster))
        # generate a new descriptor set by swapping the indexed descriptor
        # with one randomly chosen from the new cluster group
        b_set = [random.choice(cluster[sw_group]) if x == sw_index else i[x] for x in range(len(i))]
        b_set.sort()
        x = X_data[b_set].values
        y = y_data.values.ravel()
        score = e_mlr.fit(x, y).score(x, y)
        return [score, b_set]
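
    # Illustrative run (hypothetical names): given i = [0.91, ['descA', 'descC']],
    # with 'descA' in cluster 1 and 'descC' in cluster 3, evolve() might replace
    # the descriptor at the randomly chosen index with a random member of, say,
    # cluster 2, returning something like [0.88, ['descA', 'descB']].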
def selection(X_data, y_data, cluster_info, model="regression", learning=500000, bank=200, component=4, interval=1000, cores=(mp.cpu_count()*2)-1):
"""
Forward feature selection using cophenetically correlated data on mutliple cores
Parameters
----------
X_data : pandas DataFrame , shape = (n_samples, n_features)
y_data : pandas DataFrame , shape = (n_samples,)
cluster_info : dictionary returned by clustering.featureCluster.set_cluster()
model : default="regression", otherwise "classification"
learning : default=500000, number of overall models to be trained
bank : default=200, number of models to be trained in each iteration
component : default=4, number of features to be selected
interval : optional, default=1000, print current scoring and selected features
every interval
cores: optional, default=(mp.cpu_count()*2)-1, number of processes to be used
for multiprocessing; default is twice the number of cores minus 1, which
is assuming you have SMT, HT, or something similar) If you have a large
number of cores, you may want to set this to a lower number to avoid
memory issues.
Returns
-------
list, result of selected best feature set
"""
    now = datetime.datetime.now()
    print("Start time: ", now.strftime('%H:%M:%S'))
    if model == "regression":
        print('\x1b[1;42m', 'Regression', '\x1b[0m')
        y_mlr = lm.LinearRegression()
        e_mlr = lm.LinearRegression()
    else:
        print('\x1b[1;42m', 'Classification', '\x1b[0m')
        y_mlr = SVC(kernel='rbf', C=1, gamma=0.1, random_state=0)
        e_mlr = SVC(kernel='rbf', C=1, gamma=0.1, random_state=0)
    # nc holds the cluster number of each feature; num_clusters is the
    # zero-indexed range of cluster indices used for random selection
    nc = list(cluster_info.values())
    num_clusters = list(range(max(nc)))
    # extract information from the dictionary by inversion:
    # map each cluster number to the list of features it contains
    inv_cluster_info = dict()
    for k, v in cluster_info.items():
        inv_cluster_info.setdefault(v, list()).append(k)
    # an ordered list of the features in each cluster
    cluster = list(dict(sorted(inv_cluster_info.items())).values())
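    # Illustrative example (hypothetical names): if
    #     cluster_info == {'descA': 1, 'descB': 1, 'descC': 2}
    # then inv_cluster_info == {1: ['descA', 'descB'], 2: ['descC']} and
    # cluster == [['descA', 'descB'], ['descC']], so cluster[n] holds the
    # features of cluster n+1.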
    # fill the initial bank with random models;
    # models contain between 1 and `component` features each;
    # ensure the models are not duplicated
    index_sort_bank = set()
    model_bank = []
    for _ in range(bank):
        ini_desc = sorted(random.choice(cluster[random.choice(num_clusters)]) for _ in range(random.randint(1, component)))
        if tuple(ini_desc) not in index_sort_bank:
            index_sort_bank.add(tuple(ini_desc))
            model_bank.append(ini_desc)
    # score each set of features, saving each score and the corresponding feature set
    scoring_bank = list(map(lambda x: [y_mlr.fit(X_data.loc[:, x].values, y_data.values.ravel())
                                            .score(X_data.loc[:, x].values, y_data.values.ravel()), list(x)], model_bank))
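    # Each scoring_bank entry is a [score, feature_name_list] pair, e.g.
    # [0.87, ['descA', 'descC']] (hypothetical values); evolve() and the
    # ranking below both rely on this shape.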
    # create a reflection of the evolution function, carrying the
    # model-appropriate scoring estimator
    evolver = Evolution(Evolution.evolve, e_mlr)
    with mp.Pool(processes=cores) as pool:
        # perform the main learning loop
        for n in range(learning):
            # initialize best score to the worst possible score
            best_score = -float("inf")
            # evolve the bank of models in parallel
            results = pool.starmap(evolver, [(i, cluster_info, cluster, X_data, y_data) for i in scoring_bank])
            # keep only the running-maximum results; e.g. scores arriving as
            # 0.70, 0.65, 0.80 keep the 0.70 and 0.80 entries
            rank_filter = [x for x in results if (best_score := max(best_score, x[0])) == x[0]]
            # merge the survivors into the bank, keeping the best `bank` models
            scoring_bank = sorted(itertools.chain(scoring_bank, rank_filter), reverse=True)[:bank]
            if n % interval == 0 and n != 0:
                tt = datetime.datetime.now()
                print(n, '=>', tt.strftime('%H:%M:%S'), scoring_bank[0])
    # print output and return the best model found during training
    print("Best score: ", scoring_bank[0])
    clulog = [cluster_info[y] for y in scoring_bank[0][1]]
    print("Model's cluster info", clulog)
    fi = datetime.datetime.now()
    fiTime = fi.strftime('%H:%M:%S')
    print("Finish Time : ", fiTime)
    return scoring_bank[0][1]
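
# Minimal usage sketch (hypothetical data; the clustering call follows the
# `cluster_info` description in the selection() docstring, but the exact
# featureCluster constructor arguments are an assumption):
#
#     import pandas as pd
#     from qsarify.clustering import featureCluster
#
#     X_data = pd.read_csv("descriptors.csv")   # hypothetical file, (n_samples, n_features)
#     y_data = pd.read_csv("activity.csv")      # hypothetical file, (n_samples,)
#     cluster_info = featureCluster(X_data).set_cluster()  # hypothetical signature
#     best = selection(X_data, y_data, cluster_info, model="regression",
#                      learning=10000, bank=100, component=4, interval=500)
#     print(best)  # list of selected descriptor names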