# Source code for SISSOkit.cross_validation

import os
import shutil
import random
import math
import string
import sys
import json
import re

import numpy as np
import pandas as pd





def kfold(current_path, target_path, property_name, num_fold):
    r"""Generates K-fold cross validation files for SISSO.

    ``current_path`` must contain at least the two files necessary for
    SISSO: ``SISSO.in`` and ``train.dat``.  All the arguments in
    ``SISSO.in`` remain the same in the CV files except ``nsample``,
    which is rewritten to match the per-fold ``train.dat``.

    This function generates a directory ``<property_name>_cv`` under
    ``target_path`` containing ``num_fold`` CV directories and a
    ``cross_validation_info.dat`` with the CV type (i.e. K-fold) and the
    shuffle list.  Each CV directory contains a reduced ``train.dat``, a
    ``validation.dat`` with the samples left out of ``train.dat``, and a
    ``shuffle.dat`` recording sample indices and counts for both files —
    the indices are 1-based indices into the original ``train.dat``,
    i.e. into the whole data set.  You can directly run SISSO on these
    CV files if the original ``SISSO.in`` is correctly set.

    Arguments:
        current_path (string): path to SISSO files on which you want to
            do cross validation; must contain ``SISSO.in`` and ``train.dat``.
        target_path (string): path to the newly generated CV directory.
        property_name (string): name of the property you want to predict.
        num_fold (int): K of K-fold cross validation.
    """
    with open(os.path.join(current_path, 'SISSO.in'), 'r') as f:
        input_file = f.read()
    task_number = int(re.findall(r'ntask\s*=\s*(\d+)', input_file)[0])
    samples_number = re.findall(r'nsample\s*=\s*([\d,]+)', input_file)[0]
    samples_number = list(map(int, re.split(r'[, ]+', samples_number)))
    data_total = pd.read_csv(os.path.join(current_path, 'train.dat'), sep=r'\s+')

    # Build 1-based sample indices for each task, then shuffle each task
    # independently; fold boundaries are cut from the shuffled lists.
    data_list = []
    start = 1
    for sample_num in samples_number:
        data_list.append(list(range(start, start + sample_num)))
        start += sample_num
    for task in range(task_number):
        random.shuffle(data_list[task])
    batch_size = [int(math.ceil(x / num_fold)) for x in samples_number]

    cv_root = os.path.join(target_path, '%s_cv' % property_name)
    if os.path.exists(cv_root):
        print('Directory already exists.\nDo you want to remove the directory?')
        a = input('y|n\n')
        if a == 'y':
            shutil.rmtree(cv_root)
        else:
            print('Please input a new target path!')
            return None
    # BUG FIX: the original created cv_root inside a ``finally`` clause, so
    # it ran even after the user answered 'n' — raising FileExistsError on
    # the still-existing directory and masking the intended early return.
    # makedirs also creates target_path itself when it does not exist yet.
    os.makedirs(cv_root)
    target_path = cv_root

    data_total.to_csv(os.path.join(target_path, 'train.dat'), index=False, sep=' ')
    with open(os.path.join(target_path, 'cross_validation_info.dat'), 'w') as f:
        json.dump({'cross_validation_type': '%d-fold' % num_fold,
                   'shuffle_data_list': data_list}, f)

    for i in range(num_fold):
        cv_dir = os.path.join(target_path, property_name + '_cv%d' % i)
        try:
            shutil.copytree(current_path, cv_dir)
        except FileExistsError:
            shutil.rmtree(cv_dir)
            shutil.copytree(current_path, cv_dir)

        val_list = []
        train_list = []
        for task in range(task_number):
            train_list_t = []
            if batch_size[task] * i < samples_number[task]:
                for j in range(num_fold):
                    if batch_size[task] * j >= samples_number[task]:
                        break
                    chunk = data_list[task][batch_size[task] * j:
                                            min(batch_size[task] * (j + 1),
                                                samples_number[task])]
                    if i == j:
                        val_list.append(chunk)
                    else:
                        train_list_t.append(chunk)
            else:
                # This fold index is past the last batch of this task:
                # all of the task's samples go to training, none to validation.
                val_list.append([])
                train_list_t.append(data_list[task])
            train_list.append(np.hstack(train_list_t).astype(int).tolist())

        train_len = list(map(len, train_list))
        val_len = list(map(len, val_list))
        with open(os.path.join(cv_dir, 'shuffle.dat'), 'w') as f:
            json.dump({'training_list': train_list,
                       'training_samples_number': train_len,
                       'validation_list': val_list,
                       'validation_samples_number': val_len}, f)

        # np.hstack over lists that may include empty sub-lists yields a
        # float array; cast to int so iloc accepts the 0-based positions.
        train_idx = np.hstack(train_list).astype(int) - 1
        val_idx = np.hstack(val_list).astype(int) - 1
        data_total.iloc[train_idx].to_csv(os.path.join(cv_dir, 'train.dat'),
                                          index=False, sep=' ')
        data_total.iloc[val_idx].to_csv(os.path.join(cv_dir, 'validation.dat'),
                                        index=False, sep=' ')

        # Rewrite the nsample line so it matches the per-fold train.dat.
        with open(os.path.join(cv_dir, 'SISSO.in'), 'r') as f:
            lines = f.readlines()
        for j in range(len(lines)):
            if lines[j].startswith('nsample'):
                lines[j] = ('nsample=%s' % (str(train_len).strip('[]'))
                            + '\t! number of samples for each task (seperate the numbers by comma for ntask >1)\n')
        with open(os.path.join(cv_dir, 'SISSO.in'), 'w') as f:
            f.writelines(lines)
def leave_out(current_path, target_path, property_name, num_iter, frac=0, num_out=0):
    r"""Generates leave-N-out cross validation files for SISSO.

    ``current_path`` must contain at least the two files necessary for
    SISSO: ``SISSO.in`` and ``train.dat``.  All the arguments in
    ``SISSO.in`` remain the same in the CV files except ``nsample``,
    which is rewritten to match the per-iteration ``train.dat``.

    This function generates a directory ``<property_name>_cv`` under
    ``target_path`` containing ``num_iter`` CV directories and a
    ``cross_validation_info.dat`` with the CV type (i.e. leave-out) and
    the iteration count.  Each CV directory contains a reduced
    ``train.dat``, a ``validation.dat`` with the samples left out of
    ``train.dat``, and a ``shuffle.dat`` recording sample indices and
    counts for both files — the indices are 1-based indices into the
    original ``train.dat``, i.e. into the whole data set.  You can
    directly run SISSO on these CV files if the original ``SISSO.in`` is
    correctly set.

    Arguments:
        current_path (string): path to SISSO files on which you want to
            do cross validation; must contain ``SISSO.in`` and ``train.dat``.
        target_path (string): path to the newly generated CV directory.
        property_name (string): name of the property you want to predict.
        num_iter (int): the number of cross validation files.
        frac (float): the fraction of left-out samples.  Pass only one of
            ``frac`` or ``num_out``.
        num_out (int): the number of left-out samples.  Pass only one of
            ``frac`` or ``num_out``.
    """
    if num_out and frac:
        print("Please input one of num_out and frac!")
        return None
    with open(os.path.join(current_path, 'SISSO.in'), 'r') as f:
        input_file = f.read()
    task_number = int(re.findall(r'ntask\s*=\s*(\d+)', input_file)[0])
    samples_number = re.findall(r'nsample\s*=\s*([\d,]+)', input_file)[0]
    samples_number = list(map(int, re.split(r'[, ]+', samples_number)))
    data_total = pd.read_csv(os.path.join(current_path, 'train.dat'), sep=r'\s+')

    # 1-based sample indices per task; left-out samples are drawn from
    # these with random.sample on every iteration.
    data_list = []
    start = 1
    for sample_num in samples_number:
        data_list.append(list(range(start, start + sample_num)))
        start += sample_num

    cv_root = os.path.join(target_path, '%s_cv' % property_name)
    if os.path.exists(cv_root):
        print('Directory already exists.\nDo you want to remove the directory?')
        a = input('y|n\n')
        if a == 'y':
            shutil.rmtree(cv_root)
        else:
            print('Please input a new target path!')
            return None
    # BUG FIX: the original created cv_root inside a ``finally`` clause, so
    # it ran even after the user answered 'n' — raising FileExistsError on
    # the still-existing directory and masking the intended early return.
    os.makedirs(cv_root)
    target_path = cv_root

    data_total.to_csv(os.path.join(target_path, 'train.dat'), index=False, sep=' ')
    with open(os.path.join(target_path, 'cross_validation_info.dat'), 'w') as f:
        if num_out:
            json.dump({'cross_validation_type': 'leave-%d-out' % num_out,
                       'iteration_times': num_iter}, f)
        else:
            json.dump({'cross_validation_type': 'leave-%d%%-out' % int(frac * 100),
                       'iteration_times': num_iter}, f)

    # BUG FIX: the original reset ``num_out`` to [] BEFORE the
    # ``if num_out: frac = num_out / total`` step, so calling with
    # ``num_out`` never derived ``frac`` and the per-task count list
    # stayed empty, crashing later in random.sample with an IndexError.
    total_samples_number = sum(samples_number)
    if num_out:
        frac = num_out / total_samples_number
    # Per-task left-out counts, proportional to each task's sample count.
    num_out = [round(samples_number[task] * frac) for task in range(task_number)]

    for i in range(num_iter):
        cv_dir = os.path.join(target_path, property_name + '_cv%d' % i)
        try:
            shutil.copytree(current_path, cv_dir)
        except FileExistsError:
            shutil.rmtree(cv_dir)
            shutil.copytree(current_path, cv_dir)

        val_list = [random.sample(data_list[task], num_out[task])
                    for task in range(task_number)]
        train_list = [[x for x in data_list[task] if x not in val_list[task]]
                      for task in range(task_number)]
        train_len = list(map(len, train_list))
        val_len = list(map(len, val_list))

        # np.hstack over lists that may include empty sub-lists yields a
        # float array; cast to int so iloc accepts the 0-based positions.
        train_idx = np.hstack(train_list).astype(int) - 1
        val_idx = np.hstack(val_list).astype(int) - 1
        data_total.iloc[train_idx].to_csv(os.path.join(cv_dir, 'train.dat'),
                                          index=False, sep=' ')
        data_total.iloc[val_idx].to_csv(os.path.join(cv_dir, 'validation.dat'),
                                        index=False, sep=' ')
        with open(os.path.join(cv_dir, 'shuffle.dat'), 'w') as f:
            json.dump({'training_list': train_list,
                       'training_samples_number': train_len,
                       'validation_list': val_list,
                       'validation_samples_number': val_len}, f)

        # Rewrite the nsample line so it matches the per-iteration train.dat.
        with open(os.path.join(cv_dir, 'SISSO.in'), 'r') as f:
            lines = f.readlines()
        for j in range(len(lines)):
            if lines[j].startswith('nsample'):
                lines[j] = ('nsample=%s' % (str(train_len).strip('[]'))
                            + '\t! number of samples for each task (seperate the numbers by comma for ntask >1)\n')
        with open(os.path.join(cv_dir, 'SISSO.in'), 'w') as f:
            f.writelines(lines)