Calibration
HydroCNHS equips with a genetic algorithm powered by Distributed Evolutionary Algorithms in Python (DEAP), which can be used for calibration in parallel. The unique feature about HydroCNHS is that both the parameters of the hydrological model and user-defined ABM can be calibrated simutaneously as long as those parameters are defined the model file (.yaml). Furthermore, users are allowed to assign initial guesses to the algorithm. Note that this GA module are not limited to HydroCNHS model.
Genetic algorithm calibration code outline
To use this genetic algorithm (GA) module, three things are required.
The evaluation function
The evaluation function has to follow a certain protocal. Please see the example code below.
GA configuration dictionary
GA configuration dictionary template can be obtained by cali.get_config_template().
Calibration inputs dictionary
The calibration inputs dictionary template can be obtained by cali.get_inputs_template().
Note
Currently, GA module only support calibrating real number parameters.
import HydroCNHS
import HydroCNHS.calibration as cali
# Assisting functions
cali.helper() # Get instructions.
cali.get_config_template()
cali.get_inputs_template()
# Must have individual and info arguments for evaluation function.
def evaluation(individual, info):
# individual: A 1D array of generated parameter set.
# info: The info tuple contains five items such as the calibration
# working directory (cali_wd), current generation number
# (current_generation), the index of the current individual of the
# generation (ith_individual), formtter for the Converter, and a random
# number generator (random_generator).
# Note the random_generator should be used to generate random number to
# ensure the reproducibility. (Do not use np.random directly.)
(cali_wd, current_generation, ith_individual,
formatter, random_generator) = info
# Run model.
# Calculate fitness.
# E.g.,
fitness = sum(individual)
return (fitness,) # Has to be a tuple format.
# GA configuration
config = {'min_or_max': 'max',
'pop_size': 200,
'num_ellite': 1,
'prob_cross': 0.5,
'prob_mut': 0.1,
'stochastic': False,
'max_gen': 100,
'sampling_method': 'LHC',
'drop_record': False,
'paral_cores': -1,
'paral_verbose': 1,
'auto_save': True,
'print_level': 1,
'plot': True}
# GA inputs (Can be generated by the Converter shown in the next section.)
cali_inputs = {"par_name": ["a","b","c"],
"par_bound": [[1,2],[1,2],[1,2]],
"wd": "working directory"}
seed = 5
# Create a random number generator for GA.
rn_gen = HydroCNHS.create_rn_gen(seed) # Optional
# Initialize GA object.
ga = cali.GA_DEAP(evaluation, rn_gen)
# Set up GA object (or load the previously auto-saved pickle file)
ga.set(cali_inputs, config, name="Cali_example")
# Run
ga.run()
# Run the evaluation with the given individual (e.g., the solution).
ga.run_individual(ga.solution)
# Show the result summary.
summary = ga.summary
print(summary)
Converter
Convertor helps user to convert a list of parameter dataframes (can contain nan values) into an 1D array (parameters for calibration, automatically exclude nan values) that can be used for GA calibration. And the formatter created by Convertor can be used to convert 1D array back to a list of original dataframes. The inputs dictionary for GA can also be generated by Convertor. Besides, we provide option for defining fixed parameters, which those parameters will not enter the calibration process (exclude from the 1D array). Note that the dataframe index is parameter names.
import numpy as np
import pandas as pd
import HydroCNHS
import HydroCNHS.calibration as cali
### Prepare testing data.
par_df1 = pd.DataFrame({"Subbasin1": [1000,1000,3], "Subbasin2": [4,5,6]},
index=["a", "b", "c"])
par_df2 = pd.DataFrame({"Agent1": [9,8,7], "Agent2": [6,5,None]},
index=["Par1", "Par2", "Par3"])
bound_df1 = pd.DataFrame({"Subbasin1": [[0,1000],[0,1000],[0,10]], "Subbasin2": [[0,10],[0,10],[0,10]]},
index=["a", "b", "c"])
bound_df2 = pd.DataFrame({"Agent1": [[0,10],[0,10],[0,10]], "Agent2": [[0,10],[0,10],None]},
index=["Par1", "Par2", "Par3"])
df_list = [par_df1, par_df2]
par_bound_df_list = [bound_df1, bound_df2]
### Create a object called Converter.
converter = cali.Convertor()
### Generate GA inputs with fixed a & b parameters for Subbasin1.
fixed_par_list = [[(["a","b"], ["Subbasin1"])],[]]
cali_inputs = converter.gen_cali_inputs(
"working directory", df_list, par_bound_df_list, fixed_par_list)
### Get formatter
formatter = converter.formatter
### Show cali_inputs
print(cali_inputs)
r"""
print(cali_inputs)
{'wd': 'working directory',
'par_name': ['a|Subbasin2', 'b|Subbasin2', 'c|Subbasin1', 'c|Subbasin2',
'Par1|Agent1', 'Par1|Agent2', 'Par2|Agent1', 'Par2|Agent2',
'Par3|Agent1'],
'par_bound': [[0, 10], [0, 10], [0, 10], [0, 10], [0, 10], [0, 10], [0, 10],
[0, 10], [0, 10]]}
"""
### to 1D array
converter.to_1D_array(df_list, formatter)
r"""
# Out[31]: array([4., 5., 3., 6., 9., 6., 8., 5., 7.])
# Note the order of the array corresponds to "par_name" in the cali_inputs.
"""
### to df_list
var_array = np.array([5]*9)
converter.to_df_list(var_array, formatter)
r"""
Out[46]:
[ Subbasin1 Subbasin2
a 1000.0 5.0
b 1000.0 5.0
c 5.0 5.0,
Agent1 Agent2
Par1 5.0 5.0
Par2 5.0 5.0
Par3 5.0 NaN]
"""
TRB calibration example
import os
import pandas as pd
from copy import deepcopy
import pickle
import HydroCNHS
import HydroCNHS.calibration as cali
##### Path and Load Model Test
# Get this file directory.
prj_path, this_filename = os.path.split(__file__)
model_path = os.path.join(prj_path, "Template_for_calibration", "TRB_dm_gwlf.yaml")
bound_path = os.path.join(prj_path, "ParBound")
wd = prj_path
# Update model paths.
model_dict = HydroCNHS.load_model(model_path)
model_dict["Path"]["WD"] = wd
model_dict["Path"]["Modules"] = os.path.join(prj_path, "ABM_modules")
##### Gen cali information
# Convert parameter sections in the model file (i.e., model_dict) to a list of
# dataframes (df_list).
df_list, df_name = HydroCNHS.write_model_to_df(model_dict, key_option=["Pars"])
# Load the parameter bounds. The order of the list is corresponding to df_list.
par_bound_df_list = [
pd.read_csv(os.path.join(bound_path, "gwlf_par_bound.csv"), index_col=[0]),
pd.read_csv(os.path.join(bound_path, "routing_par_bound.csv"), index_col=[0]),
pd.read_csv(os.path.join(bound_path, "abm_par_bound_dm.csv"), index_col=[0])]
# Initialize Convertor.
converter = cali.Convertor()
cali_inputs = converter.gen_cali_inputs(wd, df_list, par_bound_df_list)
formatter = converter.formatter
# Load inputs from pickle file.
with open(os.path.join(prj_path, "Inputs", "TRB_inputs.pickle"), "rb") as file:
(temp, prec, pet, obv_D, obv_M, obv_Y) = pickle.load(file)
#%%
# =============================================================================
# Calibration
# =============================================================================
def cal_batch_indicator(period, target, df_obv, df_sim):
"""Compute mean indicator over targets"""
df_obv = df_obv[period[0]:period[1]]
df_sim = df_sim[period[0]:period[1]]
Indicator = HydroCNHS.Indicator()
df = pd.DataFrame()
for item in target:
df_i = Indicator.cal_indicator_df(df_obv[item], df_sim[item],
index_name=item)
df = pd.concat([df, df_i], axis=0)
df_mean = pd.DataFrame(df.mean(axis=0), columns=["Mean"]).T
df = pd.concat([df, df_mean], axis=0)
return df
def evaluation(individual, info):
cali_wd, current_generation, ith_individual, formatter, _ = info
name = "{}-{}".format(current_generation, ith_individual)
##### individual -> model
# Convert 1D array to a list of dataframes.
df_list = cali.Convertor.to_df_list(individual, formatter)
# Feed dataframes in df_list to model dictionary.
model = deepcopy(model_dict)
for i, df in enumerate(df_list):
s = df_name[i].split("_")[0]
model = HydroCNHS.load_df_to_model_dict(model, df, s, "Pars")
##### Run simuluation
model = HydroCNHS.Model(model, name)
Q = model.run(temp, prec, pet)
##### Get simulation data
# Streamflow of routing outlets.
cali_target = ["DLLO", "WSLO"]
cali_period = ("1981-1-1", "2005-12-31")
vali_period = ("2006-1-1", "2013-12-31")
sim_Q_D = pd.DataFrame(Q, index=model.pd_date_index)[cali_target]
# Agents' outputs stored in the data_collector.
sim_Q_D["DivAgt"] = model.data_collector.DivAgt["Diversion"]
sim_Q_D["ResAgt"] = model.data_collector.ResAgt["Release"]
cali_target += ["DivAgt", "ResAgt"]
# Resample the daily simulation output to monthly and annually outputs.
sim_Q_M = sim_Q_D[cali_target].resample("MS").mean()
sim_Q_Y = sim_Q_D[cali_target].resample("YS").mean()
df_cali_Q_D = cal_batch_indicator(cali_period, cali_target, obv_D, sim_Q_D)
df_cali_Q_M = cal_batch_indicator(cali_period, cali_target, obv_M, sim_Q_M)
df_cali_Q_Y = cal_batch_indicator(cali_period, cali_target, obv_Y, sim_Q_Y)
df_vali_Q_D = cal_batch_indicator(vali_period, cali_target, obv_D, sim_Q_D)
df_vali_Q_M = cal_batch_indicator(vali_period, cali_target, obv_M, sim_Q_M)
df_vali_Q_Y = cal_batch_indicator(vali_period, cali_target, obv_Y, sim_Q_Y)
##### Save output.txt
# Only exercute when ga.run_individual(ga.solution)
if current_generation == "best":
with open(os.path.join(cali_wd, "cali_indiv_" + name + ".txt"), 'w') as f:
f.write("Annual cali/vali result\n")
f.write(df_cali_Q_Y.round(3).to_csv(sep='\t').replace("\n", ""))
f.write("\n")
f.write(df_vali_Q_Y.round(3).to_csv(sep='\t').replace("\n", ""))
f.write("\n\nMonthly cali/vali result\n")
f.write(df_cali_Q_M.round(3).to_csv(sep='\t').replace("\n", ""))
f.write("\n")
f.write(df_vali_Q_M.round(3).to_csv(sep='\t').replace("\n", ""))
f.write("\n\nDaily cali/vali result\n")
f.write(df_cali_Q_D.round(3).to_csv(sep='\t').replace("\n", ""))
f.write("\n")
f.write(df_vali_Q_D.round(3).to_csv(sep='\t').replace("\n", ""))
f.write("\n=========================================================\n")
f.write("Sol:\n" )
df = pd.DataFrame(individual, index=cali_inputs["par_name"]).round(4)
f.write(df.to_string(header=False, index=True))
fitness = df_cali_Q_M.loc["Mean", "KGE"]
return (fitness,)
# cali.helper()
# cali.get_config_template()
# cali.get_inputs_template()
config = {'min_or_max': 'max',
'pop_size': 200,
'num_ellite': 1,
'prob_cross': 0.5,
'prob_mut': 0.1,
'stochastic': False,
'max_gen': 100,
'sampling_method': 'LHC',
'drop_record': False,
'paral_cores': -1,
'paral_verbose': 1,
'auto_save': True,
'print_level': 1,
'plot': True}
# Calibrate with 3 seeds.
seeds = [5,10,13]
for seed in seeds:
rn_gen = HydroCNHS.create_rn_gen(seed)
ga = cali.GA_DEAP(evaluation, rn_gen)
ga.set(cali_inputs, config, formatter, name="Cali_gwlf_abm_KGE_{}".format(seed))
ga.run()
ga.run_individual(ga.solution) # Output performance (.txt) of solution.
##### Output the calibrated model.
individual = ga.solution
df_list = cali.Convertor.to_df_list(individual, formatter)
model_best = deepcopy(model_dict)
for i, df in enumerate(df_list):
s = df_name[i].split("_")[0]
model = HydroCNHS.load_df_to_model_dict(model_best, df, s, "Pars")
HydroCNHS.write_model(model_best, os.path.join(ga.cali_wd, "Best_gwlf_abm_KGE.yaml"))
summary = ga.summary