import joblib
import warnings
import math
import numpy as np
import pandas as pd
import metropolis_hastings as mh
from datetime import datetime
from joblib import Parallel, delayed, cpu_count
from helper_func import schedule_to_pandas
from activity import Schedule
from settings import DESIRED_TIMES, DEFAULT_MODES, DEFAULT_MH_PARAMS, DEFAULT_OPERATORS, DEFAULT_VARIABLES, DEFAULT_ACTIVITIES, DEFAULT_P_OPERATORS
from typing import List, Dict, Tuple, Optional, Union
warnings.filterwarnings('once')
class ChoiceSetGenerator:
"""
This class is used to generate choice sets of Schedule objects for given individuals.
Attributes:
-------------------
- schedules: List of Schedule objects
    - params_file: location of the file containing parameters for the target distribution
- n_alt: number of alternatives in the choice set
- mh_params: dictionary containing parameters for the random walk
- activities: list of activities
- operators: list of operators
- p_operators: probabilities of operators
- modes: list of modes
- locations: list of locations
- variables: list of variables for target distribution
- outfile: location of file to save result
Methods:
------------------
    - generate_set: Generates the choice set for a given individual.
    - run: Runs the Metropolis-Hastings algorithm for the full dataset.
    - run_parallel: Runs the Metropolis-Hastings algorithm for the full dataset, using parallel processing.
    - compute_sample_correction: Returns the corrective term for the utility function.
    - train_test_sets: Creates train and test DataFrames to use in Biogeme.
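    Example:
    ------------------
    Hypothetical usage (file names and parameter values are illustrative)::

        generator = ChoiceSetGenerator(schedules, params_file='params.json', n_alt=10)
        generator.run_parallel(n_cpus=4)
        df_wide, df_long, test_set = generator.train_test_sets(k=1, train_ratio=0.7)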
"""
    def __init__(self, schedules: List, params_file: str, n_alt: int = 10, mh_params: Dict = DEFAULT_MH_PARAMS,
                 activities: Optional[List] = DEFAULT_ACTIVITIES, operators: Optional[List] = DEFAULT_OPERATORS,
                 proba_operators: Optional[List] = DEFAULT_P_OPERATORS, modes: Optional[List] = DEFAULT_MODES,
                 locations: Optional[List] = None, variables: List = DEFAULT_VARIABLES,
                 outfile: str = 'choice_set.joblib', seed: int = 42, **kwargs):
self.schedules = schedules
self.params_file = params_file
self.n_alt = n_alt
self.mh_params = mh_params
self.activities = activities
self.operators = operators
self.p_operators = proba_operators
self.modes = modes
self.outfile = outfile
self.locations = locations
self.variables = variables
        self.choice_sets = []
        self.accepted_operators = []
        self.acceptance_probas = []
        self.sample_corrections = []
        # The seed controls the sampling of train individuals and alternatives in train_test_sets
        np.random.seed(seed)
    def generate_set(self, schedule: Schedule) -> Tuple[List, List, List]:
"""
Generates choice set for a given individual.
Parameters
----------
- schedule : Schedule object
Returns
----------
        The choice set, the list of accepted operators and the acceptance probabilities.
"""
choice_set = [schedule]
list_op = []
probas = []
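        # Burn-in and thinning for the Metropolis-Hastings walk: the first n_burn states
        # are discarded, and only every n_skip-th state afterwards is kept, to reduce
        # autocorrelation between the draws retained in the choice set.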
n_skip = self.mh_params["n_skip"]
n_burn = self.mh_params["n_burn"]
if not self.locations:
all_locations = schedule.all_locations
else:
all_locations = self.locations
        steps = mh.random_walk(init_sched=schedule, operators=self.operators, p_operators=self.p_operators,
                               list_act=self.activities, list_loc=all_locations, list_modes=self.modes,
                               param_file=self.params_file, **self.mh_params)
n = 0
for state, op, pb in steps:
if (n>n_burn) and (n%n_skip == 0):
optype = op.optype
if optype == "MetaOperator":
optype = op.meta_type
list_op.append(optype)
choice_set.append(state)
probas.append(pb)
n += 1
return choice_set, list_op, probas
    def run(self) -> None:
"""
        Runs the Metropolis-Hastings algorithm for the full dataset and saves the choice sets, accepted operators and acceptance probabilities to file.
"""
start = datetime.now()
for i, schedule in enumerate(self.schedules):
print(f"Starting generation for individual {i}.\n")
choice_set, list_op, probas = self.generate_set(schedule)
self.choice_sets.append(choice_set)
self.accepted_operators.append(list_op)
self.acceptance_probas.append(probas)
end = datetime.now()
print(f"Total runtime: {end-start}")
joblib.dump([self.choice_sets,self.accepted_operators, self.acceptance_probas], self.outfile)
    def run_parallel(self, n_cpus: Optional[int] = None, verbose: int = 5) -> None:
"""
        Runs the Metropolis-Hastings algorithm for the full dataset using parallel processing. Saves the choice sets, accepted operators and acceptance probabilities to file.
Parameters
----------
        n_cpus: number of CPUs to use for the parallel process.
        verbose: frequency of progress outputs.
"""
if not n_cpus:
n_cpus = cpu_count()
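        # joblib returns the results in the order of self.schedules, regardless of which
        # worker finishes first, so the per-individual outputs stay aligned.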
delayed_output = [delayed(self.generate_set)(s) for s in self.schedules]
results = Parallel(n_jobs=n_cpus, verbose=verbose)(delayed_output)
        self.choice_sets, self.accepted_operators, self.acceptance_probas = map(list, zip(*results))
        joblib.dump([self.choice_sets, self.accepted_operators, self.acceptance_probas], self.outfile)
    def compute_sample_correction(self, original_probas: List, unique_probas: List, k: int = 1) -> List:
"""
Returns the corrective term for the utility function, to estimate the model on the sampled choice set passed as input.
See Ben-Akiva & Lerman (1985) "Discrete choice analysis", p.266
Parameters
---------------
        - original_probas: list (one per individual) of probabilities for the alternatives in the choice set (including duplicates)
        - unique_probas: list (one per individual) of probabilities for the unique alternatives in the choice set (excluding duplicates)
        - k: proportionality constant
"""
orig_len = [len(x) for x in original_probas]
new_len = [len(x) for x in unique_probas]
#Computing sampling proba - see Ben-Akiva and Lerman (1985) p. 266
diff_len = [i-j+1 for i, j in zip(orig_len, new_len)]
        q = [np.log(k) + sum(unique_probas[i]) + diff_len[i]*np.log(sum(np.exp(unique_probas[i]))) for i in range(len(unique_probas))]
        # Subtract each alternative's probability from its individual's corrective term
        sample_correction = [qi - np.array(probas) for qi, probas in zip(q, unique_probas)]
        return sample_correction
    def train_test_sets(self, k: int = 1, train_ratio: float = 0.7) -> Tuple[pd.DataFrame, pd.DataFrame, List]:
"""
        Creates train and test DataFrames to use in Biogeme.
Parameters:
--------------
        - k: proportionality constant for the sample correction
        - train_ratio: train/test split ratio (default: 70% of observations are used for the train set)
        Returns:
        ---------------
        - Train dataset in wide format, train dataset in long format, and the test dataset.
"""
draws_proba = [[1] for x in self.choice_sets]
        for i, proba in enumerate(draws_proba):
            proba.extend(self.acceptance_probas[i])
#Check unique alternatives (to compute sample probability)
        unique_id_draws = [[idx for idx, item in enumerate(choice_set) if item not in choice_set[:idx]]
                           for choice_set in self.choice_sets]
        unique_draws = []
        unique_draws_proba = [[1] for x in unique_id_draws]
        for n, ids in enumerate(unique_id_draws):
            # acceptance_probas is offset by one relative to the choice set, whose index 0 is the observed schedule
            list_probas = [self.acceptance_probas[n][i-1] for i in ids[1:]]
            unique_draws_proba[n].extend(list_probas)
            unique_draws.append([self.choice_sets[n][i] for i in ids])
self.sample_corrections = self.compute_sample_correction(draws_proba, unique_draws_proba,k)
#Check how many choice sets have enough unique alternatives for the estimation
valid_ids = [i for i, elem in enumerate(unique_draws) if len(elem) >= self.n_alt]
other_ids = [i for i, _ in enumerate(unique_draws) if i not in valid_ids]
n_train = math.ceil(train_ratio*len(self.choice_sets))
        if len(valid_ids) >= n_train:
            train_ids = list(np.random.choice(valid_ids, n_train, replace=False))
            test_ids = other_ids + [i for i in valid_ids if i not in train_ids]
        else:
            train_ids = valid_ids + other_ids
            test_ids = None
            warnings.warn("The train/test ratio could not be satisfied with the requested choice set size. "
                          "Consider a lower number of alternatives or increasing the number of draws for the random walk.")
        formatted_train = []
        formatted_test = []
        train_probs = []
        for t in train_ids:
            # Sample alternative indices (index 0 is the chosen alternative) so that the
            # sample corrections stay aligned with the sampled alternatives
            sampled_ids = [0]
            sampled_ids.extend(np.random.choice(range(1, len(unique_draws[t])), self.n_alt-1))
            formatted_train.append([schedule_to_pandas(unique_draws[t][i]) for i in sampled_ids])
            train_probs.append([self.sample_corrections[t][i] for i in sampled_ids])
        if test_ids:
            for t in test_ids:
                formatted_test.append([schedule_to_pandas(sched) for sched in unique_draws[t]])
        # Add an alternative ID and the sampling correction to each alternative, then concatenate each individual's alternatives into a single dataframe
for j, list_cs in enumerate(formatted_train):
for i, cs in enumerate(list_cs):
cs['alt_id'] = i
cs['prob_corr'] = train_probs[j][i]
lng_cs = [pd.concat(formatted) for formatted in formatted_train]
        # Add individual IDs and mark the chosen alternative (alt_id 0 by construction)
        for i, cs in enumerate(lng_cs):
            cs.reset_index(drop=True, inplace=True)
            cs['obs_id'] = i
            cs['choice'] = cs['alt_id'].apply(lambda x: 1 if x == 0 else 0)
#Create choice sets in long format
db_long = pd.concat(lng_cs).reset_index(drop = True)
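        # Aggregate repeated episodes of the same activity within an alternative:
        # keep the earliest start time and sum the durations.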
        df_activities = db_long.groupby(['obs_id', 'alt_id', 'act_label']).agg(
            start_time=('start_time', 'min'), duration=('duration', 'sum'),
            choice=('choice', 'mean'), prob_corr=('prob_corr', 'mean')).reset_index()
df_activities['participation'] = 1
df_long = df_activities[['obs_id','alt_id','choice', 'prob_corr']].drop_duplicates()
for at in self.activities:
            df_long = df_long.merge(
                df_activities[df_activities.act_label == at]
                .drop(['choice', 'act_label', 'prob_corr'], axis=1)
                .rename(columns={v: f'{at}:{v}' for v in self.variables}),
                how='left', on=['obs_id', 'alt_id'])
df_long.fillna(0,inplace=True)
for at in self.activities:
if at not in ['home', 'dawn', 'dusk']:
desired_st = DESIRED_TIMES[at]['desired_start_time']
desired_dur = DESIRED_TIMES[at]['desired_duration']
else:
desired_st = 0
desired_dur = 0
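            # Start-time deviations are interpreted on a 24 h clock: e.g. a deviation of
            # +20 h wraps around and counts as being 4 h early rather than 20 h late.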
st_diff = (df_long[f'{at}:start_time'] - desired_st) * df_long[f'{at}:participation']
df_long[f'{at}:early'] = ((st_diff>=-12) & (st_diff<=0))*(-st_diff) + ((st_diff>=12) & (st_diff<=24))*(24-st_diff)
df_long[f'{at}:late'] = ((st_diff>=0)&(st_diff<12))*(st_diff) + ((st_diff>=-24) & (st_diff<-12))*(24+st_diff)
d_diff = (df_long[f'{at}:duration'] - desired_dur)* df_long[f'{at}:participation']
            df_long[f'{at}:short'] = (d_diff<=0)*(-d_diff) + 0  # Add zero to turn any -0.0 into 0.0
            df_long[f'{at}:long'] = (d_diff>=0)*d_diff + 0  # Add zero to turn any -0.0 into 0.0
        # Add zero to turn any remaining -0.0 into 0.0 in the float columns
        df_long = df_long.apply(lambda x: (x + 0) if (x.dtypes == 'float64') else x, axis=0)
#Convert from long to wide
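        # Wide format: one row per observation, with each alternative's attributes stored
        # in columns suffixed by the alternative id (e.g. a hypothetical 'work:duration'
        # column becomes 'work:duration_3' for alternative 3).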
df_wide = pd.DataFrame(df_long['obs_id'].drop_duplicates())
df_wide['choice'] = 0
        for i in list(df_long.alt_id.unique()):
            tmp = df_long[df_long.alt_id == i].rename(columns={c: f'{c}_{i}' for c in df_long.columns if c not in ['obs_id', 'alt_id', 'choice']})
            df_wide = df_wide.merge(tmp.drop(['alt_id', 'choice'], axis=1), how='left', on=['obs_id'])
            # Record the id of the chosen alternative (0 by construction, since alt_id 0 is the observed schedule)
            df_wide.loc[df_wide['obs_id'].isin(tmp.loc[tmp['choice'] == 1, 'obs_id']), 'choice'] = i
df_wide.dropna(axis = 0, inplace = True)
#Delete temporary dataframes from namespace
del db_long, tmp, df_activities
return df_wide, df_long, formatted_test