#import sys
#sys.path.append('../')
from bayesim.pmf import Pmf
import bayesim.params as pm
from bayesim.utils import calc_deltas, get_closest_val
import pandas as pd
import bayesim.hdf5io as dd
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
import random
import os
from joblib import Parallel, delayed, cpu_count
import time
import platform
class Model(object):
"""
The main workhorse class of bayesim. Stores the modeled and observed data as well as a Pmf object which maintains the current probability distribution and grid subdivisions.
Attributes:
[update this]
"""
def __init__(self,**argv):
"""
Initialize with a uniform PMF over the fitting parameters.
Args:
obs_data_path (`str`): path to HDF5 file containing measured data
load_state (`bool`): flag for whether to load state from a file - if True, other inputs (apart from state_file) are ignored
state_file (`str`): path to file saved by save_state() fcn
verbose (`bool`): flag for verbosity, defaults to False
output_var (`str`): name of experimental output measurements
params (:obj:`param_list`): Param_list object containing parameters to be fit and associated metadata
ec_x_var (`str`): EC to plot on x-axis and consider in trimming data
ec_list (:obj:`list` of :obj:`str`): names of experimental conditions
ec_tols (`dict`): dict of form {ec_name_1:tolerance_1, ec_name_2:tolerance_2, ...}, will supersede ec_list
ec_units (`dict`): dict of form {ec_name_1:units_1, ec_name_2:units_2, ...}, optional
model_data_path (`str`): path to HDF5 file containing modeled data
model_data_func (callable): handle to function for computing model data
"""
verbose = argv.get('verbose', False)
state_file = argv.get('state_file', 'bayesim_state.h5')
load_state = argv.get('load_state', False)
if load_state:
if verbose:
print('Loading bayesim state from %s...\n'%state_file)
state = dd.load(state_file)
# variables
self.ec_pts = state['ec_pts']
self.output_var = state['output_var']
self.params = pm.Param_list(param_dict=state['params'])
# probabilities
self.probs = Pmf(prob_dict=state['probs'])
# model
self.model_data = state['model_data']
self.model_data_grps = state['model_data_grps']
self.model_data_ecgrps = state['model_data_ecgrps']
self.needs_new_model_data = state['needs_new_model_data']
self.obs_data = state['obs_data']
self.is_run = state['is_run']
else:
if verbose:
print('Constructing bayesim model object...\n')
# initialize empty parameter list
self.params = pm.Param_list()
# if a param_list object has been provided, attach it
if 'params' in argv.keys():
self.attach_params(argv['params'])
else:
self.probs = Pmf()
# check if output list is populated / attach output variable
output_list = self.params.output
if len(output_list)>0: # outputs are populated
if len(output_list)>1 or ('output_var' in argv.keys() and not output_list[0].name==argv['output_var']):
raise ValueError('It seems you are trying to add more than one output variable. Sorry, the current version of bayesim supports only one type of output - this will be addressed in future versions!')
else:
self.output_var = output_list[0].name # for now
elif 'output_var' in argv.keys():
self.params.add_output(name=argv['output_var'])
self.output_var = argv['output_var'] # for now...eventually need to be general to multiple potential outputs
else:
raise ValueError("You need to define your output variable!")
# attach EC's if provided explicitly, or infer from obs_data
if len(set(['ec_list','ec_tols','ec_units']).intersection(set(argv.keys())))>0:
self.attach_ecs(**argv)
# set x-axis EC variable
if 'ec_x_var' in argv.keys():
self.params.set_ec_x(argv['ec_x_var'])
# attach observed data if provided
if 'obs_data_path' in argv.keys():
self.attach_observations(**argv)
else:
self.ec_pts = pd.DataFrame()
self.obs_data = pd.DataFrame()
# attach modeled data if provided
if 'model_data_path' in argv.keys():
self.attach_model(mode='file', **argv)
elif 'model_data_func' in argv.keys():
self.attach_model(mode='function', **argv)
else:
self.model_data = pd.DataFrame()
self.model_data_grps = []
self.model_data_ecgrps = []
self.needs_new_model_data = True
# run flag
self.is_run = False
def attach_params(self, params):
"""Attach a param_list object."""
if not self.params.is_empty():
print('Overwriting preexisting parameter list with this new one.\n')
self.params = params
self.probs = Pmf(params=params.fit_params)
def attach_ecs(self, **argv):
"""
Define parameters for experimental conditions.
Args:
ec_list (:obj:`list` of :obj:`str`): names of experimental conditions
ec_tols (`dict`): dict of form {ec_name_1:tolerance_1, ec_name_2:tolerance_2, ...}, will supersede ec_list
ec_units (`dict`): dict of form {ec_name_1:units_1, ec_name_2:units_2, ...}, optional
"""
ec_names = []
tol_dict = {}
unit_dict = {}
if 'ec_tols' in argv.keys():
tol_dict = argv['ec_tols']
ec_names = list(tol_dict.keys())
if 'ec_units' in argv.keys():
unit_dict = argv['ec_units']
if ec_names==[]:
ec_names = list(unit_dict.keys())
if 'ec_list' in argv.keys():
if ec_names==[]:
ec_names = argv['ec_list']
for name in ec_names:
args = {'name':name}
if name in tol_dict.keys():
args['tolerance'] = tol_dict[name]
if name in unit_dict.keys():
args['units'] = unit_dict[name]
self.params.add_ec(**args)
def attach_fit_params(self,params):
"""
Attach list of parameters to fit.
Args:
params (:obj:`list` of :obj:`Fit_param`): list of Fit_param objects to attach
"""
for param in params:
self.params.add_fit_param(param=param)
def attach_observations(self, **argv):
"""
Attach measured dataset.
Args:
obs_data_path (`str`): path to HDF5 file containing observed data
keep_all (`bool`): whether to keep all the data in the file (longer simulation times) or to clip out data points that are close to each other (defaults to False)
ec_x_var (`str`): required if keep_all is False, the experimental condition over which to measure differences (e.g. V for JV(Ti) curves in PV). It will also be used in plotting later.
max_ec_x_step (`float`): used if keep_all is False, largest step to take in ec_x_var before keeping a point even if the curve is "flat" (defaults to 0.1 * the range of ec_x_var)
thresh_dif_frac (`float`): used if keep_all is False, threshold difference in the output below which a point is discarded (as a fraction of the range of output values, defaults to 0.01)
fixed_unc (`float`): required if running in function mode or if file doesn't have an 'uncertainty' column, value to use as uncertainty in measurement
output_column (`str`): optional, header of column containing output data (required if different from self.output_var)
verbose (`bool`): flag for verbosity, defaults to False
"""
output_col = argv.get('output_column', self.output_var)
keep_all = argv.get('keep_all', False)
thresh_dif_frac = argv.get('thresh_dif_frac', 0.01)
verbose = argv.get('verbose', False)
if 'ec_x_var' in argv.keys():
self.params.set_ec_x(argv['ec_x_var'])
elif not keep_all and self.params.ec_x_name is None:
raise NameError('You must specify ec_x_var if you want to throw out data points that are too close together.')
if verbose:
print('Attaching measured data...\n')
self.obs_data = dd.load(argv['obs_data_path'])
# get EC names if necessary
cols = list(self.obs_data.columns)
if output_col not in cols:
raise NameError('Your output variable name, %s, is not the name of a column in your input data!' %(output_col))
elif 'fixed_unc' not in argv.keys() and 'uncertainty' not in cols:
raise NameError('You need to either provide a value for fixed_unc or your measurement data needs to have an uncertainty column!')
else:
cols.remove(output_col)
if len(self.params.ecs)==0 or ('ec_x_var' in argv.keys() and len(self.params.ecs)==1):
print('Determining experimental conditions from observed data...\n')
for c in cols:
if not c=='uncertainty':
# 1% of the smallest difference between values
tol=0.01*min(abs(np.diff(list(set(self.obs_data[c])))))
if c==self.params.ec_x_name:
self.params.set_tolerance(c, tol)
else:
self.params.add_ec(name=c, tolerance=tol)
print('Identified experimental conditions as %s. (If this is wrong, rerun and explicitly specify them with attach_ecs (make sure they match data file columns) or remove extra columns from data file.)\n' %(str(self.params.param_names('ec'))))
# these next bits used to be under an else...
if 'uncertainty' not in cols:
self.obs_data['uncertainty'] = argv['fixed_unc']*np.ones(len(self.obs_data))
cols.append('uncertainty')
#print(self.obs_data.head())
self.params.set_tolerance(self.output_var, 0.01*argv['fixed_unc'])
else: # set tolerance to 1% of minimum uncertainty
self.params.set_tolerance(self.output_var, 0.01*min(self.obs_data['uncertainty']))
if set(cols) == set(self.ec_names()+['uncertainty']):
pass # all good
elif set(self.ec_names()+['uncertainty']) <= set(cols):
print('Ignoring extra columns in data file: %s\n'%(str(list(set(cols)-set(self.ec_names()+['uncertainty'])))))
elif set(cols) <= set(self.ec_names()+['uncertainty']):
print('These experimental conditions were missing from your data file: %s\nProceeding assuming that %s is the full set of experimental conditions...\n'%(str(list(set(self.ec_names())-set(cols))), str(cols)))
for c in [c for c in cols if not c=='uncertainty']:
self.params.add_ec(name=c)
# ...else ended here
# pick out rows to keep - the way to do this thresholding should probably be tweaked
if not keep_all:
if verbose:
print('Choosing which measured data to keep...')
other_ecs = [ec for ec in self.ec_names() if not ec==self.params.ec_x_name]
obs_data_grps = self.obs_data.groupby(by=other_ecs)
for grp in obs_data_grps.groups.keys():
subset = deepcopy(self.obs_data.loc[obs_data_grps.groups[grp]]).sort_values(self.params.ec_x_name)
if 'max_ec_x_step' in argv.keys():
max_step = argv['max_ec_x_step']
else:
max_step = 0.1 * (max(subset[self.params.ec_x_name]) - min(subset[self.params.ec_x_name]))
print('Using %.2f as the maximum step size in %s when choosing observation points to keep at %s=%s.\n'%(max_step, self.params.ec_x_name, other_ecs, grp))
thresh = thresh_dif_frac * (max(subset[self.output_var])-min(subset[self.output_var]))
i = 0
while i < len(subset)-1:
this_pt = subset.iloc[i]
next_pt = subset.iloc[i+1]
if next_pt[self.params.ec_x_name]-this_pt[self.params.ec_x_name] >= max_step:
i = i+1
elif abs(next_pt[self.output_var]-this_pt[self.output_var]) < thresh:
subset.drop(next_pt.name,inplace=True)
self.obs_data.drop(next_pt.name,inplace=True)
else:
i = i+1
# round EC values
rd_dct = {c.name:c.tol_digits for c in self.params.ecs}
self.obs_data = self.obs_data.round(rd_dct)
# sort observed data
self.obs_data.sort_values(by=self.ec_names(), inplace=True)
# populate list of EC points
if len(self.params.ecs)==1: # it's finicky in this case
self.ec_pts = pd.DataFrame()
self.ec_pts[self.ec_names()[0]] = list(self.obs_data.groupby(self.ec_names()).groups.keys())
else:
self.ec_pts = pd.DataFrame.from_records(data=[list(k) for k in self.obs_data.groupby(self.ec_names()).groups.keys()], columns=self.ec_names())
self.ec_pts = self.ec_pts.sort_values(self.ec_names()).reset_index(drop=True)
def check_data_columns(self,**argv):
"""
Make sure the columns in imported data make sense.
Args:
model_data (`DataFrame`): dataset to check
output_column (`str`): optional, header of column containing output data (required if different from self.output_var)
"""
model_data = argv['model_data']
output_col = argv['output_column']
cols = list(model_data.columns)
# first, check that the output is there
if output_col not in cols:
raise NameError('Your output variable name, %s, is not the name of a column in your model data!' %(output_col))
else:
cols.remove(output_col)
# next, that all the experimental conditions are there
for c in self.ec_names():
if c not in cols:
raise NameError('Experimental condition %s is not the name of a column in your model data!' %(c))
else:
cols.remove(c)
# if param_names has been populated, check if they match
if not self.fit_param_names() == []:
if set(self.fit_param_names())==set(cols):
pass # all good
elif set(self.fit_param_names()) <= set(cols):
print('Ignoring extra columns in model data file: %s\n'%(str(list(set(cols)-set(self.fit_param_names())))))
elif set(cols) <= set(self.fit_param_names()):
print('These fitting parameters were missing from your model data file: %s\nProceeding assuming that %s is the full set of fitting parameters...\n'%(str(list(set(self.fit_param_names())-set(cols))), str(cols)))
else:
print("Determining fitting parameters from modeled data...\n")
for c in cols:
if not c=='uncertainty':
vals = list(set(model_data[c]))
self.params.add_fit_param(name=c, vals=vals)
print("Found fitting parameters: %s"%self.fit_param_names())
def check_ecs(self,**argv):
"""
Check that all experimental conditions are present at each parameter point in modeled data.
Args:
gb (:obj:`groupby`): Pandas groupby object of model data grouped by parameter points
verbose (`bool`): flag for verbosity, defaults to False
"""
verbose = argv.get('verbose', False)
if verbose:
print('Checking that modeled data contains all experimental conditions at every combination of fit parameters...\n')
grps = argv['gb']
# then check at each model point that they match
for name,group in grps:
#print(self.ec_pts,group[self.ec_names()].sort_values(self.ec_names()).reset_index(drop=True))
# specifically, we check that EC pts is a SUBSET of model EC's at each parameter space point
ec_inds = self.ec_pts.index
if not all(self.ec_pts==group[self.ec_names()].sort_values(self.ec_names()).reset_index(drop=True).loc[ec_inds]):
# FIXME: restore the tolerance-aware error message below once the comparison is fixed
#raise ValueError('The experimental conditions do not match to %d digits between the observed and modeled data at the modeled parameter space point %s!'%(self.ec_tol_digits,name))
print('Warning: the observed experimental conditions are not all present in the modeled data at parameter space point %s!'%str(name))
return
def calc_indices(self):
"""
Compute starting and ending indices in self.model_data for each point in self.probs.
"""
start_indices = np.zeros(len(self.probs.points),dtype=int)
end_indices = np.zeros(len(self.probs.points),dtype=int)
#print(list(self.model_data_grps.groups.keys())[:5])
for pt in self.probs.points.iterrows():
#print(pt[1])
#print(tuple(pt[1][self.fit_param_names()].tolist()))
subset_inds = self.model_data_grps.groups[tuple(pt[1][self.fit_param_names()].tolist())]
if len(subset_inds)==0:
print('Something went wrong calculating sim indices! Could not find any points in model data for params %s.'%pt)
start_ind = int(min(subset_inds))
end_ind = int(max(subset_inds))
start_indices[pt[0]] = start_ind
end_indices[pt[0]] = end_ind
self.probs.points['start_ind'] = start_indices
self.probs.points['end_ind'] = end_indices
def attach_model(self,**argv):
"""
Attach the model for the data, either by feeding in a file of precomputed data or a function that does the computing.
Args:
mode (`str`): 'file' or 'function'
model_data_func (callable): if mode='function', provide function here
model_data_path (`str`): if mode=='file', provide path to file
output_column (`str`): optional, header of column containing output data (required if different from self.output_var)
calc_model_unc (`bool`): whether to calculate model uncertainties as well, defaults to False
verbose (`bool`): flag for verbosity, defaults to False
"""
mode = argv['mode']
output_col = argv.get('output_column',self.output_var)
verbose = argv.get('verbose', False)
calc_model_unc = argv.get('calc_model_unc', False)
if verbose:
print('Attaching simulated data...')
if mode == 'file':
# import and sort data on parameter values
self.model_data = dd.load(argv['model_data_path'])
# Check that columns match EC's and parameter names
# (and determine parameter names if need be)
self.check_data_columns(model_data=self.model_data, output_column=output_col)
# now sort data by param names and EC's
self.model_data = self.model_data.sort_values(self.fit_param_names()+self.ec_names()).reset_index(drop=True)
# next get list of parameter space points
param_points_grps = self.model_data.groupby(self.fit_param_names())
param_points = pd.DataFrame.from_records(data=[list(k) for k in param_points_grps.groups.keys()],columns=self.fit_param_names()).sort_values(self.fit_param_names()).reset_index(drop=True)
# if PMF has been populated, check that points match
# this check is broken ("Can only compare identically-labeled...")
# FIX ME
"""
if not self.probs.is_empty:
#print(len(param_points),len(self.probs.points[self.fit_param_names()]))
if not all(param_points==self.probs.points[self.fit_param_names()]):
print('Your previously populated PMF does not have the same set of parameter space points as your model data. Proceeding using the points from the model data.')
self.probs = Pmf()
"""
## check that all EC's are present at all model points
self.check_ecs(gb=param_points_grps, verbose=verbose)
# turned this off temporarily...
# FIX ME ("can only compare identically-labelled...")
# (e.g. in 3.4.3 of nb8)
# Generate self.probs if necessary
if self.probs.is_empty:
if verbose:
print('Initializing probability distribution...\n')
# check that points are on a grid (the quick but slightly less certain way)
param_lengths = [len(set(param_points[name])) for name in self.fit_param_names()]
if not np.prod(param_lengths)==len(param_points):
raise ValueError('Your modeled parameter space does not appear to be on a grid; the current version of bayesim can only handle initially gridded spaces (unless using a previously saved subdivided state).')
else:
self.probs = Pmf(params=self.params.fit_params)
elif mode=='function':
# is there a way to save this (that could be saved to HDF5 too) so that subdivide can automatically call it?
model_func = argv['model_data_func']
# check that probs.points is populated...
# iterate over parameter space and measured conditions to compute output at every point
param_vecs = {p:[] for p in self.fit_param_names()}
ec_vecs = {c:[] for c in self.ec_names()}
model_vals = []
for pt in self.probs.points.iterrows():
param_vals = {p:pt[1][p] for p in self.fit_param_names()}
for d in self.obs_data.iterrows():
ec_vals = {c:d[1][c] for c in self.ec_names()}
# add param and EC vals to the columns
for p in self.fit_param_names():
param_vecs[p].append(param_vals[p])
for c in self.ec_names():
ec_vecs[c].append(d[1][c])
# compute/look up the model data
# need to make sure that model_func takes in params and EC in appropriate format
model_vals.append(model_func(ec_vals,param_vals))
# merge dictionaries together to put into a model data df
vecs = deepcopy(param_vecs)
vecs.update(ec_vecs)
vecs.update({self.output_var:model_vals})
self.model_data = pd.DataFrame.from_dict(vecs)
# round EC's and generate groups
rd_dct = {c.name:c.tol_digits for c in self.params.ecs}
self.model_data = self.model_data.round(rd_dct)
# round fit params - actually just force them to be members of the vals lists
if verbose:
print("Rounding model data...")
for p in self.params.fit_params:
#self.model_data[p.name] = [p.get_closest_val(val) for val in self.model_data[p.name]]
self.model_data[p.name] = Parallel(n_jobs=cpu_count())(delayed(get_closest_val)(val, p.vals) for val in self.model_data[p.name])
#print('At line 467...')
#print(self.model_data.head())
# drop any modeled data at EC's not measured
# this was making mistakes previously so is commented out for now
"""
if verbose:
print("Removing modeled EC's that weren't measured...")
self.model_data_ecgrps = self.model_data.groupby(self.ec_names())
model_ecs = pd.DataFrame.from_records(data=list(self.model_data_ecgrps.groups.keys()), columns=self.ec_names()).sort_values(by=self.ec_names()).reset_index(drop=True)
merged_ecs = model_ecs.merge(self.ec_pts, on=self.ec_names(), how='left', indicator=True)
extra_pts = merged_ecs[merged_ecs._merge=='left_only']
print('there are '+str(len(extra_pts))+' extra points...here are a few:')
print(extra_pts.head())
extra_data = merged_ecs[merged_ecs._merge=='right_only']
if len(extra_data)>0:
print("There seem to be data EC's that weren't modeled...")
print(extra_data)
print(extra_pts.index)
self.model_data.drop(labels=extra_pts.index, axis=0, inplace=True)
"""
#print('at line 485...')
#print(self.model_data.head())
# reset index to avoid duplication
self.model_data.reset_index(inplace=True,drop=True)
# (re-)generate groups
if verbose:
print("Grouping model data...")
self.model_data_ecgrps = self.model_data.groupby(self.ec_names())
self.model_data_grps = self.model_data.groupby(by=self.fit_param_names())
# update flag and indices
self.needs_new_model_data = False
if verbose:
print("Indexing model data...")
self.calc_indices()
if calc_model_unc:
self.calc_model_unc(**argv)
def comparison_plot(self,**argv):
"""
Plot observed data vs. highest-probability modeled data.
Args:
ec_vals (`dict`): optional, dict of EC values at which to plot. If not provided, they will be chosen randomly. This can also be a list of dicts for multiple points.
num_ecs (`int`): number of EC values to plot, defaults to 1 (ignored if ecs is provided)
num_param_pts (`int`): number of the most probable parameter space points to plot (defaults to 1)
ec_x_var (`str`): one of self.ec_names(); overrides any value set earlier in attach_observations and is required if none was set there
return_avg_err (bool): whether or not to return average (absolute) error for highest probability model over all EC's plotted (defaults to False)
fpath (`str`): optional, path to save image to if desired
"""
if 'ec_x_var' in argv.keys():
self.params.set_ec_x(argv['ec_x_var'])
if self.params.ec_x_name is None:
print('You have not provided an x-variable from your experimental conditions against which to plot. Choosing the first one in the list, %s.\n'%(self.ec_names()[0]))
self.params.set_ec_x(self.ec_names()[0])
return_avg_err = argv.get('return_avg_err', False)
all_errs = []
other_ecs = [c for c in self.params.ecs if not c.name==self.params.ec_x_name]
#print(self.params.ec_x_name, [c.name for c in other_ecs])
# read in options and do some sanity checks
num_ecs = argv.get('num_ecs',1)
if len(other_ecs)==0: #only one experimental condition...
one_ec = True
ec_vals = [0] #placeholder, basically - this is klunky
else: # more than one
one_ec = False
if 'ec_vals' in argv.keys():
ec_vals = argv['ec_vals']
if not (isinstance(ec_vals,list) or isinstance(ec_vals,np.ndarray)):
ec_vals = [ec_vals]
else:
ec_vals = []
ec_pts = random.sample(list(self.model_data.groupby([c.name for c in other_ecs]).groups.keys()), num_ecs)
for pt in ec_pts:
if len(other_ecs)==1: #pt will just be a float rather than a tuple
ec_vals.append({other_ecs[0].name:pt})
else:
ec_vals.append({other_ecs[i].name:pt[i] for i in range(len(pt))})
num_param_pts = argv.get('num_param_pts',1)
# need to fix this check
#if self.params.ec_x_name in ecs.keys():
#print('You provided a fixed value for your x variable, ignoring that and plotting the full range.')
#del ecs[self.params.ec_x_name]
# get parameter points for which to plot data
param_pts = self.probs.most_probable(num_param_pts)
# set up subplots
fig, axs = plt.subplots(len(ec_vals), 2, figsize=(13,4*len(ec_vals)), squeeze=False)
# get color cycle
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']
# at each set of conditions...
for i in range(len(ec_vals)):
obs_data = self.obs_data
plot_title = 'Comparison '
if not one_ec:
plot_title = plot_title + 'at '
ecs_here = ec_vals[i]
# get obs data for correct EC values
#print(ecs_here, obs_data.head())
for c in other_ecs:
obs_data = obs_data[abs(obs_data[c.name]-ecs_here[c.name])<=10.**(-1.*c.tol_digits)]
plot_title = plot_title + '%s=%s, '%(c.name,c.get_val_str(ecs_here[c.name]))
obs_data = obs_data.sort_values(by=[self.params.ec_x_name])
# plot obs data
axs[i,0].plot(obs_data[self.params.ec_x_name], obs_data[self.output_var], color=colors[0])
legend_list = ['observed']
c_ind = 1
# plot modeled data and errors
for pt in param_pts.iterrows():
color = colors[c_ind]
model_data = deepcopy(self.model_data.loc[self.model_data_grps.groups[tuple([pt[1][n] for n in self.fit_param_names()])]])
if not one_ec:
for c in other_ecs:
model_data = model_data[abs(model_data[c.name]-ecs_here[c.name])<=10.**(-1.*c.tol_digits)]
model_data.sort_values(by=[self.params.ec_x_name], inplace=True)
model_data.reset_index(drop=True, inplace=True)
# get the modeled values only at the observed points
x_vals = list(obs_data[self.params.ec_x_name])
model_data_drop = [i for i in range(len(model_data)) if model_data.loc[i][self.params.ec_x_name] not in x_vals]
model_data.drop(labels=model_data_drop, axis=0, inplace=True)
errors = np.subtract(np.array(model_data[self.output_var]), np.array(obs_data[self.output_var]))
if c_ind==1: #take errors from highest probability
all_errs.extend([abs(e) for e in errors])
axs[i,0].plot(model_data[self.params.ec_x_name], model_data[self.output_var], color=color)
axs[i,1].plot(model_data[self.params.ec_x_name], errors, color=color)
leg_label = 'modeled: '
for p in self.params.fit_params:
leg_label = leg_label + '%s=%s, '%(p.name, p.get_val_str(pt[1][p.name]))
leg_label = leg_label[:-2]
legend_list.append(leg_label)
c_ind = c_ind+1
# set ylims to match observed data and label axes
obs_max = max(obs_data[self.output_var])
obs_min = min(obs_data[self.output_var])
obs_width = obs_max-obs_min
axs[i,0].set_ylim([obs_min-0.05*obs_width, obs_max+0.05*obs_width])
xvar = self.params.get_ec_x()
axs[i,0].set_xlabel('%s [%s]' %(xvar.name, xvar.units))
axs[i,1].set_xlabel('%s [%s]' %(xvar.name, xvar.units))
yvar = self.params.find_param(self.output_var)
axs[i,0].set_ylabel('%s [%s]' %(yvar.name, yvar.units))
axs[i,1].set_ylabel('%s [%s]' %(r'$\Delta$'+yvar.name, yvar.units))
axs[i,0].legend(legend_list, fontsize=14)
plot_title = plot_title.rstrip(', ')
axs[i,0].set_title(plot_title, fontsize=20)
if one_ec:
err_title = 'Errors'
else:
err_title = plot_title + ': errors'
axs[i,1].set_title(err_title, fontsize=20)
for j in [0,1]:
for item in ([axs[i][j].xaxis.label, axs[i][j].yaxis.label] + axs[i][j].get_xticklabels() + axs[i][j].get_yticklabels()):
item.set_fontsize(18)
plt.tight_layout()
if 'fpath' in argv.keys():
plt.savefig(argv['fpath'])
if return_avg_err:
return np.mean(all_errs)
def run(self, **argv):
"""
Do Bayes!
Iteration through the observations stops once >= th_pm of the probability mass is concentrated in <= th_pv of the boxes, at which point it is time to subdivide. (The thresholding scheme is fairly arbitrary for now.)
Args:
save_step (`int`): interval (number of data points) at which to save intermediate PMF's (defaults to 10, 0 to save only final, <0 to save none)
th_pm (`float`): threshold quantity of probability mass to be concentrated in th_pv fraction of parameter space to trigger the run to stop (defaults to 0.9)
th_pv (`float`): threshold fraction of parameter space volume for th_pm fraction of probability to be concentrated into to trigger the run to stop (defaults to 0.05)
min_num_pts (`int`): minimum number of observation points to use - if the threshold is reached before this many points have been used, the run starts over and the final PMF is the average over however many runs were needed to use enough points (defaults to 0.7 * the number of experimental measurements)
prob_relax (`float`): number from 0 to 1.0, fraction of PMF from previous step to mix into prior for this step (defaults to 0) - higher values will likely converge faster but possibly have larger errors, especially if min_num_pts is small
verbose (`bool`): flag for verbosity, defaults to False
"""
# read in options
save_step = argv.get('save_step', 10)
th_pm = argv.get('th_pm', 0.9)
th_pv = argv.get('th_pv', 0.05)
min_num_pts = argv.get('min_num_pts', int(0.7*len(self.obs_data)))
verbose = argv.get('verbose', False)
rel = argv.get('prob_relax', 0.0)
if rel < 0 or rel > 1.0:
print("Relaxation parameter must be between 0 and 1.0 - defaulting to 0.")
rel = 0
if verbose:
print('Running inference!\n')
if min_num_pts > len(self.obs_data):
print('Cannot use more observation points than there are in the data. Setting min_num_pts to len(self.obs_data)=%d\n'%len(self.obs_data))
min_num_pts = len(self.obs_data)
# do some sanity checks
if self.needs_new_model_data:
raise ValueError('Oops, you need to attach model data before running!')
if self.is_run:
print('Running again at the same subdivision level. Previous results may be overridden...\n')
# set up folders for intermediate files
if save_step>0:
folder = os.getcwd()
pmf_folder = folder+'/PMFs/'
obs_list_folder = folder+'/obs_lists/'
for fp in [pmf_folder,obs_list_folder]:
if not os.path.isdir(fp):
os.mkdir(fp)
# rather than uniformizing, averaging the previous probs with a quasi-uniform
old_probs = deepcopy(self.probs)
uni_probs = deepcopy(self.probs)
uni_probs.uniformize()
start_probs = deepcopy(self.probs)
start_probs.points['prob'] = rel*old_probs.points['prob'] + (1-rel)*uni_probs.points['prob']
start_probs.normalize()
# randomize observation order
self.obs_data = self.obs_data.sample(frac=1)
num_pts_used = 0
num_runs = 0
probs_lists = []
delta_count_list = []
nan_count_list = []
skip_total = 0
while num_pts_used < min_num_pts:
prev_used_pts = num_pts_used
num_runs = num_runs + 1
obs_indices = []
self.probs = deepcopy(start_probs)
done=False
while not done:
# check if we've gone through all the points
if num_pts_used == len(self.obs_data):
print('Used all the observed data! Last PMF to go into average may have been further from threshold condition.\n')
done = True
else:
# get observed and modeled data
obs = self.obs_data.iloc[num_pts_used]
#print(obs)
obs_indices.append(num_pts_used)
ec = obs[self.ec_names()]
if len(self.ec_names())>1:
ecpt = tuple([ec[n] for n in self.ec_names()])
else:
ecpt = float(ec)
#print(ecpt)
#print(self.model_data_ecgrps.groups.keys())
model_here = deepcopy(self.model_data.loc[self.model_data_ecgrps.groups[ecpt]])
#print(len(model_here))
# compute likelihood and do a Bayesian update
#print(obs, model_here)
#print()
lkl, delta_count, nan_count, skip_count = self.probs.likelihood(meas=obs, model_at_ec=model_here, output_col=self.output_var, verbose=verbose)
self.probs.multiply(lkl)
num_pts_used = num_pts_used + 1
delta_count_list.append(delta_count)
nan_count_list.append(nan_count)
skip_total = skip_total + skip_count
# save intermediate PMF if necessary
if save_step >0 and (num_pts_used-prev_used_pts) % save_step == 0:
dd.save(pmf_folder+'sub%d_run%d_PMF_%d.h5'%(self.probs.num_sub,num_runs,num_pts_used-prev_used_pts),self.probs.points)
# check if threshold probability concentration has been reached
if np.sum(self.probs.most_probable(int(th_pv*len(self.probs.points)))['prob'])>th_pm:
done = True
print('done, points so far: '+str(num_pts_used))
if done:
probs_lists.append(np.array(self.probs.points['prob']))
if save_step >= 0:
dd.save(pmf_folder+'sub%d_run%d_PMF_final.h5'%(self.probs.num_sub,num_runs),self.probs.points)
at_threshold=True
dd.save(obs_list_folder+'sub%d_run%d_obs_list.h5'%(self.probs.num_sub,num_runs),self.obs_data.iloc[obs_indices])
probs = np.mean(probs_lists,axis=0)
self.probs.points['prob'] = probs
print('Did a total of %d runs to use a total of %d observations.\n'%(num_runs,num_pts_used))
print('\nAn average of %d / %d probability points had larger model uncertainty than experimental uncertainty during this run.\n'%(int(round(np.mean(delta_count_list))),len(self.probs.points)))
print('\nAn average of %.2f / %d probability points were affected by missing/NaN simulation data.\n' %(np.mean(nan_count_list), len(self.probs.points)))
print('\n%d points were skipped.\n' %skip_total)
if save_step >=0:
dd.save(pmf_folder+'sub%d_PMF_final.h5'%(self.probs.num_sub),self.probs.points)
self.is_run = True
def subdivide(self, **argv):
"""
Subdivide the probability distribution and save the list of new sims to run to a file.
Args:
threshold_prob (`float`): minimum probability of box to (keep and) subdivide (default value is the uniform distribution probability)
new_sim_list_fpath (`str`): filename for file containing list of new simulations to be run (optional)
"""
threshold_prob = argv.get('threshold_prob',1.0/len(self.probs.points))
filename = argv.get('new_sim_list_fpath', 'new_sim_points_%d.h5'%(self.probs.num_sub))
new_boxes = self.probs.subdivide(threshold_prob)
#dropped_inds = list(dropped_boxes.index)
#self.fit_params = [p for p in self.probs.params]
self.attach_fit_params(self.probs.params)
# remove old model data
self.model_data = pd.DataFrame()
# update flags
self.needs_new_model_data = True
self.is_run = False
#dd.save(filename,new_boxes)
self.list_model_pts_to_run(fpath=filename)
print('New model points to simulate are saved in the file %s.'%filename)
def list_model_pts_to_run(self, fpath, **argv):
"""
Generate full list of model points that need to be run (not just parameter points but also all experimental conditions). Saves to HDF5 at fpath.
Note that this could be very slow if used on the initial grid (i.e. for potentially millions of points) - it's better for after a subdivide call.
Args:
fpath (`str`): path to save the list to (HDF5)
verbose (`bool`): flag for verbosity, defaults to False
"""
verbose = argv.get('verbose', False)
if verbose:
print('Listing the sets of simulation parameters that need to be run...\n')
# get just the columns with the parameters
param_pts = self.probs.points[self.fit_param_names()]
# Now at every point in that list, make a row for every EC point
param_inds = range(len(param_pts))
ec_inds = range(len(self.ec_pts))
columns = self.fit_param_names() + self.ec_names()
pts = []
for ppt in param_pts.iterrows():
for ecpt in self.ec_pts.iterrows():
pts.append(list(ppt[1])+list(ecpt[1]))
sim_pts = pd.DataFrame(data=pts,columns=columns)
dd.save(fpath,sim_pts)
def calc_model_unc(self, **argv):
"""
Calculates largest difference in modeled output along any parameter direction for each experimental condition, to be used for uncertainty in calculating likelihoods. Currently only works if data is on a grid.
(also assumes it's sorted by param names and then EC's)
Args:
verbose (`bool`): flag for verbosity, defaults to False
model_unc_factor (`float`): multiplier on deltas to give uncertainty, defaults to 0.5 - smaller probably means faster convergence, but also higher chance to miss "hot spots"
take_average (`bool`): flag for whether to use average model uncertainty at each measurement condition or the parameter-resolved version. Defaults to True if any parameters are logarithmically spaced, and False otherwise.
min_unc_frac (`float`): minimum uncertainty as a fraction of the output variable value, defaults to 0.01
min_unc_val (`float`): minimum uncertainty as an absolute number, defaults to 0.0
Note:
If both `min_unc_frac` and `min_unc_val` are specified, the uncertainty will be set to the larger of the two in each case
"""
factor = argv.get('model_unc_factor', 0.5)
verbose = argv.get('verbose', False)
if verbose:
print('Calculating model uncertainty...')
min_unc_frac = argv.get('min_unc_frac', 0.01)
min_unc_val = argv.get('min_unc_val', 0.0)
# check if any parameters are log-spaced
has_log = False
if any([p.spacing=='log' for p in self.params.fit_params]):
has_log = True
take_average = argv.get('take_average', has_log)
param_lengths = [p.length for p in self.params.fit_params]
start_time = time.time()
deltas = np.zeros(len(self.model_data))
if platform.system()=='Windows':
# need to do it in serial
deltas_list = []
for grp in self.model_data_ecgrps.groups:
deltas_list.append(calc_deltas(grp, self.model_data_ecgrps.groups[grp], param_lengths, self.model_data, self.fit_param_names(), self.probs, self.output_var, take_average))
else:
# parallelize!
deltas_list = Parallel(n_jobs=cpu_count())(delayed(calc_deltas)(grp, self.model_data_ecgrps.groups[grp], param_lengths, self.model_data, self.fit_param_names(), self.probs, self.output_var, take_average) for grp in self.model_data_ecgrps.groups)
for entry in deltas_list:
inds = self.model_data_ecgrps.groups[entry[0]]
deltas[inds] = entry[1]
# check for NaNs
if np.any(np.isnan(deltas)):
# there is a problem
raise ValueError("Some uncertainties came out to NaN! Help!")
# multiply by the factor
self.model_data['uncertainty'] = factor * deltas
# adjust upward for any minimum values
unc = []
for row in self.model_data.iterrows():
min_unc = max(min_unc_val, min_unc_frac*row[1][self.output_var])
unc.append(max(min_unc, row[1]['uncertainty']))
self.model_data['uncertainty'] = unc
if verbose:
print('Calculating model uncertainty took %.2f seconds.\n'%(time.time()-start_time))
def save_state(self,filename='bayesim_state.h5'): #rewrite this!
"""
Save the entire state of this model object to an HDF5 file so that work can be resumed later.
"""
# construct a dict with all the state variables
state = {}
# parameters
state['params'] = self.params.as_dict()
state['ec_pts'] = self.ec_pts
state['output_var'] = self.output_var
# PMF
state['probs'] = self.probs.as_dict()
# model/data
state['model_data'] = self.model_data
state['model_data_grps'] = self.model_data_grps
state['model_data_ecgrps'] = self.model_data_ecgrps
state['needs_new_model_data'] = self.needs_new_model_data
state['obs_data'] = self.obs_data
state['is_run'] = self.is_run
# save the file
dd.save(filename,state)
def visualize_grid(self,**argv):
"""
Visualize the current state of the grid.
Args:
same as pmf.visualize()
"""
self.probs.visualize(just_grid=True,**argv)
def visualize_probs(self,**argv):
"""
Visualize the PMF with a corner plot.
Args:
same as pmf.visualize()
"""
self.probs.visualize(**argv)
# maybe I'll finish this eventually
"""
def visualize_model_unc(self,**argv):
#Visualize model uncertainty.
#Args:
# ec_vals (`dict`): optional, dict of EC values at which to plot. If not provided, they will be chosen randomly. This can also be a list of dicts for multiple points.
# num_ecs (`int`): number of EC values to plot, defaults to 1 (ignored if ecs is provided)
# read in options and do some sanity checks
num_ecs = argv.get('num_ecs',1)
if 'ec_vals' in argv.keys():
ec_vals = argv['ec_vals']
if not (isinstance(ec_vals,list) or isinstance(ec_vals,np.ndarray)):
ec_vals = [ec_vals]
else:
ec_vals = []
ec_pts_df = self.ec_pts.sample(num_ecs)
ec_pts=[]
for pt in ec_pts_df.iterrows():
ec_vals.append({name:pt[1][name] for name in self.ec_names()})
ec_pts.append(tuple(pt[1][self.ec_names()]))
# set up subplots
fig, axs = plt.subplots(len(ec_vals), figsize=(13,4*len(ec_vals)), squeeze=False)
for i in range(len(ec_vals)):
ecs_here = ec_vals[i]
plot_title = ''
for c in self.params.ecs:
plot_title = plot_title + '%s=%s, '%(c.name,c.get_val_str(ecs_here[c.name]))
subset = self.model_data.loc[self.model_data_ecgrps[tuple([ecs_here[n] for n in self.ec_names()])]]
dense_grid = self.probs.populate_dense_grid(df=subset, col_to_pull='uncertainty', make_ind_lists=False)
mat = dense_grid['mat']
axs[i].imshow(mat)
axs[i].set_title(plot_title)
"""
def top_probs(self, num):
"""Return a DataFrame with the 'num' most probable points and some of the less interesting columns hidden."""
df = self.probs.most_probable(num)
cols = ['prob'] + [c for cs in [[p.name, p.name+'_min', p.name+'_max'] for p in self.params.fit_params] for c in cs]
return df[cols]
def set_param_info(self, param_name, **argv):
"""
Set additional info for parameter param_name (any type).
Args:
param_name (str): name of parameter to modify
units (str): units of parameter
min_width (float): minimum width of parameter (only for fitting params)
display_name (str): name to use on plots (can include TeX)
tolerance (float): tolerance for this parameter
"""
assert param_name in [p.name for p in self.params.all_params()], "I can't set info for a parameter (%s) that doesn't exist!"%param_name
if 'units' in argv.keys():
for i in range(len(self.params.fit_params)):
if self.params.fit_params[i].name==param_name:
self.params.fit_params[i].units = argv['units']
for i in range(len(self.params.ecs)):
if self.params.ecs[i].name==param_name:
self.params.ecs[i].units = argv['units']
for i in range(len(self.params.output)):
if self.params.output[i].name==param_name:
self.params.output[i].units = argv['units']
if 'min_width' in argv.keys():
for i in range(len(self.params.fit_params)):
if self.params.fit_params[i].name==param_name:
self.params.fit_params[i].min_width = argv['min_width']
if 'display_name' in argv.keys():
for i in range(len(self.params.fit_params)):
if self.params.fit_params[i].name==param_name:
self.params.fit_params[i].display_name = argv['display_name']
for i in range(len(self.params.ecs)):
if self.params.ecs[i].name==param_name:
self.params.ecs[i].display_name = argv['display_name']
for i in range(len(self.params.output)):
if self.params.output[i].name==param_name:
self.params.output[i].display_name = argv['display_name']
# pass these through to PMF since it doesn't always seem to happen automatically...
if param_name in self.probs.param_names():
for i in range(len(self.probs.params)):
if self.probs.params[i].name==param_name:
if 'units' in argv.keys():
self.probs.params[i].units = argv['units']
if 'min_width' in argv.keys():
self.probs.params[i].min_width = argv['min_width']
if 'display_name' in argv.keys():
self.probs.params[i].display_name = argv['display_name']
def ec_names(self):
"""Return list of experimental condition names."""
return self.params.param_names('ec')
def fit_param_names(self):
"""Return list of fitting parameter names."""
return self.params.param_names('fit')
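# End-to-end usage sketch (illustrative; all file names and parameter choices
# are placeholders, and real runs need HDF5 files with the columns described
# in the attach_observations and attach_model docstrings above):
#   params = pm.Param_list()
#   params.add_fit_param(name='A', vals=[0.5, 1.0, 2.0])
#   m = Model(params=params, output_var='J', ec_x_var='V',
#             obs_data_path='obs_data.h5', fixed_unc=1e-3)
#   m.attach_model(mode='file', model_data_path='model_data.h5', calc_model_unc=True)
#   m.run(verbose=True)
#   print(m.top_probs(5))
#   m.comparison_plot()
#   m.subdivide()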