Source code for bonsai_ipcc.sample

import logging

import numpy as np
import pandas as pd

from .uncertainties import monte_carlo, sample_dirichlet

logger = logging.getLogger(__name__)


[docs] def create_sample(dfs, constraints=None, size=1000): """Add sample to parameter tables. For parameter tables in which values are related and constraint, Dirichlet distribution is used for samples. For other parameter tables, sample is determined independently using the best fit distribution. Argument -------- dfs : dict key is name of paramter, value is df constraints: dict key is name of parameter, value is dict with group_py: ["year", "region", "product"] size : int sample size Returns ------- dict of dfs revised parameter tables with sample array """ modified_dfs = {} for k in dfs.keys(): df = dfs[k].copy() # Use Dirichlet for constraint parameters (values sum up across a level) if constraints: if k in constraints.keys(): levels_core = constraints[f"{k}"]["group_by"] levels_to_drop = [ level for level in df.index.names if level not in levels_core ] level_to_sum = list(df.index.names) levels_not_to_sum = levels_core + ["property"] for item in levels_not_to_sum: if item in level_to_sum: level_to_sum.remove(item) if len(level_to_sum) != 1: raise ValueError(f"more then one level to sum in '{k}'") level_to_sum = level_to_sum[0] tmp = np.empty((0, size)) for coords in df.droplevel(levels_to_drop).index.unique(): if "property" in df.index.names: level_values_to_sum = ( df.loc[tuple(coords)] .index.get_level_values(level_to_sum) .unique() ) default = [] max95 = [] abs_max = [] for l in level_values_to_sum: _df = df.loc[tuple(coords) + (f"{l}",)] if not _df.empty: default.append( df.loc[ tuple(coords) + ( f"{l}", "def", ) ].value ) max95.append( df.loc[ tuple(coords) + ( f"{l}", "max", ) ].value ) abs_max.append( df.loc[ tuple(coords) + ( f"{l}", "abs_max", ) ].value ) sample = sample_dirichlet(default, max95, abs_max, size=size) tmp = np.vstack((tmp, sample)) for s, coords in enumerate(df.droplevel("property").index.unique()): u = df.loc[tuple(coords) + ("max",)].unit df["value"] = df["value"].astype(object) new_index = df.index.append( pd.MultiIndex.from_tuples( [tuple(coords) + ("sample",)], names=df.index.names ) ) df = df.reindex(new_index) df.at[tuple(coords) + ("sample",), "value"] = tmp[s] df.at[tuple(coords) + ("sample",), "unit"] = u modified_dfs[f"{k}"] = df logger.info( f"Dirichlet distribution used for parameter '{k}' and coords '{coords}'" ) # For other parameters chose best fit else: for coords in df.droplevel("property").index.unique(): if "property" in df.index.names: d = df.loc[tuple(coords) + ("def",)].value min_val = df.loc[tuple(coords) + ("min",)].value max_val = df.loc[tuple(coords) + ("max",)].value abs_min = df.loc[tuple(coords) + ("abs_min",)].value abs_max = df.loc[tuple(coords) + ("abs_max",)].value logger.info( f"Uncertainty distribution for parameter '{k}' and coords '{coords}' is:" ) tmp = monte_carlo( default=d, min95=min_val, max95=max_val, abs_min=abs_min, abs_max=abs_max, size=size, distribution="check", ) u = df.loc[tuple(coords) + ("max",)].unit df["value"] = df["value"].astype(object) new_index = df.index.append( pd.MultiIndex.from_tuples( [tuple(coords) + ("sample",)], names=df.index.names ) ) df = df.reindex(new_index) df.at[tuple(coords) + ("sample",), "value"] = tmp df.at[tuple(coords) + ("sample",), "unit"] = u modified_dfs[f"{k}"] = df else: raise ValueError return modified_dfs