diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py
index 113e46af4b..ae7390e02a 100644
--- a/validphys2/src/validphys/config.py
+++ b/validphys2/src/validphys/config.py
@@ -46,7 +46,6 @@
     LoadFailedError,
     PDFNotFound,
 )
-from validphys.paramfits.config import ParamfitsConfig
 from validphys.plotoptions.core import get_info
 import validphys.scalevariations
 
@@ -1739,5 +1738,5 @@ def produce_total_phi_data(self, fitthcovmat):
         return validphys.results.dataset_inputs_phi_data
 
 
-class Config(report.Config, CoreConfig, ParamfitsConfig):
+class Config(report.Config, CoreConfig):
     """The effective configuration parser class."""
diff --git a/validphys2/src/validphys/paramfits/__init__.py b/validphys2/src/validphys/paramfits/__init__.py
deleted file mode 100644
index 35afd7cfdc..0000000000
--- a/validphys2/src/validphys/paramfits/__init__.py
+++ /dev/null
@@ -1,11 +0,0 @@
-"""
-paramfits
-
-
-Functionality to determine parameters from a scan over
-PDF fits. αs is so far the only example.
-
-This package contains high level plots and tables, with the corresponding
-data analysis and aggregation routines. The low level functionality is provided
-by chi2grids.py.
-"""
\ No newline at end of file
diff --git a/validphys2/src/validphys/paramfits/config.py b/validphys2/src/validphys/paramfits/config.py
deleted file mode 100644
index 11158b84c9..0000000000
--- a/validphys2/src/validphys/paramfits/config.py
+++ /dev/null
@@ -1,444 +0,0 @@
-"""
-Configuration class for the paramfits module
-"""
-import re
-from collections.abc import Mapping, Sequence
-
-from reportengine.configparser import Config, ConfigError, element_of
-
-from validphys import tableloader, utils
-from validphys.loader import LoaderError
-
-class ParamfitsConfig(Config):
-    def _get_table(self, loader_func, fname, config_rel_path):
-        #TODO: This is here because I am extremely unconvinced it is the
-        #right interface. There should be something more specific at the
-        #reportengine level. There is some undeniable ugliness in referencing
-        #self.loader, which does not exist, but it is still better than
-        #creating a base class in an isolated file to avoid the circular import.
-        try:
-            res = self.loader.check_vp_output_file(fname.strip(),
-                                                   extra_paths=['.', config_rel_path])
-        except LoaderError as e:
-            raise ConfigError(e) from e
-
-        try:
-            df = loader_func(res)
-        except Exception as e:
-            raise ConfigError(e) from e
-        return df
-
-
-    #TODO: Get rid of this
-    def produce_fits_pdf_config(self, fits):
-        """DO NOT USE. For internal use only."""
-        return [self.produce_fitpdf(fit)['pdf'] for fit in fits]
-
-    #TODO: Try to remove the loop from here
-    def produce_fits_name(self, fits):
-        """NOTE: EXPERIMENTAL.
-        Return a list with the ids of the fits"""
-        return [fit.name for fit in fits]
-
-    #TODO: Try to remove the loop from here
-    def produce_fits_as(self, fits_pdf_config):
-        """NOTE: EXPERIMENTAL. Return the αs value of the fits, reading
-        it from the installed pdf"""
-        return [pdf.alphas_mz for pdf in fits_pdf_config]
-
-    #TODO: Try to remove the loop from here.
-    def produce_fits_as_from_fitdeclarations(self, fitdeclarations):
-        """NOTE: EXPERIMENTAL.
A hack to obtain fits_as from the - fitdeclarations, without having to - download and inspect the actual fits.""" - alpha_pattern = r'NNPDF\d\d(?:_[a-z]+)*_as_(\d\d\d\d).*' - res = [] - for fit in fitdeclarations: - m = re.match(alpha_pattern, fit) - if not m: - raise ConfigError(f"Couldn't match fit name {fit} to the " - f"pattern {alpha_pattern!r}") - res.append(float(m.group(1))/1000) - return {'fits_as' : res} - - def produce_fits_name_from_fitdeclarations(self, fitdeclarations): - """Inject the names from the ``fitdeclarations`` as the fit_names - property""" - #Cast nslist away - return {'fits_name': list(fitdeclarations)} - - def parse_blacklist_datasets(self, datasets:list): - return datasets - - def produce_combine_dataspecs_pseudoreplicas_as( - self, dataspecs, how='min', blacklist_datasets=()): - if not isinstance(dataspecs, Sequence): - raise ConfigError("dataspecs should be a sequence of mappings, not " - f"{type(dataspecs).__name__}") - if how != 'min': - raise ConfigError("Only min is implemented at the moment") - - dfs = [] - fitnames = [] - for spec in dataspecs: - if not isinstance(spec, Mapping): - raise ConfigError("dataspecs should be a sequence of mappings, " - f" but {spec} is {type(spec).__name__}") - with self.set_context(ns=self._curr_ns.new_child(spec)): - _, df = self.parse_from_(None, 'fits_computed_pseudoreplicas_chi2', write=False) - _, asval = self.parse_from_(None, 'fits_as', write=False) - _, namelist = self.parse_from_(None, 'fits_name', write=False) - if not dfs: - firstas = asval - elif asval != firstas: - raise ConfigError("Expecting all as values to be the same") - dfs.append(df) - fitnames.append(namelist) - finalnames = [utils.common_prefix(*ns) + '__combined' for ns in zip(*fitnames)] - res = tableloader.combine_pseudoreplica_tables(dfs, finalnames, - blacklist_datasets=blacklist_datasets) - - return {'fits_computed_pseudoreplicas_chi2': res} - - - #TODO: autogenerate functions like this - def parse_experiments_covmat_output(self, fname:str, config_rel_path): - """NOTE: THIS INTERFACE IS EXPERIMENTAL AND MIGHT CHANGE IN THE FUTURE. - Process the output CSV table of the experiments_covmat action - and return an equivalent dataframe""" - df = self._get_table(tableloader.load_experiments_covmat, fname, config_rel_path) - return {'experiments_covmat': df} - - - #TODO: Move these to their own module when that's supported by reportengine - def produce_fits_matched_pseudoreplicas_chi2_output(self, - pseudoreplicafile:str, - fits_name): - """DEPRECATED. DO NOT USE.""" - import pandas as pd - import numpy as np - try: - df = pd.read_csv(pseudoreplicafile, sep='\t', - index_col=[0,1],header=[0,1]) - except Exception as e: - raise ConfigError(f"Failed to load the table: {e}") from e - - - #Require that the fits are matched so we filer out some that are not - #interesting or broken. 
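# --- Editor's aside (not part of the patch): a minimal, runnable sketch of the
# fit-name parsing implemented by ``produce_fits_as_from_fitdeclarations`` above.
# The fit names below are hypothetical examples, not taken from any runcard.
import re

alpha_pattern = r'NNPDF\d\d(?:_[a-z]+)*_as_(\d\d\d\d).*'
for fit in ('NNPDF31_nnlo_as_0118', 'NNPDF31_nlo_as_0121_collider'):
    m = re.match(alpha_pattern, fit)
    print(fit, '->', float(m.group(1)) / 1000)   # 0.118 and 0.121
# --- end aside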
- try: - df = df[fits_name] - except Exception as e: - raise ConfigError("Mismatch between fits provided and fits " - f"in the table {pseudoreplicafile}:\n{e}") from e - ndataindexer = df.columns.get_locs([slice(None), 'ndata']) - lentest = lambda x: len(np.unique(x.dropna()))<=1 - samelens = df.iloc[:,ndataindexer].apply(lentest, axis=1).all() - if not samelens: - raise ConfigError("Incorrect data: Expected all experiments to have the same length.") - chindexer = df.columns.get_locs([slice(None), 'central_chi2']) - df = df.iloc[:,chindexer] - df = df.swaplevel(0,1) - #Have it the way the existing functions like - newcols = df.columns.set_levels([df.columns.levels[0], ['chi2']]) - df.columns = newcols - return df - - def parse_fits_computed_pseudoreplicas_chi2_output(self, fname:str, - config_rel_path): - """Return a namespace (mapping) with the output of - ``fits_computed_pseudoreplicas_chi2_table`` as read from the specified - filename. Use a {@with@} block to pass it to the providers. - The fit names must be provided explicitly.""" - return self._get_table(tableloader.load_fits_computed_pseudoreplicas_chi2, - fname, config_rel_path) - - - def produce_use_fits_computed_pseudoreplicas_chi2_output( - self, fits_computed_pseudoreplicas_chi2_output, fits_name): - """Select the columns of the input file matching the fits.""" - df = fits_computed_pseudoreplicas_chi2_output - try: - df = df[fits_name] - except Exception as e: - raise ConfigError(f"Could not select the fit names from the table: {e}") from e - - return {'fits_computed_pseudoreplicas_chi2': df} - - - def produce_use_fits_computed_psedorreplicas_chi2_output( - self, fits_computed_psedorreplicas_chi2_output, fits_name): - """Select the columns of the input file matching the fits. - - Note: this is a copy of ``produce_use_fits_computed_pseudoreplicas_chi2_output``. - It is here so that `fits_computed_pseudoreplicas_chi2` gets assigned whether - `fits_computed_pseudoreplicas_chi2_output` or `fits_computed_psedorreplicas_chi2_output` - is specified in the runcard. This is to ensure that old runcards still work. - """ - df = fits_computed_psedorreplicas_chi2_output - try: - df = df[fits_name] - except Exception as e: - raise ConfigError(f"Could not select the fit names from the table: {e}") from e - - return {'fits_computed_pseudoreplicas_chi2': df} - - - @element_of('extra_sums') - def parse_extra_sum(self, s:dict): - keys = {'dataset_item', 'components'} - if s.keys() != keys: - d1 = s.keys() - keys - d2 = keys - s.keys - if d1: - raise ConfigError(f'Unable to parse extra_sum: unrecognized keys: {d1}') - if d2: - raise ConfigError(f'Unable to parse extra_sum. The following keys are required: {d2}') - raise RuntimeError() - return s - - def produce_fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset(self, - fits_computed_pseudoreplicas_chi2, prepend_total:bool=True, - extra_sums=None): - """Take the table returned by - ``fits_matched_pseudoreplicas_chi2_output`` and break it down - by experiment. If `preprend_total` is True, the sum over experiments - will be included. - - This provides a namespace list with `suptitle`, `ndata` and - `fits_replica_data_correlated`. 
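# --- Editor's aside (not part of the patch): the shape of the namespace list
# described above, with illustrative placeholder values (a real entry carries a
# DataFrame under 'fits_replica_data_correlated').
example_output = [
    {'experiment_label': 'Total',
     'by_dataset': [{'fits_replica_data_correlated': None,
                     'suptitle': 'Total',
                     'ndata': 4000}]},
    {'experiment_label': 'ATLAS',
     'by_dataset': [{'fits_replica_data_correlated': None,
                     'suptitle': 'ATLAS Total',
                     'ndata': 850}]},
]
# --- end aside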
- - """ - def get_ndata(df): - val = df.index.get_level_values(2).unique() - if len(val) != 1: - raise ConfigError(f"Found different number " - f"of points in {df.name}") - return val[0] - - df = fits_computed_pseudoreplicas_chi2 - - if prepend_total: - s = df.loc[(slice(None), 'Total'),:].groupby(level=3).sum(min_count=1) - ndata = df.loc[(slice(None), 'Total'),:].groupby(level=0).apply(get_ndata).sum(min_count=1) - total = [ - {'experiment_label': 'Total', - 'by_dataset': [{ - 'fits_replica_data_correlated': s, - 'suptitle': 'Total', - 'ndata': ndata - }]}] - else: - total = [] - - expres = [] - for exp, expdf in df.groupby(level=0): - d = {'experiment_label': exp} - by_dataset = d['by_dataset'] = [] - for ds, dsdf in expdf.groupby(level=1): - ndata = dsdf.groupby(level=0).apply(get_ndata).sum() - dsdf.index = dsdf.index.droplevel([0,1,2]) - - if ds == 'Total': - if exp != 'Total': - ds = f'{exp} Total' - by_dataset.insert(0, {'fits_replica_data_correlated': dsdf, - 'suptitle':ds, 'ndata':ndata}) - else: - by_dataset.append({'fits_replica_data_correlated': dsdf, - 'suptitle':ds, 'ndata':ndata}) - - expres.append(d) - - - if extra_sums: - dss = {d['suptitle'] for l in [*total, *expres] for d in l['by_dataset']} - for es in extra_sums: - label = es['dataset_item'] - components = es['components'] - diff = set(components) - dss - if diff: - bad_item = next(iter(diff)) - raise ConfigError(f"Unrecognized elements in extra_sum: {diff}", bad_item, dss) - - sliced = tableloader.get_extrasum_slice(df, components) - s = sliced.groupby(level=3).sum(min_count=1) - ndata = sliced.groupby(level=[0,1]).apply(get_ndata).sum() - - - - total.append( - {'experiment_label': label, - 'by_dataset': [{ - 'fits_replica_data_correlated': s, - 'suptitle': label, - 'ndata': ndata - }]}) - - - return [*total, *expres] - - def _breakup_by_dataset_item(self, l, dataset_items): - if dataset_items is None: - return [{**expdict, **dsdict} - for expdict in l for dsdict in expdict['by_dataset']] - - positions = {ds: pos for ds,pos in zip(dataset_items, range(len(dataset_items)))} - #NOTE: If you want duplicates for some reason, you'll need to rewrite - #this algorithm. - if len(positions) != len(dataset_items): - raise ConfigError("'dataset_items' cannot have duplicates") - - res = {} - - for expdict in l: - for dsdict in expdict['by_dataset']: - dsname = dsdict['suptitle'] - if dsname in positions: - res[positions[dsname]] = {**expdict, **dsdict} - del positions[dsname] - if positions: - raise ConfigError(f"Unrecognized dataset_items: {list(positions)}") - return [res[index] for index in range(len(dataset_items))] - - - def produce_fits_matched_pseudoreplicas_chi2_by_dataset_item( - self, - fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset, - dataset_items:(list,type(None)) = None): - """Reorder, filter and flatten the result of - fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset with the - dataset_items list. If it's not provided, this is equivalent to: - fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset::by_dataset - Otherwise, the dictionaries will be returned in the order they appear - in dataset_items, if they appear. 
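# --- Editor's aside (not part of the patch): the core flatten-and-merge step of
# ``_breakup_by_dataset_item`` above, on hypothetical data.
l = [{'experiment_label': 'E1',
      'by_dataset': [{'suptitle': 'DS1'}, {'suptitle': 'DS2'}]}]
flat = [{**expdict, **dsdict} for expdict in l for dsdict in expdict['by_dataset']]
# Each flattened dict now carries both 'experiment_label' and the dataset's
# 'suptitle' (plus the original 'by_dataset' list, which consumers ignore).
# --- end aside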
- """ - l = fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset - return self._breakup_by_dataset_item(l, dataset_items) - - def produce_matched_pseudoreplicas_for_total( - self, - fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset): - """Like ``fits_matched_pseudoreplicas_chi2_by_dataset_item``, but - restriction the ``dataset_item`` selection to "Total" exclusively.""" - res = self.produce_fits_matched_pseudoreplicas_chi2_by_dataset_item( - fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset, - ['Total']) - return res - - def produce_fits_replica_data_correlated_for_total( - self, matched_pseudoreplicas_for_total): - """Extract `fits_replica_data_correlated` from - `matched_pseudoreplicas_for_total`. This is a hack that cannot be - done efficiently with collect because of - https://github.com/NNPDF/reportengine/issues/8.""" - return [matched_pseudoreplicas_for_total[0]['fits_replica_data_correlated']] - - - def parse_fits_chi2_paramfits_output(self, fname:str, config_rel_path): - """Load the output of ``fits_chi2_table`` adapted to suit the - ``paramfits`` module. The fit names must be provided explicitly.""" - return self._get_table(tableloader.load_adapted_fits_chi2_table, - fname, config_rel_path) - - def produce_use_fits_chi2_paramfits_output(self, fits_chi2_paramfits_output, - fits_name): - ndatatable, chi2table = fits_chi2_paramfits_output - try: - chi2table = chi2table[fits_name] - except Exception as e: - raise ConfigError(f"Could not select the fit names from the table: {e}") from e - return {'adapted_fits_chi2_table': chi2table, 'ndatatable':ndatatable} - - def produce_fits_central_chi2_by_experiment_and_dataset(self, - adapted_fits_chi2_table, ndatatable, prepend_total=True,extra_sums=None): - """Take the table returned by - ``fits_matched_pseudoreplicas_chi2_output`` and break it down - by experiment. If `preprend_total` is True, the sum over experiments - will be included. 
- - This provides a namespace list with `suptilte` and - `fits_replica_data_correlated`.""" - - df = adapted_fits_chi2_table - - if prepend_total: - s = df.sort_index().loc[(slice(None), 'Total'), :].sum(min_count=1) - total = [ - {'experiment_label': 'Total', - 'by_dataset': [{ - 'fits_total_chi2': s, - 'suptitle': 'Total', - 'ndata': ndatatable.loc[(slice(None), 'Total')].sum(), - }]}] - else: - total = [] - expres = [] - for exp, expdf in df.groupby(level='experiment'): - d = {'experiment_label': exp} - by_dataset = d['by_dataset'] = [] - for ds, dsdf in expdf.groupby(level=1): - dsdf.index = dsdf.index.droplevel([0]) - ndata = ndatatable[(exp,ds)] - if ds == 'Total': - ds = f'{exp} Total' - by_dataset.insert(0, {'fits_total_chi2': dsdf, - 'suptitle':ds, 'ndata':ndata}) - else: - by_dataset.append({'fits_total_chi2': dsdf, - 'suptitle':ds, 'ndata':ndata}) - - expres.append(d) - - if extra_sums: - dss = {d['suptitle'] for l in [*total, *expres] for d in l['by_dataset']} - for es in extra_sums: - label = es['dataset_item'] - components = es['components'] - diff = set(components) - dss - if diff: - bad_item = next(iter(diff)) - raise ConfigError(f"Unrecognised element in extra sum: {diff}", bad_item, dss) - - sliced = tableloader.get_extrasum_slice(df, components) - s = sliced.sum() - ndata = tableloader.get_extrasum_slice(ndatatable, components).sum() - total.append( - {'experiment_label': label, - 'by_dataset': [{ - 'fits_total_chi2': s, - 'suptitle': label, - 'ndata': ndata, - }]}) - - return [*total, *expres] - - def produce_fits_central_chi2_by_dataset_item( - self, - fits_central_chi2_by_experiment_and_dataset, - dataset_items:(list,type(None)) = None): - """Reorder, filter and flatten the result of - fits_central_chi2_by_experiment_and_dataset with the - dataset_items list. If it's not provided, this is equivalent to: - fits_central_chi2_by_experiment_and_dataset::by_dataset - Otherwise, the dictionaries will be returned in the order they appear - in dataset_items, if they appear. - """ - l = fits_central_chi2_by_experiment_and_dataset - return self._breakup_by_dataset_item(l, dataset_items) - - def produce_fits_central_chi2_for_total( - self, - fits_central_chi2_by_experiment_and_dataset): - res = self.produce_fits_central_chi2_by_dataset_item( - fits_central_chi2_by_experiment_and_dataset, ['Total']) - return res - - # Define aliases for functions with spelling mistakes in their names which have now been corrected - # Do this so that old runcards still work - produce_combine_dataspecs_pseudorreplicas_as = produce_combine_dataspecs_pseudoreplicas_as - produce_fits_matched_pseudorreplicas_chi2_output = produce_fits_matched_pseudoreplicas_chi2_output - parse_fits_computed_psedorreplicas_chi2_output = parse_fits_computed_pseudoreplicas_chi2_output - produce_fits_matched_pseudorreplicas_chi2_by_experiment_and_dataset = produce_fits_matched_pseudoreplicas_chi2_by_experiment_and_dataset - produce_fits_matched_pseudorreplicas_chi2_by_dataset_item = produce_fits_matched_pseudoreplicas_chi2_by_dataset_item - produce_matched_pseudorreplcias_for_total = produce_matched_pseudoreplicas_for_total diff --git a/validphys2/src/validphys/paramfits/dataops.py b/validphys2/src/validphys/paramfits/dataops.py deleted file mode 100644 index 81c0992fc4..0000000000 --- a/validphys2/src/validphys/paramfits/dataops.py +++ /dev/null @@ -1,917 +0,0 @@ -""" -dataops.py - -This module implements the core functionality of the paramfits package, -currently focused on the αs determination. 
-It computes various statistics and formats the data in a form suitable to
-be consumed by plotting functions.
-"""
-import logging
-import warnings
-import functools
-from collections import defaultdict
-
-import numpy as np
-import pandas as pd
-import scipy.stats as stats
-
-from reportengine import collect
-from reportengine.floatformatting import format_error_value_columns, ValueErrorTuple, format_number
-from reportengine.checks import make_argcheck, CheckError, check_positive, check_not_empty
-from reportengine.table import table
-
-from validphys.checks import (
-    check_fits_different,
-    check_dataspecs_fits_different,
-    check_speclabels_different,
-)
-
-log = logging.getLogger(__name__)
-
-class StandardSampleWrapper:
-    """A class that holds a flat array of data, and has 'location' and
-    'scale' properties, which are the mean and the standard deviation.
-    The purpose of the class is to explicitly disallow calling np.mean
-    and np.std on the result while preserving functional backward
-    compatibility with the runcards."""
-    def __init__(self, data):
-        self.data = data
-
-    @property
-    def location(self):
-        return np.mean(self.data)
-
-    @property
-    def scale(self):
-        return np.std(self.data)
-
-class RobustSampleWrapper:
-    """Similar to ``StandardSampleWrapper``, but location and scale
-    are implemented as the median and the 68% interval respectively."""
-    def __init__(self, data):
-        self.data = data
-
-    @property
-    def location(self):
-        return np.median(self.data)
-
-    @property
-    def scale(self):
-        return (np.diff(np.percentile(self.data, [15.87, 84.13]))).item()/2
-
-
-def get_parabola(asvals, chi2vals):
-    """Return the three coefficients of a parabola χ²(αs) given a set of
-    asvals and a set of χ² values.
-    Only the finite χ² values are taken into account."""
-    chi2vals = np.ravel(chi2vals)
-    filt = np.isfinite(chi2vals)
-    return np.polyfit(np.asarray(asvals)[filt], chi2vals[filt], 2)
-
-#TODO: Export the total here. Not having it is causing huge pain elsewhere.
-@table
-@check_fits_different
-def fits_matched_pseudoreplicas_chi2_table(fits, fits_computed_pseudoreplicas_chi2):
-    """Collect the chi^2 of the pseudoreplicas in the fits into a single table,
-    grouped by nnfit_id.
-    The columns come in two levels, fit name and (total chi², n).
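# --- Editor's aside (not part of the patch): how ``pd.concat`` with ``keys``
# builds the two-level columns described above, on toy frames.
import pandas as pd

a = pd.DataFrame({'chi2': [1.1], 'n': [10]})
b = pd.DataFrame({'chi2': [0.9], 'n': [10]})
both = pd.concat([a, b], axis=1, keys=['fit_as_0117', 'fit_as_0119'])
print(both.columns.tolist())
# [('fit_as_0117', 'chi2'), ('fit_as_0117', 'n'),
#  ('fit_as_0119', 'chi2'), ('fit_as_0119', 'n')]
# --- end aside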
- The indexes also come in two levels: nnfit_id and experiment name.""" - return pd.concat(fits_computed_pseudoreplicas_chi2, axis=1, keys=map(str,fits)) - -@table -@check_dataspecs_fits_different -def dataspecs_matched_pseudoreplicas_chi2_table( - dataspecs_fit, dataspecs_computed_pseudoreplicas_chi2): - """Like ``fits_matched_pseudoreplicas_chi2_table`` but for arbitrary dataspecs""" - return fits_matched_pseudoreplicas_chi2_table(dataspecs_fit, dataspecs_computed_pseudoreplicas_chi2) - -@make_argcheck -def _check_badcurves(badcurves): - options = ['discard', 'minimum', 'allminimum'] - if badcurves not in options: - raise CheckError(f"badcurves must be one of {options}", - badcurves, options) - - - - -def _discard_sparse_curves(fits_replica_data_correlated, - max_ndiscarded): - """Return a table like `fits_replica_data_correlated` where the replicas - with too many discarded points have been filtered out.""" - - df = fits_replica_data_correlated - - def ap(x): - x.columns = x.columns.droplevel(0) - return (x['chi2']) - table = df.groupby(axis=1, level=0).apply(ap) - filt = table.isnull().sum(axis=1, min_count=1) < max_ndiscarded - - table = table[filt] - return table, filt - -@make_argcheck -def _check_discarded_string(max_ndiscarded): - arg = max_ndiscarded - if isinstance(arg,str): - if arg != 'auto': - raise CheckError("Expecting string to be 'auto'") - -@_check_discarded_string -def discarded_mask( - fits_replica_data_correlated_for_total, - fits_as, - max_ndiscarded:(int,str)='auto', - autodiscard_confidence_level:float=0.99, - trim_ndistant:int=0): - - """Return a table like `fits_replica_data_correlated` where the replicas - with too many discarded points have been filtered out. - - autodiscard_confidence_level is the student-T confidence level. Is normalised to 1 - and only is used if max_ndiscarded is set to 'auto' - - The automated discarding is done by estimating the uncertainty on the uncertainty by bootstrapping. - - The function returns a mask to be applied in fits_replica_data_with_discarded_replicas""" - - df = fits_replica_data_correlated_for_total[0] - - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - estimate = parabolic_as_determination(fits_as,df) - best_as = estimate.location - dist_best_as = -np.abs(best_as - fits_as) - to_remove = np.argpartition(dist_best_as, trim_ndistant)[:trim_ndistant] - as_mask = np.ones(df.shape[1], dtype=bool) - as_mask[to_remove] = False - - - - if isinstance(max_ndiscarded,int): - return _discard_sparse_curves(df,max_ndiscarded)[1], as_mask - - else: - best_error = np.inf - ndiscarded = range(len(fits_as),0,-1) - for i in range(len(ndiscarded),0,-1): - - tablefilt_total, auto_filt = _discard_sparse_curves(df,ndiscarded[i-1]) - least_points = tablefilt_total.notnull().sum(axis=1, min_count=1).min() - - #Number of points that pass the cuts - size = np.sum(auto_filt) - - #We can only fit a parabola with 3 points. - #Use a fouth to have in principle some error estimate. 
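# --- Editor's aside (not part of the patch): the bootstrap used just below, in
# isolation. Resampling the αs minima with replacement many times and taking the
# spread of the per-resample standard deviations estimates the error on the error.
import numpy as np

rng = np.random.default_rng(0)
minima = rng.normal(0.118, 0.002, size=50)             # stand-in for parabola minima
boot_stds = rng.choice(minima, (10000, minima.size)).std(axis=1)
print(boot_stds.std())                                  # uncertainty of the std itself
# --- end aside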
- if least_points > 3: - #Apply as mask before fitting - tablefilt_total = tablefilt_total.copy() - tablefilt_total.iloc[:,~as_mask] = np.NAN - parabolas = parabolic_as_determination(fits_as,tablefilt_total).data - bootstrap_est = np.random.choice(parabolas,(100000,size)).std(axis=1).std() - else: - bootstrap_est = np.inf - - stdT = stats.t.ppf((1-(1-autodiscard_confidence_level)/2), size-1) - current_err = bootstrap_est*stdT - - if current_err < best_error: - best_error = current_err - best_filt = auto_filt - - return best_filt, as_mask - -def fits_replica_data_with_discarded_replicas( - discarded_mask, - fits_replica_data_correlated): - """Applies mask from discarded_mask to dataframes""" - curve_mask, as_mask = discarded_mask - - discarded_replicas = fits_replica_data_correlated[curve_mask].copy() - #Set these to Nan instead to masking them away in order to not break - #all the apis that match this with fits_as. - discarded_replicas.iloc[:,~as_mask] = np.NAN - return discarded_replicas - - -def _parabolic_as_minimum_and_coefficient(fits_as, - fits_replica_data_with_discarded_replicas, - badcurves='discard'): - """This implements the fitting in ``parabolic_as_determination``. - Returns the minimum and the set of locations.""" - alphas = fits_as - - table = fits_replica_data_with_discarded_replicas.values - - minimums = [] - quadratic = [] - asarr = np.asarray(alphas) - for row in table: - filt = np.isfinite(row) - if not filt.any(): - continue - a,b,c = np.polyfit(asarr[filt], row[filt], 2) - quadratic.append(a) - if badcurves == 'allminimum': - minimums.append(asarr[filt][np.argmin(row[filt])]) - elif a>0: - minimums.append(-b/2/a) - elif badcurves == 'discard': - pass - elif badcurves == 'minimum': - minimums.append(asarr[filt][np.argmin(row[filt])]) - else: - raise RuntimeError("Unknown bad curves.") - quadratic = np.asarray(quadratic) - minimums = np.asarray(minimums) - return minimums, quadratic - - -def _parabolic_as_determination(fits_as, - fits_replica_data_with_discarded_replicas,badcurves='discard'): - - return _parabolic_as_minimum_and_coefficient( fits_as, - fits_replica_data_with_discarded_replicas, badcurves)[0] - -def quadratic_as_determination(fits_as, - fits_replica_data_with_discarded_replicas,badcurves='discard'): - - return _parabolic_as_minimum_and_coefficient(fits_as, - fits_replica_data_with_discarded_replicas, badcurves)[1] - - - -@make_argcheck -def _check_as_transform(as_transform): - values = (None, 'log', 'exp', 'logshift') - if not as_transform in values: - raise CheckError(f"The allowed valued values for " - f"as_transform are {values}", str(as_transform), - values[1:]) - -@make_argcheck -def _check_parabolic_as_statistics(parabolic_as_statistics): - values = 'standard', 'robust' - if parabolic_as_statistics not in values: - raise CheckError('The allowed values for' - f'`parabolic_as_statistics` are {values}', - parabolic_as_statistics, values) - - -@_check_badcurves -@_check_as_transform -@_check_parabolic_as_statistics -def parabolic_as_determination( - fits_as, - fits_replica_data_with_discarded_replicas, - badcurves='discard', - as_transform:(str, type(None))=None, - parabolic_as_statistics:str='standard'): - """Return the minima for alpha_s corresponding to the fitted curves. - ``badcuves`` specifies what to do with concave replicas and can be one of - 'discard', 'allminimum' - (which takes the minimum points - for *all* the replicas without fitting a parabola) or - 'minimum' (which takes the minimum value for the concave replicas). 
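# --- Editor's aside (not part of the patch): the per-replica parabolic fit in
# its simplest form, on synthetic data. The minimum of a·x² + b·x + c sits at
# -b/(2a), which is only meaningful for a convex curve (a > 0).
import numpy as np

asvals = np.array([0.114, 0.116, 0.118, 0.120, 0.122])
chi2 = 3000 + 8e5 * (asvals - 0.1185) ** 2              # synthetic convex χ² profile
a, b, c = np.polyfit(asvals, chi2, 2)
print(-b / (2 * a))                                     # ≈ 0.1185
# --- end aside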
- - If ``parabolic_as_statistics`` is ``"standard"``, means and standard - deviations will be used to compute statstics. Otherwise, if it is - ``"robust"``, medians and 68% intervals will be used. - - as_transform can be None, 'log', 'logshift' (``log(1+αs)``) or - 'exp' and is applied to the as_values and then reversed for the - minima. - """ - if as_transform == 'log': - fits_as = np.log(fits_as) - elif as_transform == 'logshift': - fits_as = np.log(fits_as + 1) - elif as_transform == 'exp': - fits_as = np.exp(fits_as) - minimums = _parabolic_as_determination( - fits_as, - fits_replica_data_with_discarded_replicas, badcurves) - # Invert the transform - if as_transform == 'log': - minimums = np.exp(minimums) - elif as_transform == 'exp': - minimums = np.log(minimums) - elif as_transform == 'logshift': - minimums = np.exp(minimums - 1) - if parabolic_as_statistics == 'standard': - res = StandardSampleWrapper(minimums) - elif parabolic_as_statistics == 'robust': - res = RobustSampleWrapper(minimums) - else: - raise RuntimeError("Unknown `parabolic_as_statistics`.") - return res - -def as_central_parabola( - fits_as, - fits_total_chi2): - """Return the coefficients corresponding to the parabolic fit to the - minimum of the pseudoreplicas""" - return get_parabola(fits_as, fits_total_chi2) - -as_datasets_central_parabolas = collect( - 'as_central_parabola', ['fits_central_chi2_by_dataset_item']) - -central_by_dataset_suptitle = collect('suptitle', ['fits_central_chi2_by_dataset_item']) -dataspecs_central_by_dataset_suptitle = collect('central_by_dataset_suptitle', ['dataspecs']) - -central_by_dataset_ndata = collect( - 'ndata', - ['fits_central_chi2_by_dataset_item',] -) -dataspecs_central_by_dataset_ndata = collect('central_by_dataset_ndata', ['dataspecs']) - -by_dataset_suptitle = collect( - 'suptitle', - ['fits_matched_pseudoreplicas_chi2_by_dataset_item',] -) - -dataspecs_dataset_suptitle = collect('by_dataset_suptitle', ['dataspecs']) - - -by_dataset_ndata = collect( - 'ndata', - ['fits_matched_pseudoreplicas_chi2_by_dataset_item',] -) - -dataspecs_dataset_ndata = collect('by_dataset_ndata', ['dataspecs']) - - -@table -def derivative_dispersion_table( - as_datasets_central_parabolas, - fits_as, - central_by_dataset_suptitle, - as_determination_from_central_chi2_for_total): - best_as = np.ravel(as_determination_from_central_chi2_for_total)[0] - d = {} - for label, p in zip(central_by_dataset_suptitle, as_datasets_central_parabolas): - d[label] = np.polyval(np.polyder(p), best_as) - - s = pd.Series(d) - s['SUM'] = np.sum(s) - s['SUM QUADRATURE'] = np.sqrt(np.sum(s**2)) - res = pd.DataFrame(s, columns=['Derivative']) - - return res - -dataspecs_as_central_parabolas = collect('as_datasets_central_parabolas', ['dataspecs']) - -def dataspecs_as_central_parabolas_map( - dataspecs_speclabel, - dataspecs_as_central_parabolas, - dataspecs_central_by_dataset_suptitle, - dataspecs_central_by_dataset_ndata): - """Return a dict-like datastucture with the central chi² of the form: - - d[dataset_name][dataspec] = parabola_coefficients/ndata - - for all dataset items and dataspecs. 
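# --- Editor's aside (not part of the patch): the ``derivative_dispersion_table``
# computation above, reduced to a single made-up parabola: the slope of the
# central χ² curve evaluated at the global best-fit αs.
import numpy as np

parabola = np.array([8.0e5, -1.896e5, 1.124e4])         # illustrative [a, b, c]
best_as = 0.1185
print(np.polyval(np.polyder(parabola), best_as))        # dχ²/dαs at best_as (≈ 0 here)
# --- end aside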
- """ - res = defaultdict(dict) - for label, parabolas, dsnames, ndatas in zip( - dataspecs_speclabel, - dataspecs_as_central_parabolas, - dataspecs_central_by_dataset_suptitle, - dataspecs_central_by_dataset_ndata): - for parabola, dsname, ndata in zip(parabolas, dsnames, ndatas): - res[dsname][label] = parabola/np.asarray(ndata) - return res - -def _aic(residuals, n, k): - return 2*k + n*np.log(residuals) + 2*k*(k+1)/(n-k-1) - -@table -def compare_aic(fits_as, fits_replica_data_with_discarded_replicas, suptitle): - """Compare the Akaike information criterion (AIC) for a - parabolic and a cubic fit. Note that - this does **not** yield the actual AIC score, but only the piece - necessary to compare least squared fit (i.e. assuming - iid gaussian noise for all points). This is: - - 2*k + n*log(sum(residuals squared)) - - The mean and standard deviation are taken across curves. - Note that this always uses the *discard* criterion: - That is, it ignores the curves that have no minimum.""" - alphas = fits_as - asarr = np.asarray(alphas) - - aic2s = [] - aic3s = [] - - table = fits_replica_data_with_discarded_replicas.values - for row in table: - filt = np.isfinite(row) - asfilt = asarr[filt] - rowfilt = row[filt] - n = len(rowfilt) - - p2, res2, *stuff = np.polyfit(asfilt, rowfilt, 2, full=True) - if p2[0] <= 0: - pass - #log.warning(f"Concave parabola computing AIC in {suptitle}") - else: - aic2 = _aic(res2, n, k=4) - aic2s.append(aic2) - - p3, res3, *stuff = np.polyfit(asfilt, rowfilt, 3, full=True) - - extrema = np.roots(np.polyder(p3)) - #Cast away the zero complex part - candidates = np.real(extrema[np.isreal(extrema)]) - if not len(candidates): - pass - #log.warning(f"Bad cubic minimum computing AIC in {suptitle}") - else: - aic3 = _aic(res3, n, k=5) - aic3s.append(aic3) - v2, e2 = np.mean(aic2s), np.std(aic2s) - v3, e3 = np.mean(aic3s), np.std(aic3s) - - qp = "Quadratic polynomial" - cp = "Cubic polynomial" - - df = pd.DataFrame({'mean': {qp: v2, cp: v3}, 'error': {qp: e2, cp:e3}, - 'n minima':{qp: len(aic2s), cp: len(aic3s)}}, - columns=['mean', 'error', 'n minima']) - format_error_value_columns(df, 'mean', 'error', inplace=True) - return df - - -def as_determination_from_central_chi2(fits_as, fits_total_chi2): - """Return the alpha_s from the minimum chi² and the Delta_chi²=1 error - from a quadratic fit to the total chi².""" - alphas = fits_as - chi2s = np.ravel(fits_total_chi2) - a,b,c = np.polyfit(alphas, chi2s, 2) - if a<=0: - log.error("Found non convex parabola when computing the quadratic fit.") - return np.nan, np.nan - return ValueErrorTuple(-b/(2*a), 1/(np.sqrt(a))) - -def parabolic_as_determination_with_tag(parabolic_as_determination, suptitle): - """Convenience function to collect the arguments together. It is an identity""" - return parabolic_as_determination, suptitle - -def quadratic_as_determination_with_tag(quadratic_as_determination, suptitle): - """Convenience function to collect the arguments together. It is an identity""" - return quadratic_as_determination, suptitle - -def as_determination_from_central_chi2_with_tag( - as_determination_from_central_chi2, suptitle): - """Convenience function to collect the arguments together. 
It is an identity"""
-
-    return as_determination_from_central_chi2, suptitle
-
-
-as_datasets_pseudoreplicas_chi2 = collect(
-    parabolic_as_determination_with_tag,
-    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',]
-)
-
-as_datasets_central_chi2 = collect(
-    as_determination_from_central_chi2_with_tag,
-    ['fits_central_chi2_by_dataset_item']
-)
-
-parabolic_as_determination_for_total = collect(parabolic_as_determination,
-    ['matched_pseudoreplicas_for_total'])
-
-as_determination_from_central_chi2_for_total = collect(
-    as_determination_from_central_chi2, ['fits_central_chi2_for_total'])
-
-
-quadratic_datasets_pseudoreplicas_chi2 = collect(
-    quadratic_as_determination_with_tag,
-    ['fits_matched_pseudoreplicas_chi2_by_dataset_item',]
-)
-
-dataspecs_parabolic_as_determination_for_total = collect(
-    'parabolic_as_determination_for_total', ['dataspecs'])
-
-@check_positive('nresamplings')
-def bootstrapping_stats_error(parabolic_as_determination, nresamplings:int=100000, suptitle=""):
-    """Compute the bootstrapping uncertainty of the distribution of
-    determinations of αs, by resampling the list of points with replacement
-    from the original sampling distribution `nresamplings` times
-    and then computing the standard deviation of the means."""
-    distribution = parabolic_as_determination.data
-    shape = (nresamplings, len(distribution))
-    if not len(distribution):
-        log.error("Cannot compute stats error. Empty data.")
-        return np.nan
-    return np.random.choice(distribution, shape).mean(axis=1).std()
-
-@check_positive('nresamplings')
-def bootstrapping_stats_error_on_the_error(parabolic_as_determination, nresamplings:int=100000, suptitle=""):
-    """Compute the bootstrapping uncertainty of the standard deviation of the
-    parabolic determination."""
-    distribution = parabolic_as_determination.data
-    shape = (nresamplings, len(distribution))
-    if not len(distribution):
-        log.error("Cannot compute stats error. Empty data.")
-        return np.nan
-    return np.random.choice(distribution, shape).std(axis=1).std()
-
-
-@check_positive('nresamplings')
-def half_sample_stats_error(parabolic_as_determination, nresamplings:int=100000):
-    """Like the bootstrapping error, but using only half of the data"""
-    sample = parabolic_as_determination.data
-    distribution = sample[:len(sample)//2]
-    if not len(distribution):
-        log.error("Cannot compute half stats.
Too few data") - return np.nan - shape = (nresamplings, len(distribution)) - return np.random.choice(distribution, shape).mean(axis=1).std() - - - - -as_datasets_bootstrapping_stats_error = collect(bootstrapping_stats_error, - ['fits_matched_pseudoreplicas_chi2_by_dataset_item',] -) - -as_datasets_bootstrapping_stats_error_on_the_error = collect( - bootstrapping_stats_error_on_the_error, - ['fits_matched_pseudoreplicas_chi2_by_dataset_item',] -) - -as_datasets_half_sample_stats_error = collect(half_sample_stats_error, - ['fits_matched_pseudoreplicas_chi2_by_dataset_item',] -) - - -#Don't write complicated column names everywhere -ps_mean = "pseudoreplica mean" -ps_error = "pseudoreplica error" -ps_stat_error = "pseudoreplica stat" -ps_half_stat_error = "pseudoreplica halfstat" -stats_ratio = r"$\frac{halfstat}{stat}/\sqrt 2$" -n = 'n' - -stats_halfone = "cv selecting one half of the replicas" -err_halfone = "err selecting one half of the replicas" - -stats_halfother = "cv selecting other half of the replicas" -err_halfonother = "err selecting other half of the replicas" - -stats_err_err = "Stat error on the error" - -ps_cols = (ps_mean, ps_error ,n, ps_stat_error, ps_half_stat_error, - stats_halfone, err_halfone, stats_halfother, err_halfonother, - stats_err_err) - - -cv_mean = "central mean" -cv_error = "central error" - -def pseudoreplicas_stats_error( - as_datasets_pseudoreplicas_chi2, - as_datasets_bootstrapping_stats_error, - as_datasets_bootstrapping_stats_error_on_the_error, - as_datasets_half_sample_stats_error): - """Return a dictionary (easily convertible to a DataFrame) with the mean, - error and the measures of statistical error for each dataset.""" - d = defaultdict(dict) - - for (distribution, tag), statserr, staterrerr, halfstaterr in zip( - as_datasets_pseudoreplicas_chi2, - as_datasets_bootstrapping_stats_error, - as_datasets_bootstrapping_stats_error_on_the_error, - as_datasets_half_sample_stats_error): - d[ps_mean][tag] = distribution.location - d[n][tag] = len(distribution.data) - d[ps_error][tag] = distribution.scale - d[ps_stat_error][tag] = statserr - d[ps_half_stat_error][tag] = halfstaterr - d[stats_ratio][tag] = halfstaterr/statserr/np.sqrt(2) - d[stats_err_err][tag] = staterrerr - - ldh = len(distribution.data)//2 - onehalf = distribution.data[:ldh] - otherhalf = distribution.data[ldh:] - d[stats_halfone][tag] = np.mean(onehalf) - d[err_halfone][tag] = np.std(onehalf) - d[stats_halfother][tag] = np.mean(otherhalf) - d[err_halfonother][tag] = np.std(otherhalf) - - return dict(d) - -dataspecs_pseudoreplica_stats_error = collect(pseudoreplicas_stats_error, ['dataspecs']) - -#TODO: This is deprecated FAPP -@make_argcheck -def check_dataset_items(dataset_items, dataspecs_dataset_suptitle): - """Check that the dataset_items are legit.""" - if dataset_items is None: - return - try: - s = set(dataset_items) - except Exception as e: - raise CheckError(f'dataset_items must be a list of strings: {e}') from e - - flat = [item for l in dataspecs_dataset_suptitle for item in l] - d = s - set(flat) - if d: - raise CheckError(f"The following dataset_items are unrecognized: {d}") - - - -def compare_determinations_table_impl( - pseudoreplicas_stats_error, - as_datasets_central_chi2): - """Produce a table by experiment comparing the alpha_S determination - from pseudoreplicas and from central values.""" - - - #Use this to get the right sorting - d = defaultdict(dict) - tags = [] - for (cv, error), tag in as_datasets_central_chi2: - d[cv_mean][tag] = cv - d[cv_error][tag] = error - 
- tags.append(tag) - - d.update(pseudoreplicas_stats_error) - - df = pd.DataFrame(d, columns=[*ps_cols, cv_mean, cv_error]) - df = df.loc[tags] - return df - -@table -@check_speclabels_different -def dataspecs_stats_error_table( - dataspecs_pseudoreplica_stats_error, - dataspecs_dataset_suptitle, - dataspecs_speclabel, - dataset_items:(type(None), list) = None, - ): - """Return a table with the stats errors of the pseudoreplica determination - of each dataspec""" - dfs = [] - for d in dataspecs_pseudoreplica_stats_error: - df = pd.DataFrame(d, columns=ps_cols) - format_error_value_columns(df, ps_mean, ps_error) - format_error_value_columns(df, stats_halfone, err_halfone) - format_error_value_columns(df, stats_halfother, err_halfonother) - - dfs.append(df) - table = pd.concat(dfs, axis=1, keys=dataspecs_speclabel) - if dataset_items is not None: - table = table.loc[dataset_items] - return table - -@table -def dataspecs_stats_error_table_transposed(dataspecs_stats_error_table): - """Transposed version of dataspecs_stats_error_table for display - purposes.""" - return dataspecs_stats_error_table.T - -@table -def compare_determinations_table(compare_determinations_table_impl): - """Return ``compare_determinations_table_impl`` formatted nicely""" - df = compare_determinations_table_impl - format_error_value_columns(df, ps_mean, - ps_error, inplace=True) - format_error_value_columns(df, cv_mean, - cv_error, inplace=True) - stats_cols = {ps_stat_error, ps_half_stat_error, stats_ratio} - #Don't fail if/when we remove a table from here - stats_cols &= set(df.columns) - stats_cols = list(stats_cols) - - digits2 = functools.partial(format_number, digits=2) - df[stats_cols] = df[stats_cols].applymap(digits2) - return df - -dataspecs_as_datasets_pseudoreplicas_chi2 = collect('as_datasets_pseudoreplicas_chi2', ['dataspecs']) - -quad_as_datasets_pseudoreplicas_chi2 = collect('quadratic_datasets_pseudoreplicas_chi2',['dataspecs']) - - -#TODO: Deprecate fixup dataset_items earlier -@check_speclabels_different -@check_dataset_items -@table -def dataspecs_ndata_table( - dataspecs_dataset_suptitle, - dataspecs_dataset_ndata, - dataspecs_speclabel, - dataset_items:(list, type(None))=None): - """Return a table with the same index as - dataspecs_as_value_error_table_impl with the number of points - per dataset.""" - d = {} - for dslabel, datanames, ndatas in zip(dataspecs_speclabel, - dataspecs_dataset_suptitle, - dataspecs_dataset_ndata): - d[dslabel] = dict(zip(datanames, ndatas)) - df = pd.DataFrame(d) - if dataset_items is not None: - df = df.loc[dataset_items] - return df - -@check_speclabels_different -@check_dataset_items -def dataspecs_quad_table_impl( - quad_as_datasets_pseudoreplicas_chi2, dataspecs_speclabel, - dataspecs_dataset_suptitle, - dataset_items:(list, type(None)) = None, - display_n:bool = False, - ): - """Return a table with the mean and error of the quadratic coefficient of the parabolic - determinations across dataspecs. 
If display_n is True, a column showing the number of points - will be added to the table""" - tables = [] - taglist = {} - if display_n: - cols = ['mean', 'error', 'n'] - else: - cols = ['mean', 'error'] - for dets in quad_as_datasets_pseudoreplicas_chi2: - d = defaultdict(dict) - - for distribution, tag in dets: - d['mean'][tag] = np.mean(distribution) - d['error'][tag] = np.std(distribution) - - if display_n: - d['n'][tag] = len(distribution) - taglist[tag] = None - - tables.append(pd.DataFrame(d, columns=cols)) - - df = pd.concat(tables, axis=1, keys=dataspecs_speclabel) - if dataset_items is None: - ordered_keys = list(taglist) - - else: - ordered_keys = dataset_items - - df = df.loc[ordered_keys] - return df - - - -@check_speclabels_different -@check_dataset_items -def dataspecs_as_value_error_table_impl( - dataspecs_as_datasets_pseudoreplicas_chi2, dataspecs_speclabel, - dataspecs_dataset_suptitle, - dataset_items:(list, type(None)) = None, - display_n:bool = False, - ): - """Return a table with the mean and error of the as determinations across - dataspecs. If display_n is True, a column showing the number of points - will be added to the table""" - tables = [] - #Use the fact that in py3.6 a dict with None values is like an ordered set - #TODO: A better way to build the dataframe? - taglist = {} - if display_n: - cols = ['mean', 'error', 'n'] - else: - cols = ['mean', 'error'] - for dets in dataspecs_as_datasets_pseudoreplicas_chi2: - d = defaultdict(dict) - - for distribution, tag in dets: - d['mean'][tag] = distribution.location - d['error'][tag] = distribution.scale - - - if display_n: - d['n'][tag] = len(distribution.data) - taglist[tag] = None - - tables.append(pd.DataFrame(d, columns=cols)) - - - df = pd.concat(tables, axis=1, keys=dataspecs_speclabel) - if dataset_items is None: - ordered_keys = list(taglist) - else: - ordered_keys = dataset_items - - df = df.loc[ordered_keys] - - - - return df - -@table -def dataspecs_as_value_error_table(dataspecs_as_value_error_table_impl): - """Return ``dataspecs_value_error_table_impl`` formatted nicely""" - def f(x): - return format_error_value_columns(x, x.columns[0], x.columns[1]) - return dataspecs_as_value_error_table_impl.groupby(level=0, axis=1).apply(f) - -@table -def dataspecs_as_value_error_table_transposed(dataspecs_as_value_error_table): - """Transposed version of ``dataspecs_as_value_error_table``. - Useful for printing""" - return dataspecs_as_value_error_table.T - -@table -def dataspecs_quad_value_error_table(dataspecs_quad_table_impl): - """Return ``dataspecs_value_error_table_impl`` formatted nicely""" - def f(x): - return format_error_value_columns(x, x.columns[0], x.columns[1]) - return dataspecs_quad_table_impl.groupby(level=0, axis=1).apply(f) - -dataspecs_fits_as = collect('fits_as', ['dataspecs']) - -by_dataset_as_chi2 = collect( - fits_replica_data_with_discarded_replicas, - ['fits_matched_pseudoreplicas_chi2_by_dataset_item',]) - -dataspecs_fits_replica_data_with_discarded_replicas = collect( - 'by_dataset_as_chi2', ['dataspecs']) - -@check_not_empty('dataspecs_dataset_suptitle') -def dataspecs_chi2_by_dataset_dict(dataspecs_dataset_suptitle, - dataspecs_fits_replica_data_with_discarded_replicas, - dataspecs_fits_as, - ): - """Return a table-like dict with the - suptitle: [] - - where each table is ``fits_replica_data_with_discarded_replicas`` resolved - for the given dataset in each of the dataspecs. 
- """ - allkeys = set(dataspecs_dataset_suptitle[0]) - for newkeys in dataspecs_dataset_suptitle[1:]: - newkeys = set(newkeys) - symdiff = newkeys ^ allkeys - if symdiff: - log.warning(f"Some datasets are not " - f"common across all dataspecs {symdiff}") - allkeys |= symdiff - - res = defaultdict(list) - for keys, values, asvals in zip(dataspecs_dataset_suptitle, - dataspecs_fits_replica_data_with_discarded_replicas, - dataspecs_fits_as): - for k, v in zip(keys, values): - v.columns = asvals - res[k].append(v) - for k in allkeys-set(keys): - res[k].append(None) - return res - - -as_dataset_pseudodata = collect( - fits_replica_data_with_discarded_replicas, - ['fits_matched_pseudoreplicas_chi2_by_dataset_item',] -) - -@table -def as_parabolic_coefficient_table( - fits_as, - by_dataset_suptitle, - as_dataset_pseudodata): - """Return a table of the parabolic fit of each dataset item, for each - correlated replica. The index is the correlated_replica index and there - are four columns for each dataset: 'a', 'b' and 'c' corresponding to the - parabolic coefficients and 'min', which is ``-b/2/a`` if 'a' is positive, - and NaN otherwise.""" - alphas = np.asarray(fits_as) - tb_polys = [] - #Easier to repeat the polyfit code here than to change the format of the - #various outputs. - for tb in as_dataset_pseudodata: - polys = [] - for row in np.asarray(tb): - filt = np.isfinite(row) - if not filt.any(): - polys.append(np.array([np.nan]*3)) - continue - poly = np.polyfit(alphas[filt], row[filt], 2) - polys.append(poly) - frame = pd.DataFrame(polys, index=tb.index, columns=['a','b', 'c']) - frame['min'] = -frame['b']/2/frame['a'] - frame.loc[frame['a']<0,'min'] = np.nan - tb_polys.append(frame) - final_table = pd.concat(tb_polys, axis=1, keys=by_dataset_suptitle) - return final_table - -# Define aliases for functions with spelling mistakes in their names which have now been corrected -# Do this so that old runcards still work -fits_matched_pseudorreplicas_chi2_table = fits_matched_pseudoreplicas_chi2_table -dataspecs_matched_pseudorreplicas_chi2_table = dataspecs_matched_pseudoreplicas_chi2_table -as_datasets_pseudorreplicas_chi2 = as_datasets_pseudoreplicas_chi2 -quadratic_datasets_pseudorreplicas_chi2 = quadratic_datasets_pseudoreplicas_chi2 -pseudorreplicas_stats_error = pseudoreplicas_stats_error -datasepecs_pseudorreplica_stats_error = dataspecs_pseudoreplica_stats_error -dataspecs_as_datasets_pseudorreplicas_chi2 = dataspecs_as_datasets_pseudoreplicas_chi2 -quad_as_datasets_pseudorreplicas_chi2 = quad_as_datasets_pseudoreplicas_chi2 -datasepecs_quad_table_impl = dataspecs_quad_table_impl -datasepecs_as_value_error_table_impl = dataspecs_as_value_error_table_impl diff --git a/validphys2/src/validphys/paramfits/plots.py b/validphys2/src/validphys/paramfits/plots.py deleted file mode 100644 index 74c2de3da6..0000000000 --- a/validphys2/src/validphys/paramfits/plots.py +++ /dev/null @@ -1,895 +0,0 @@ -""" -plots.py - -Plots for the paramfits package. 
-""" -import logging -import numbers - -import numpy as np -import pandas as pd -from scipy import stats - -from reportengine.floatformatting import format_number -from reportengine.checks import make_argcheck, CheckError, check_positive -from reportengine.figure import figure, figuregen - -from validphys import plotutils -from validphys.plotutils import barplot, kde_plot, marker_iter_plot, plot_horizontal_errorbars -from validphys.paramfits.dataops import check_dataset_items, get_parabola - -log = logging.getLogger(__name__) - -@figure -def plot_fits_as_profile(fits_as, fits_total_chi2, suptitle=None): - """Plot the total central chi² as a function of the value of α_s. - Note that this plots as a function of the key "AlphaS_MZ" in the LHAPDF - file, which is annoyingly *not* α_s(MZ) for Nf<5.""" - fig, ax = plotutils.subplots() - alphas = fits_as - #Could be a transposed data frame - fits_total_chi2 = np.ravel(fits_total_chi2) - ax.plot(alphas, fits_total_chi2) - ax.set_xlabel(r'$\alpha_S$') - ax.set_ylabel(r'$\chi²$') - if suptitle: - fig.suptitle(suptitle) - return fig - -@figure -def plot_as_central_parabola( - fits_as, - as_central_parabola, - suptitle, ndata, - parabolic_as_determination_for_total, - markmin:bool=False): - """Plot a parabola with the central chi² per number of points, marking - the chi² at the total best fit.""" - fig, ax = plotutils.subplots() - asarr = np.linspace(min(fits_as), max(fits_as), 100) - as_central_parabola = np.asarray(as_central_parabola) - ndata = int(ndata) - ax.plot(asarr, np.polyval(as_central_parabola, asarr)/ndata) - - best_as = parabolic_as_determination_for_total[0].location - chi2_at_best = np.polyval(as_central_parabola, best_as)/ndata - ax.scatter(best_as, chi2_at_best) - ax.annotate(format_number(chi2_at_best, 3), (best_as, chi2_at_best)) - if markmin: - ax.axvline(-as_central_parabola[1]/2/as_central_parabola[0]) - ax.set_ylabel(r'$\chi^2/N_{data}$') - ax.set_xlabel(r'$\alpha_S$') - ax.set_title(f"{suptitle}") - return fig - -@figure -def plot_as_cummulative_central_chi2(fits_as, - as_datasets_central_parabolas, - central_by_dataset_suptitle): - """Plot the cumulative total chi² for each of the datasets""" - fig, ax = plotutils.subplots() - nx = 100 - asarr = np.linspace(min(fits_as), max(fits_as), nx) - last = np.zeros(nx) - for (p, label) in zip(as_datasets_central_parabolas, central_by_dataset_suptitle): - val = last + np.polyval(p, asarr) - ax.fill_between(asarr, last, val, label=label) - last = val - ax.legend() - ax.set_ylabel(r'$\chi^2$') - ax.set_xlabel(r'$\alpha_S$') - ax.set_ylim(0) - ax.set_xlim(asarr[[0,-1]]) - - return fig - -@figure -def plot_as_cummulative_central_chi2_diff(fits_as, - as_datasets_central_parabolas, - central_by_dataset_suptitle, - parabolic_as_determination_for_total, - ymax:(numbers.Real, type(None))=None): - """Plot the cumulative difference between the χ² at the best global - αs fit and the χ² at αs. If the difference is negative, it is set to zero. 
- """ - fig, ax = plotutils.subplots() - nx = 100 - best_as = parabolic_as_determination_for_total[0].location - asarr = np.linspace(min(fits_as), max(fits_as), nx) - last = np.zeros(nx) - for (p, label) in zip(as_datasets_central_parabolas, central_by_dataset_suptitle): - delta = np.polyval(p, asarr) - np.polyval(p, best_as) - delta[delta<0] = 0 - val = last + delta - ax.fill_between(asarr, last, val, label=label) - last = val - ax.legend() - ax.set_ylabel(r'$\chi^2 - \chi^2_{{\rm best}\ \alpha_S}$') - ax.set_xlabel(r'$\alpha_S$') - ax.set_ylim(0) - if ymax is not None: - ax.set_ylim(ymax=ymax) - ax.set_xlim(asarr[[0,-1]]) - - return fig - -@figure -def plot_as_cummulative_central_chi2_diff_underflow( - fits_as, - as_datasets_central_parabolas, - central_by_dataset_suptitle, - parabolic_as_determination_for_total, - ymax:(numbers.Real, type(None))=None): - """Plot the cumulative difference between the χ² at the best global - αs fit and the χ² at αs. If the difference is negative, it is set to zero. - """ - fig, ax = plotutils.subplots() - nx = 100 - best_as = parabolic_as_determination_for_total[0].location - asarr = np.linspace(min(fits_as), max(fits_as), nx) - - #ordering = [np.polyval(p, best_as + 0.001) - np.polyval(p, best_as) for p in as_datasets_central_parabolas] - #ordered_indexes = sorted(range(len(ordering)), key=ordering.__getitem__) - #as_datasets_central_parabolas = [as_datasets_central_parabolas[i] for i in ordered_indexes] - #central_by_dataset_suptitle = [central_by_dataset_suptitle[i] for i in ordered_indexes] - - - - last = np.zeros(nx) - for (p, label) in zip(as_datasets_central_parabolas, central_by_dataset_suptitle): - delta = np.polyval(p, asarr) - np.polyval(p, best_as) - delta[delta<0] = 0 - val = last + delta - ax.fill_between(asarr, last, val, label=label) - last = val - last = np.zeros(nx) - for (p, label) in zip(as_datasets_central_parabolas, central_by_dataset_suptitle): - delta = np.polyval(p, asarr) - np.polyval(p, best_as) - delta[delta>0] = 0 - val = last + delta - ax.fill_between(asarr, last, val) - last = val - ax.legend() - ax.set_ylabel(r'$\chi^2 - \chi^2_{{\rm best}\ \alpha_S}$') - ax.set_xlabel(r'$\alpha_S$') - ax.set_xlim(asarr[[0,-1]]) - if ymax is not None: - ax.set_ylim(ymax=ymax) - ymin = np.min(last) - ax.set_ylim(ymin=ymin) - - return fig - -@figure -def plot_as_cummulative_central_chi2_diff_negative( - fits_as, - as_datasets_central_parabolas, - central_by_dataset_suptitle, - parabolic_as_determination_for_total): - """Plot the cumulative difference between the χ² at the best global - αs fit and the χ² at αs. If the difference is negative, it is set to zero. - """ - """Plot the cumulative difference between the χ² at the best global - αs fit and the χ² at αs. If the difference is negative, it is set to zero. 
- """ - fig, ax = plotutils.subplots() - nx = 100 - best_as = parabolic_as_determination_for_total[0].location - asarr = np.linspace(min(fits_as), max(fits_as), nx) - last = np.zeros(nx) - for (p, label) in zip(as_datasets_central_parabolas, central_by_dataset_suptitle): - delta = np.polyval(p, asarr) - np.polyval(p, best_as) - delta[delta>0] = 0 - val = last + delta - ax.fill_between(asarr, last, val, label=label) - last = val - #Seems that fill_between doesn't compute for the legend location - ax.legend(loc='lower left') - ax.set_ylabel(r'$\chi^2 - \chi^2_{{\rm best}\ \alpha_S}$') - ax.set_xlabel(r'$\alpha_S$') - ymin = np.min(last) - ax.set_ylim(ymin=ymin) - ax.set_xlim(asarr[[0,-1]]) - - return fig -@figuregen -def plot_dataspecs_central_parabolas( - dataspecs_as_central_parabolas_map, - dataspecs_fits_as): - """Plot the parabolas resulting from the chi² of the mean PDF to the data, - as a function of alpha_S. Yield one plot per ``dataset_item`` comparing - several dataspecs.""" - #Note that cannot cast as a matrix if shapes are different - limits = min(map(min, dataspecs_fits_as)), max(map(max, dataspecs_fits_as)) - alphasaxis = np.linspace(*limits, 100) - for dsname, dataspecs_parabolas in dataspecs_as_central_parabolas_map.items(): - fig, ax = plotutils.subplots() - for label, parabola in dataspecs_parabolas.items(): - ax.plot(alphasaxis, np.polyval(parabola, alphasaxis), label=label) - - ax.set_ylabel(r'$\chi^2/N_{dat}$') - ax.set_xlabel(r'$\alpha_S$') - #ax.set_ylim(0) - ax.set_title(dsname) - ax.set_xlim(alphasaxis[[0,-1]]) - - ax.legend() - - yield fig - -@figure -def plot_as_datasets_pseudoreplicas_chi2(as_datasets_pseudoreplicas_chi2): - """Plot the error bars of the αs determination from pseudoreplicas - by dataset item. Note that this only has meaning of preferred - value for "Total", and the rest of the values are the minima of - the partial χ².""" - data, names = zip(*as_datasets_pseudoreplicas_chi2) - cv, err = zip(*[(np.mean(dt), np.std(dt)) for dt in data]) - fig, ax = plot_horizontal_errorbars([cv], [err], names) - ax.set_xlabel(r"$\alpha_S$") - ax.set_title(r"$\alpha_S$ from pseudoreplicas") - return fig - -@figure -def plot_as_exepriments_central_chi2(as_datasets_central_chi2): - """Plot the error bars of the αs determination from central χ² - by dataset item. 
Note that this only has meaning of preferred - value for "Total", and the rest of the values are the minima of - the partial χ².""" - data, names = zip(*as_datasets_central_chi2) - cv, err = zip(*data) - fig, ax = plot_horizontal_errorbars([cv], [err], names) - ax.set_xlabel(r"$\alpha_S$") - ax.set_title(r"$\alpha_S$ from central chi²") - return fig - - -@figure -def plot_as_datasets_compare(as_datasets_pseudoreplicas_chi2, - as_datasets_central_chi2, - marktotal:bool=True): - """Plot the result of ``plot_as_datasets_pseudoreplicas_chi2`` and - ``plot_as_exepriments_central_chi2`` together.""" - datapseudo, namespseudo = zip(*as_datasets_pseudoreplicas_chi2) - cvpseudo, errpseudo = zip(*[(np.mean(dt), np.std(dt)) for dt in datapseudo]) - - - datacentral, namescentral = zip(*as_datasets_central_chi2) - cvcentral, errcentral = zip(*datacentral) - - if namespseudo != namescentral: - raise RuntimeError("Names do not coincide") - - fig, ax = plot_horizontal_errorbars( - [cvcentral, cvpseudo], [errcentral, errpseudo], namescentral, - [r'Central $\chi^2$', r'Pseudoreplica $\chi^2$'] - ) - if marktotal: - try: - pos = namespseudo.index('Total') - except ValueError: - log.error("Asked to mark total, but it was not provided.") - else: - ax.axvline(cvcentral[pos], color='C0', linewidth=0.5, linestyle='--') - ax.axvline(cvpseudo[pos], color='C1', linewidth=0.5, linestyle='--') - - ax.set_xlabel(r"$\alpha_S$") - ax.set_title(r"$\alpha_S$ determination") - ax.legend() - return fig - -@figure -def plot_dataspecs_as_value_error(dataspecs_as_value_error_table_impl, - dataspecs_fits_as, - marktotal:bool=True, fix_limits:bool=True): - """ - Plot the result for each dataspec of the pseudoreplica alpha_s - determination based on the partial chi² for each ``dataset_item``. - - If ``marktotal`` is True, a vertical line will appear marking the position - of the best fit. - - If ``fix_limits`` is True, the limits of the plot will span all the fitted - values. Otherwise an heuristic will be used. - - """ - - df = dataspecs_as_value_error_table_impl - datalabels = df.columns.levels[0] - catlabels = list(df.index) - cvs = df.loc[:, (slice(None), 'mean')].T.values - errors = df.loc[:, (slice(None), 'error')].T.values - - if fix_limits: - minlim = min(min(x for x in dataspecs_fits_as)) - maxlim = max(max(x for x in dataspecs_fits_as)) - lims = minlim, maxlim - else: - lims = None - - - fig, ax = plot_horizontal_errorbars( - cvs, errors, catlabels, - datalabels, - xlim = lims - - ) - - if marktotal: - try: - pos = catlabels.index('Total') - except ValueError: - log.error("Asked to mark total, but it was not provided.") - else: - for i,cv in enumerate(cvs): - ax.axvline(cv[pos], color=f'C{i%10}', linewidth=0.5, linestyle='--') - - ax.set_xlabel(r"$\alpha_S$") - ax.set_title(r"$\alpha_S$ determination") - ax.legend() - return fig - -@figure -def plot_dataspecs_as_value_error_comparing_with_central( - dataspecs_as_value_error_table_impl, - as_datasets_central_chi2, - dataspecs_fits_as, - speclabel, - marktotal:bool=True, fix_limits:bool=True): - """ - This is an aberration we need to do for the paper plots. It compares the - central (old) and new partial chi². 
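# --- Editor's aside (not part of the patch): the MultiIndex column selection
# these plotting functions rely on, with a toy two-dataspec table.
import pandas as pd

cols = pd.MultiIndex.from_product([['NLO', 'NNLO'], ['mean', 'error']])
df = pd.DataFrame([[0.1181, 0.0012, 0.1185, 0.0008]],
                  index=['Total'], columns=cols)
print(df.loc[:, (slice(None), 'mean')])   # the 'mean' column of every dataspec
print(df.loc[:, ('NNLO', 'mean')])        # a single dataspec's central values
# --- end aside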
-@figure
-def plot_dataspecs_as_value_error_comparing_with_central(
-        dataspecs_as_value_error_table_impl,
-        as_datasets_central_chi2,
-        dataspecs_fits_as,
-        speclabel,
-        marktotal:bool=True, fix_limits:bool=True):
-    """
-    This is an aberration we need to do for the paper plots. It compares the
-    central (old) and the new partial χ².
-    """
-    df = dataspecs_as_value_error_table_impl
-    catlabels = list(df.index)
-    replica_cvs = df.loc[:, (speclabel, 'mean')].T.values
-    replica_errors = df.loc[:, (speclabel, 'error')].T.values
-
-    datacentral, namescentral = zip(*as_datasets_central_chi2)
-    cvcentral, errcentral = zip(*datacentral)
-
-    if fix_limits:
-        minlim = min(map(min, dataspecs_fits_as))
-        maxlim = max(map(max, dataspecs_fits_as))
-        lims = minlim, maxlim
-    else:
-        lims = None
-
-    cvs = np.c_[cvcentral, replica_cvs].T
-    errors = np.c_[errcentral, replica_errors].T
-
-    fig, ax = plot_horizontal_errorbars(
-        cvs, errors, catlabels,
-        [r'$\Delta \chi^2=1$', 'Replicas'],
-        xlim=lims
-    )
-
-    if marktotal:
-        try:
-            pos = catlabels.index('Total')
-        except ValueError:
-            log.error("Asked to mark total, but it was not provided.")
-        else:
-            for i, cv in enumerate(cvs):
-                ax.axvline(cv[pos], color=f'C{i%10}', linewidth=0.5, linestyle='--')
-
-    ax.set_xlabel(r"$\alpha_S$")
-    ax.set_title(r"$\alpha_S$ determination")
-    ax.legend()
-    return fig
-
-@make_argcheck
-def _check_first_is_total(fits_central_chi2_by_experiment_and_dataset):
-    l = fits_central_chi2_by_experiment_and_dataset
-    if not l or l[0]['experiment_label'] != 'Total':
-        raise CheckError("Expecting that the first experiment is the total. You may need to set prepend_total=True")
-
-@figure
-@_check_first_is_total
-def plot_as_value_error_central(as_datasets_central_chi2,
-        marktotal:bool=True):
-    """Plot the error bars of the central χ² determination by dataset item,
-    with a vertical line marking the total value."""
-    datacentral, namescentral = zip(*as_datasets_central_chi2)
-    cvcentral, errcentral = zip(*datacentral)
-
-    fig, ax = plot_horizontal_errorbars(
-        [cvcentral], [errcentral], namescentral,
-        [r'Central']
-    )
-    ax.axvline(cvcentral[0], color='C0', linewidth=0.5, linestyle='--')
-
-    ax.set_xlabel(r"$\alpha_S$")
-    ax.set_xlim(0.110, 0.125)
-    ax.set_title(r"$\alpha_S$ determination")
-    ax.legend()
-    return fig
-
-# Pull plots
-def _pulls_func(cv, alphas_global, error, error_global):
-    """Small helper to compute pulls"""
-    return error_global*((cv-alphas_global)/(error**2))
-
-@figure
-@_check_first_is_total
-def plot_pulls_central(as_datasets_central_chi2, hide_total:bool=True):
-    """Plot the pulls per experiment for the central results."""
-    data, names = zip(*as_datasets_central_chi2)
-    cv, err = zip(*data)
-    pulls = list()
-    if hide_total:
-        for i in range(1, len(cv)):
-            pulls.append(_pulls_func(cv[i], cv[0], err[i], err[0]))
-        names = [x for x in names if x != 'Total']
-    else:
-        for i in range(0, len(cv)):
-            pulls.append(_pulls_func(cv[i], cv[0], err[i], err[0]))
-
-    fig, ax = barplot(pulls, names, " ", orientation="horizontal")
-    ax.legend()
-
-    return fig
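For reference, this is how ``_pulls_func`` behaves on a single invented data point. Because it is written purely in terms of arithmetic that NumPy broadcasts, the same function also accepts whole arrays, which is how ``plot_pull_plots_global_min`` below applies it to the full table at once:

```python
def _pulls_func(cv, alphas_global, error, error_global):
    # Same definition as above: deviation from the global value,
    # weighted by error_global / error**2.
    return error_global * ((cv - alphas_global) / (error**2))

# Invented numbers: one experiment against the global determination.
cv, err = 0.1185, 0.0030
cv_global, err_global = 0.1170, 0.0010
print(_pulls_func(cv, cv_global, err, err_global))  # ~0.1667
```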
-@figure
-@_check_first_is_total
-def plot_pull_gaussian_fit_central(as_datasets_central_chi2,
-        dataspecs_fits_as, dataspecs_speclabel, hide_total:bool=True):
-    """Bin the pulls and overlay the normalised Gaussian fit and the KDE
-    on the histogram of pulls."""
-    data, names = zip(*as_datasets_central_chi2)
-    cv, err = zip(*data)
-    pulls = list()
-
-    if hide_total:
-        for i in range(1, len(cv)):
-            pulls.append(_pulls_func(cv[i], cv[0], err[i], err[0]))
-        names = [x for x in names if x != 'Total']
-    else:
-        for i in range(0, len(cv)):
-            pulls.append(_pulls_func(cv[i], cv[0], err[i], err[0]))
-
-    mean_pulls = np.mean(pulls)
-    std_dev = np.std(pulls)
-    x = np.linspace(min(pulls), max(pulls), 100)
-    kde_pulls = stats.gaussian_kde(pulls, bw_method='silverman')
-    fig, ax = plotutils.subplots()
-
-    #ax.set_title(f"Histogram of pulls for {label} dataset")
-    ax.set_xlabel(r"Pull")
-    ax.plot(x, kde_pulls(x), label="Kernel Density Estimation of pulls")
-    ax.hist(pulls, density=True, bins=4)
-    ax.grid(False)
-    normdist = stats.norm(mean_pulls, std_dev)
-    ax.plot(x, normdist.pdf(x), label="Normalised Gaussian fit")
-    ax.legend()
-
-    return fig
-
-@figure
-def plot_pull_plots_global_min(dataspecs_as_value_error_table_impl,
-        dataspecs_fits_as, dataspecs_speclabel, hide_total:bool=True):
-    """Plot the pulls of the individual experiments as a barplot."""
-    df = dataspecs_as_value_error_table_impl
-    tots_error = df.loc['Total', (slice(None), 'error')].values
-    tots_mean = df.loc['Total', (slice(None), 'mean')].values
-
-    if hide_total:
-        df = df.loc[df.index != 'Total']
-
-    catlabels = list(df.index)
-    cvs = df.loc[:, (slice(None), 'mean')].values
-    errors = df.loc[:, (slice(None), 'error')].values
-
-    pulls = _pulls_func(cvs, tots_mean, errors, tots_error).T
-    pulls = np.asarray(pulls, dtype=float)
-
-    #Compute sums
-    #sp = np.atleast_2d(np.nansum(pulls, axis=1)).T
-    #pulls = np.concatenate([pulls, sp], axis=1)
-    #fig, ax = barplot(pulls, [*catlabels, 'sum'], dataspecs_speclabel, orientation="horizontal")
-    fig, ax = barplot(pulls, catlabels, dataspecs_speclabel, orientation="horizontal")
-
-    #ax.set_title(r"Pulls per experiment")
-    ax.legend()
-    return fig
-
-@make_argcheck
-def _check_two_speclabels(dataspecs_speclabel):
-    if len(dataspecs_speclabel) != 2:
-        raise CheckError("Need 2 data specs")
-@figure
-@_check_two_speclabels
-def alphas_shift(
-        dataspecs_as_value_error_table_impl,
-        dataspecs_quad_table_impl,
-        dataspecs_ndata_table,
-        dataspecs_dataset_ndata,
-        dataspecs_fits_as,
-        dataspecs_speclabel,
-        hide_total:bool=True,
-        ndata_weight:bool=False):
-    """Plot the NNLO - NLO shift in the best fit alphas for each
-    experiment/process (as it currently stands...), with respect to the
-    global best fit alphas at NLO and NNLO respectively.
-
-    Also contains some computations for estimating the MHOU, using either
-    the number of data points per experiment/process (ndata)
-    or the quadratic coefficient of the parabolic fit (quad_weights)."""
-    df1 = dataspecs_ndata_table
-    df = dataspecs_as_value_error_table_impl
-    df2 = dataspecs_quad_table_impl
-
-    tots_mean = df.loc['Total', (slice(None), 'mean')].values
-
-    if hide_total:
-        df = df.loc[df.index != 'Total']
-        df1 = df1.loc[df1.index != 'Total']
-        df2 = df2.loc[df2.index != 'Total']
-
-    cvs = df.loc[:, (slice(None), 'mean')].T.values
-    quad_weights = df2.loc[:, (slice(None), 'mean')].T.values
-
-    catlabels = list(df.index)
-
-    alphas_shift = []
-    nnlo_alphas_global_shift = []
-    nlo_alphas_global_shift = []
-    nnlo2_alphas_global_shift = []
-
-    for i in range(0, len(cvs[0])):
-        alphas_shift.append(cvs[1][i] - cvs[0][i])
-        nnlo2_alphas_global_shift.append((cvs[1][i] - tots_mean[1])**2)
-        nlo_alphas_global_shift.append((cvs[0][i] - tots_mean[0])**2)
-        nnlo_alphas_global_shift.append(cvs[1][i] - tots_mean[1])
-
-    weights_nlo = []
-    weights_nnlo = []
-    weights_nnlo_sq = []
-    weights_nlo_sq = []
-
-    if ndata_weight:
-        ndataptsnlo = df1.iloc[:, 0]
-        ndataptsnnlo = df1.iloc[:, 1]
-        for i in range(0, len(ndataptsnlo)):
-            weights_nlo.append(ndataptsnlo[i])
-            weights_nnlo.append(ndataptsnnlo[i])
-    else:
-        for i in range(0, len(quad_weights.T)):
-            weights_nlo.append(quad_weights[0][i])
-            weights_nnlo.append(quad_weights[1][i])
-            weights_nnlo_sq.append((quad_weights[1][i])**2)
-            weights_nlo_sq.append((quad_weights[0][i])**2)
-
-    term1, term2 = dataspecs_speclabel
-
-    fig, ax = barplot(alphas_shift, catlabels, " ", orientation="horizontal")
-    ax.set_title(f"{term2} - {term1} shifts")
-    ax.legend()
-    return fig
-
-@figuregen
-def plot_pull_gaussian_fit_pseudo(dataspecs_as_value_error_table_impl,
-        dataspecs_fits_as, dataspecs_speclabel, hide_total:bool=True):
-    """Bin the pulls computed in ``plot_pull_plots_global_min`` and overlay
-    the normalised Gaussian fit and the KDE on the histogram of pulls."""
-    df = dataspecs_as_value_error_table_impl
-    tots_error = df.loc['Total', (slice(None), 'error')].T.values
-    tots_mean = df.loc['Total', (slice(None), 'mean')].T.values
-
-    if hide_total:
-        df = df.loc[df.index != 'Total']
-
-    cvs = df.loc[:, (slice(None), 'mean')].T.values
-    errors = df.loc[:, (slice(None), 'error')].T.values
-
-    for label, i in zip(dataspecs_speclabel, range(len(cvs))):
-        pulls = _pulls_func(cvs[i], tots_mean[i], errors[i], tots_error[i])
-
-        mean_pulls = np.mean(pulls)
-        std_dev = np.std(pulls)
-        x = np.linspace(min(pulls), max(pulls), 100)
-
-        kde_pulls = stats.gaussian_kde(pulls, bw_method='silverman')
-        fig, ax = plotutils.subplots()
-
-        #ax.set_title(f"Histogram of pulls for {label} dataset")
-        ax.set_xlabel(r"Pull")
-        ax.plot(x, kde_pulls(x), label="Kernel Density Estimation of pulls")
-        ax.hist(pulls, density=True, bins=4)
-        ax.grid(False)
-        normdist = stats.norm(mean_pulls, std_dev)
-        ax.plot(x, normdist.pdf(x), label="Normalised Gaussian fit")
-        ax.legend()
-
-        yield fig
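The two pull histograms above share one plotting recipe: a histogram normalised to unit area, with a Silverman-bandwidth KDE and a Gaussian built from the sample moments drawn on top. A self-contained version of that recipe with an invented sample (``density=True`` is what normalises the histogram):

```python
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

# Invented sample standing in for the pulls.
rng = np.random.default_rng(0)
pulls = rng.normal(0.0, 1.0, size=12)

x = np.linspace(pulls.min(), pulls.max(), 100)
kde = stats.gaussian_kde(pulls, bw_method='silverman')

fig, ax = plt.subplots()
ax.hist(pulls, density=True, bins=4)   # histogram normalised to unit area
ax.plot(x, kde(x), label="KDE of pulls")
# Gaussian with the sample mean and standard deviation.
ax.plot(x, stats.norm(pulls.mean(), pulls.std()).pdf(x),
        label="Normalised Gaussian fit")
ax.set_xlabel("Pull")
ax.legend()
```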
-@figure
-def plot_fitted_replicas_as_profiles_matched(fits_as,
-        fits_replica_data_with_discarded_replicas,
-        parabolic_as_determination, suptitle=None):
-    """Plot χ²(as) keeping the replica nnfit index matched.
-
-    The ``max_ndiscarded`` parameter defines the number of points
-    discarded by postfit above which a curve is dropped.
-    """
-    alphas = fits_as
-
-    minimums = parabolic_as_determination
-
-    table = fits_replica_data_with_discarded_replicas.values
-
-    fig, ax = plotutils.subplots()
-
-    from matplotlib.collections import LineCollection
-
-    lc = LineCollection(
-        [list(x for x in zip(alphas, t) if np.isfinite(x[1])) for t in table])
-    lc.set_array(minimums.data)
-    lc.set_clim(*np.percentile(minimums.data, (5, 95)))
-    ax.add_collection(lc)
-    ax.set_xlim(min(alphas), max(alphas))
-    ax.set_ylim(np.nanmin(table), np.nanmax(table))
-    fig.colorbar(lc, label=r"Preferred $\alpha_S$")
-    ax.set_title(rf"$\alpha_S$ = ${minimums.location:.4f} \pm {minimums.scale:.4f}$ N={len(minimums.data)}")
-    if suptitle:
-        fig.suptitle(suptitle)
-
-    #ax.plot(alphas, np.array(table).T, color='#ddddcc')
-    ax.set_xlabel(r'$\alpha_S$')
-    ax.set_ylabel(r'$\chi^2$')
-    return fig
-
-@figuregen
-@check_dataset_items
-def plot_dataspecs_pseudoreplica_means(
-        dataspecs_chi2_by_dataset_dict,
-        dataspecs_speclabel,
-        dataset_items:(list, type(None))=None,
-        ):
-    """Plot the mean χ² from the data to the pseudoreplicas, over the
-    replicas in a fit, comparing dataspecs."""
-    if dataset_items is None:
-        dataset_items = list(dataspecs_chi2_by_dataset_dict)
-
-    for it in dataset_items:
-        fig, ax = plotutils.subplots()
-        for label, tb in zip(dataspecs_speclabel, dataspecs_chi2_by_dataset_dict[it]):
-            if tb is None:
-                #Advance the color cycle so that each dataspec keeps its color
-                ax._get_lines.get_next_color()
-                continue
-            m = tb.mean(axis=0)
-            ax.plot(np.asarray(m.index), np.asarray(m), label=label)
-        ax.set_title(it)
-        ax.set_xlabel(r'$\alpha_S$')
-        ax.set_ylabel(r'$\left<\chi^2\right>$')
-        ax.legend()
-
-        yield fig
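The ``LineCollection`` construction in ``plot_fitted_replicas_as_profiles_matched`` above is a stock matplotlib technique for colouring many curves by a scalar and attaching a colourbar. A minimal standalone version, with fabricated curves and minima in place of the real replica data:

```python
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection

alphas = np.linspace(0.110, 0.125, 7)
rng = np.random.default_rng(1)
minima = rng.uniform(0.112, 0.123, size=20)   # one scalar per curve

# Fabricated chi2 profiles, one per "replica".
curves = [1e4*(alphas - m)**2 + rng.normal(0, 0.1, alphas.size) for m in minima]

segments = [np.column_stack([alphas, c]) for c in curves]
lc = LineCollection(segments)
lc.set_array(minima)                          # scalar -> colour, per line
lc.set_clim(*np.percentile(minima, (5, 95)))  # clip the colour scale

fig, ax = plt.subplots()
ax.add_collection(lc)                         # does not autoscale; set limits
ax.set_xlim(alphas[0], alphas[-1])
ax.set_ylim(min(c.min() for c in curves), max(c.max() for c in curves))
fig.colorbar(lc, label=r"Preferred $\alpha_S$")
```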
-#TODO: This should take parabolic_as_determination with the as_transforms
-#and so on, rather than refitting here.
-@figuregen
-@check_dataset_items
-@check_positive('examples_per_item')
-def plot_dataspecs_parabola_examples(
-        dataspecs_chi2_by_dataset_dict,
-        dataspecs_speclabel,
-        dataset_items:(list, type(None))=None,
-        examples_per_item:int = 2,
-        random_seed:int = 0,
-        ):
-    """Sample ``examples_per_item`` replica_indexes for each of the
-    ``dataset_items``. Yield a plot with the parabolic fit, as resolved for
-    each of the dataspecs. The random state is local to the function and
-    controlled by ``random_seed``."""
-    random_state = np.random.RandomState(random_seed)
-    if dataset_items is None:
-        dataset_items = list(dataspecs_chi2_by_dataset_dict)
-
-    for it in dataset_items:
-        table = pd.concat(dataspecs_chi2_by_dataset_dict[it],
-                          join='inner', axis=1, keys=dataspecs_speclabel)
-        if examples_per_item > len(table):
-            log.warning("Length of table is less than examples_per_item")
-            sampled = table
-        else:
-            sampled = table.sample(examples_per_item, random_state=random_state)
-
-        for index, row in sampled.iterrows():
-            fig, ax = plotutils.subplots()
-            im = marker_iter_plot()
-            ax.set_title(f"Parabola example for {it} (nnfit_index={index})")
-            for i, (label, vals) in enumerate(row.groupby(level=0)):
-                asvals = vals.index.get_level_values(1)
-                color = f'C{i}'
-                y = vals.values
-                ax.plot(asvals, y, **next(im), label=label,
-                        color=color, linestyle='none', lw=0.5)
-                a, b, c = parabola = get_parabola(asvals, y)
-                ax.plot(asvals, np.polyval(parabola, asvals), color=color,
-                        linestyle='--', lw=0.5)
-                m = -b/2/a
-                if asvals[0] < m < asvals[-1]:
-                    ax.axvline(m, color=color, lw=0.4)
-            ax.set_xlabel(r'$\alpha_S$')
-            ax.set_ylabel(r'$\chi^2$')
-            ax.legend()
-
-            yield fig
-
-@figure
-def plot_as_distribution(parabolic_as_determination, suptitle):
-    """Histogram of the αs values produced, with the data points shown as
-    sticks on the axis."""
-    distribution = parabolic_as_determination.data
-    fig, ax = plotutils.subplots()
-
-    kde_plot(distribution)
-    ax.legend()
-    ax.set_title(f"{suptitle}")
-    ax.set_xlabel(r"$\alpha_S$")
-    return fig
-
-@figure
-def plot_total_as_distribution_dataspecs(
-        dataspecs_parabolic_as_determination_for_total,
-        dataspecs_speclabel,
-        ):
-    """Compare the total alpha_s distributions across dataspecs.
-    See ``plot_as_distribution``."""
-    fig, ax = plotutils.subplots()
-    for dist, label in zip(
-            dataspecs_parabolic_as_determination_for_total,
-            dataspecs_speclabel):
-        #Remember that *_for_total is a length-1 list, so take the first element.
-        kde_plot(dist[0].error_members(), ax=ax, label=label)
-    ax.set_xlabel(r"$\alpha_S$")
-    ax.legend()
-    return fig
-
-@figure
-def plot_poly_as_fit(fits_as,
-        fits_replica_data_correlated, max_ndiscarded:int=4, polorder:int=2,
-        suptitle=None):
-    """Plot a polynomial fit of χ²(as) of degree ``polorder``, keeping the
-    replica index matched.
-
-    The ``max_ndiscarded`` parameter defines the number of points
-    discarded by postfit above which a curve is dropped.
-    """
-    alphas = fits_as
-    df = fits_replica_data_correlated
-    def ap(x):
-        x.columns = x.columns.droplevel(0)
-        return (x['chi2'])
-    table = df.groupby(axis=1, level=0).apply(ap)
-    filt = table.isnull().sum(axis=1) < max_ndiscarded
-    table = table[filt]
-    table = table.values
-    fig, ax = plotutils.subplots()
-
-    minimums = []
-    asarr = np.asarray(alphas)
-    for i, row in enumerate(table):
-        filt = np.isfinite(row)
-        fit = np.polyfit(asarr[filt], row[filt], polorder)
-        extrema = np.roots(np.polyder(fit))
-
-        #Cast away the zero complex part
-        candidates = np.real(extrema[np.isreal(extrema)])
-        #candidates = candidates[(candidates > alphas[0]) & (candidates