Implement lognormal log likelihood function (#88)

twallema · Sep 6, 2024 · 92f24a1 · 92f24a1
1 parent fb1350c
commit 92f24a1
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 24 deletions.
diff --git a/docs/enzyme_kinetics.md b/docs/enzyme_kinetics.md
@@ -130,7 +130,7 @@ here {math}`y_{i,j}` is the glucose laurate ester concentration of the {math}`j`
 We'll load the datasets using a `for` statement and immediately extract two inputs to our log posterior probability function: 1) The glucose laurate ester observations as `data`, 2) The glucose laurate ester measurement error, computed as 4% relative to the glucose laurate ester observations and, 3) the initial concentrations used in the experiment in `initial_states`. We will also construct the list of model states to match our datasets to (`states`), which is a list containing eight instances of the Ester state names `'Es'`. 
 
 ```python
-from pySODM.optimization.objective_functions import ll_gaussian
+from pySODM.optimization.objective_functions import ll_normal
 
 # Extract and sort the names
 names = os.listdir(os.path.join(os.path.dirname(__file__),'data/intrinsic_kinetics/'))
@@ -144,7 +144,7 @@ initial_states=[]
 for name in names:
     df = pd.read_csv(os.path.join(os.path.dirname(__file__),'data/intrinsic_kinetics/'+name), index_col=0)
     data.append(df['Es'][1:]) # Cut out zero's!
-    log_likelihood_fnc.append(ll_gaussian)
+    log_likelihood_fnc.append(ll_normal)
     log_likelihood_fnc_args.append(0.04*df['Es'][1:]) # 4% Relative noise
     states.append('Es')
     initial_states.append(

diff --git a/docs/optimization.md b/docs/optimization.md
@@ -35,12 +35,23 @@
 
 ### Log likelihood
 
-***function* ll_gaussian(ymodel, ydata, sigma)**
+***function* ll_normal(ymodel, ydata, sigma)**
 
 >    **Parameters:**
->   * **ymodel** (list/np.ndarray) - Mean values of the Gaussian distribution (i.e. "mu" values), as predicted by the model
+>   * **ymodel** (list/np.ndarray) - Mean values of the normal distribution (i.e. "mu" values), as predicted by the model
 >   * **ydata** (list/np.ndarray) - Data time series values to be matched with the model predictions.
->   * **sigma** (float/list of floats/np.ndarray) - Standard deviation(s) of the Gaussian distribution around the data 'ydata'. Two options are possible: 1) One error per model dimension, applied uniformly to all datapoints corresponding to that dimension; OR 2) One error for every datapoint, corresponding to a weighted least-squares regression.
+>   * **sigma** (float/list of floats/np.ndarray) - Standard deviation(s) of the normal distribution around the data 'ydata'. Two options are possible: 1) One error per model dimension, applied uniformly to all datapoints corresponding to that dimension; OR 2) One error for every datapoint, corresponding to a weighted least-squares regression.
+
+>   **Returns:**
+>   * **ll** (float) - Loglikelihood associated with the comparison of the data points and the model prediction.  
+
+
+***function* ll_lognormal(ymodel, ydata, sigma)**
+
+>    **Parameters:**
+>   * **ymodel** (list/np.ndarray) - Mean values of the lognormal distribution (i.e. "mu" values), as predicted by the model
+>   * **ydata** (list/np.ndarray) - Data time series values to be matched with the model predictions.
+>   * **sigma** (float/list of floats/np.ndarray) - Standard deviation(s) of the lognormal distribution around the data 'ydata'. Two options are possible: 1) One error per model dimension, applied uniformly to all datapoints corresponding to that dimension; OR 2) One error for every datapoint, corresponding to a weighted least-squares regression.
 
 >   **Returns:**
 >   * **ll** (float) - Loglikelihood associated with the comparison of the data points and the model prediction.  
@@ -55,6 +66,7 @@
 >   **Returns:**
 >   * **ll** (float) - Loglikelihood associated with the comparison of the data points and the model prediction.
 
+
 ***function* ll_negative_binomial(ymodel, ydata, alpha)**
 
 >   **Parameters:**
@@ -202,7 +214,7 @@
 
 ## mcmc.py
 
-***function* run_EnsembleSampler(pos, max_n, identifier, objective_function, objective_function_args=None, objective_function_kwargs=None, moves=[(emcee.moves.DEMove(), 0.50),(emcee.moves.DESnookerMove(),0.25),(emcee.moves.KDEMove(bw_method='scott'), 0.25)], fig_path=None, samples_path=None, print_n=10, backend=None, processes=1, progress=True, settings_dict={})**
+***function* run_EnsembleSampler(pos, max_n, identifier, objective_function, objective_function_args=None, objective_function_kwargs=None, moves=[(emcee.moves.DEMove(), 0.25),(emcee.moves.DESnookerMove(),0.25),(emcee.moves.KDEMove(bw_method='scott'), 0.50)], fig_path=None, samples_path=None, print_n=10, backend=None, processes=1, progress=True, settings_dict={})**
 
 > Wrapper function to setup an `emcee.EnsembleSampler` and handle all backend-related tasks.
 
@@ -220,7 +232,7 @@
 >    * **settings_dict** (dict) - optional - Dictionary containing calibration settings or other usefull settings for long-term storage. Saved in `.json` format. Appended to the samples dictionary generated by `emcee_sampler_to_dictionary()`. 
 
 >    **Hyperparameters:**
->    * **moves** (list) - optional - Algorithm used for updating the coordinates of walkers in an ensemble sampler. By default, pySODM uses a differential evolution move 50% of the time, a differential evolution snooker move 25% of the time, and a kernel-density estimate move 25% of the time. Consult the [emcee documentation](https://emcee.readthedocs.io/en/stable/user/moves/) for an overview of all moves.
+>    * **moves** (list) - optional - Algorithm used for updating the coordinates of walkers in an ensemble sampler. By default, pySODM uses a differential evolution move 25% of the time, a differential evolution snooker move 25% of the time, and a kernel-density estimate move 50% of the time. Consult the [emcee documentation](https://emcee.readthedocs.io/en/stable/user/moves/) for an overview of all moves.
 >    * **backend** (`emcee.backends.HDFBackend`) - optional - Backend of a previous sampling experiment. If a backend is provided, the sampler is restarted from the last iteration of the previous run. Consult the [emcee documentation](https://emcee.readthedocs.io/en/stable/user/backends/).
 >    * **progress** (bool) - optional - Enables the progress bar.
 

diff --git a/src/pySODM/optimization/objective_functions.py b/src/pySODM/optimization/objective_functions.py
@@ -12,33 +12,64 @@
 ## Log-likelihood functions ##
 ##############################
 
-def ll_gaussian(ymodel, ydata, sigma):
+def ll_lognormal(ymodel, ydata, sigma):
     """
-    Loglikelihood of a Gaussian distribution, can be used homoskedastically (one sigma for the entire timeseries) or heteroskedastically (one sigma per datapoint in the timeseries).
+    Loglikelihood of a lognormal distribution, can be used homoskedastically (one sigma for the entire timeseries) or heteroskedastically (one sigma per datapoint in the timeseries).
 
     Parameters
     ----------
     ymodel: np.ndarray
-        mean values of the Gaussian distribution (i.e. "mu" values), as predicted by the model
+        mean values of the lognormal distribution (i.e. "mu" values), as predicted by the model
     ydata: np.ndarray
         time series to be matched with the model predictions
     sigma: float/list of floats/np.ndarray
-        Standard deviation of the Gaussian distribution around the mean value 'ymodel'
+        Standard deviation of the lognormal distribution around the mean value 'ymodel'
 
     Returns
     -------
     ll: float
         Loglikelihood belonging to the comparison of the data points and the model prediction for its particular parameter values
     """
 
-    # Expand first dimensions on 'alpha' to match the axes
+    # expand first dimensions on 'sigma' to match the axes
+    sigma = np.array(sigma)
+    if not sigma.shape == ymodel.shape:
+        sigma = sigma[np.newaxis, ...]  
+    # check for zeros
+    if len(sigma[sigma<=0]) != 0:
+        raise ValueError(
+            'the standard deviation used in `ll_lognormal` contains values smaller than or equal to zero'
+        )
+
+    return - 1/2 * np.sum((np.log(ydata+1)-np.log(ymodel+1))**2/sigma**2 + np.log(2*np.pi*sigma**2) + np.log(ydata+1))
+
+def ll_normal(ymodel, ydata, sigma):
+    """
+    Loglikelihood of a normal distribution, can be used homoskedastically (one sigma for the entire timeseries) or heteroskedastically (one sigma per datapoint in the timeseries).
+
+    Parameters
+    ----------
+    ymodel: np.ndarray
+        mean values of the normal distribution (i.e. "mu" values), as predicted by the model
+    ydata: np.ndarray
+        time series to be matched with the model predictions
+    sigma: float/list of floats/np.ndarray
+        Standard deviation of the normal distribution around the mean value 'ymodel'
+
+    Returns
+    -------
+    ll: float
+        Loglikelihood belonging to the comparison of the data points and the model prediction for its particular parameter values
+    """
+
+    # expand first dimensions on 'sigma' to match the axes
     sigma = np.array(sigma)
     if not sigma.shape == ymodel.shape:
         sigma = sigma[np.newaxis, ...]  
-    # Check for zeros (TODO: move to a higher layer)
+    # check for zeros
     if len(sigma[sigma<=0]) != 0:
         raise ValueError(
-            'the standard deviation used in `ll_gaussian` contains values smaller than or equal to zero'
+            'the standard deviation used in `ll_normal` contains values smaller than or equal to zero'
         )
     return - 1/2 * np.sum((ydata - ymodel) ** 2 / sigma**2 + np.log(2*np.pi*sigma**2))
 
@@ -963,7 +994,7 @@ def validate_log_likelihood_funtion(log_likelihood_function):
     Parameters
     ----------
     log_likelihood_function: callable function
-        The log likelihood function. F.i. Gaussian, Poisson, etc.
+        The log likelihood function. F.i. Normal, Poisson, etc.
 
     Returns
     -------
@@ -972,7 +1003,7 @@ def validate_log_likelihood_funtion(log_likelihood_function):
     """
 
     # Check that log_likelihood_fnc always has ymodel as the first argument and ydata as the second argument
-    # Find out how many additional arguments are needed for the log_likelihood_fnc (f.i. sigma for gaussian model, alpha for negative binomial)
+    # Find out how many additional arguments are needed for the log_likelihood_fnc (f.i. sigma for normal model, alpha for negative binomial)
     n_log_likelihood_extra_args=[]
     for idx,fnc in enumerate(log_likelihood_function):
         sig = inspect.signature(fnc)
@@ -1013,7 +1044,7 @@ def validate_log_likelihood_function_extra_args(data, n_log_likelihood_extra_arg
                     )
         else:
             if not additional_axes_data[idx]:
-                # ll_poisson, ll_gaussian, ll_negative_binomial take int/float, but ll_gaussian can also take an error for every datapoint (= weighted least-squares)
+                # ll_poisson, ll_normal, ll_negative_binomial take int/float, but ll_normal can also take an error for every datapoint (= weighted least-squares)
                 # Thus, its additional argument must be a np.array of the same dimensions as the data
                 if not isinstance(log_likelihood_fnc_args[idx], (int,float,np.ndarray,pd.Series)):
                     raise ValueError(

diff --git a/src/tests/test_calibration.py b/src/tests/test_calibration.py
@@ -1,12 +1,10 @@
 import pytest
-import datetime
 import pandas as pd
 import numpy as np
-import matplotlib.pyplot as plt
 from pySODM.models.base import ODE
 from pySODM.optimization import pso, nelder_mead
 from pySODM.optimization.utils import add_negative_binomial_noise
-from pySODM.optimization.objective_functions import log_posterior_probability, ll_poisson, ll_negative_binomial
+from pySODM.optimization.objective_functions import log_posterior_probability, ll_normal, ll_lognormal, ll_poisson, ll_negative_binomial
 from pySODM.optimization.mcmc import perturbate_theta, run_EnsembleSampler, emcee_sampler_to_dictionary
 
 # Only tested for ODE but the model output is identical so this shouldn't matter
@@ -528,8 +526,8 @@ def test_correct_approach_with_two_dimensions():
     # Variables that don't really change
     states = ["I",]
     weights = np.array([1,])
-    log_likelihood_fnc = [ll_negative_binomial,]
-    log_likelihood_fnc_args = [alpha*np.ones([2,3]),]
+    log_likelihood_fnc = [ll_normal,]
+    log_likelihood_fnc_args = [np.ones([2,3]),]
     # Calibated parameters and bounds
     pars = ['beta',]
     labels = ['$\\beta$',]

diff --git a/tutorials/enzyme_kinetics/calibrate_intrinsic_kinetics.py b/tutorials/enzyme_kinetics/calibrate_intrinsic_kinetics.py
@@ -22,7 +22,7 @@
 from pySODM.optimization import pso, nelder_mead
 from pySODM.optimization.utils import add_gaussian_noise, assign_theta
 from pySODM.optimization.mcmc import perturbate_theta, run_EnsembleSampler, emcee_sampler_to_dictionary
-from pySODM.optimization.objective_functions import log_posterior_probability, log_prior_uniform, ll_gaussian
+from pySODM.optimization.objective_functions import log_posterior_probability, log_prior_uniform, ll_normal
 # pySODM dependecies
 import corner
 
@@ -75,7 +75,7 @@
 for name in names:
     df = pd.read_csv(os.path.join(os.path.dirname(__file__),'data/intrinsic_kinetics/'+name), index_col=0)
     data.append(df['Es'][1:]) # Cut out zero's!
-    log_likelihood_fnc.append(ll_gaussian)
+    log_likelihood_fnc.append(ll_normal)
     log_likelihood_fnc_args.append(0.04*df['Es'][1:]) # 4% Relative noise
     y_err.append(df['sigma'][1:])
     states.append('Es')