From 7c051b14cce286be50d9339ba1f0465811d718a6 Mon Sep 17 00:00:00 2001
From: Liam Gray
Date: Fri, 12 Apr 2024 12:26:23 -0700
Subject: [PATCH] feat(tools): add least squares baseline fit and mask taper functions.

arPLS_1d: 1D penalized least squares baseline fit. Handles high power
outliers better than a weighted median.
taper_mask: apply a Hanning taper along the second axis of a 2D mask.
---
 draco/util/tools.py | 170 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 169 insertions(+), 1 deletion(-)

diff --git a/draco/util/tools.py b/draco/util/tools.py
index 688edd1b..d1f708f2 100644
--- a/draco/util/tools.py
+++ b/draco/util/tools.py
@@ -3,11 +3,16 @@
 Miscellaneous tasks should be placed in :py:mod:`draco.core.misc`.
 """
 
+import warnings
+
 import numpy as np
 
 # Keep this here for compatibility
-from caput.tools import invert_no_zero  # noqa: F401
+from caput.tools import invert_no_zero
 from numpy.lib.recfunctions import structured_to_unstructured
+from scipy import linalg as la
+from scipy.signal import oaconvolve
+from scipy.sparse import dia_array
 
 from ._fast_tools import _calc_redundancy
 
@@ -534,3 +539,166 @@ def window_generalised(x, window="nuttall"):
     w = (a[:, np.newaxis] * np.cos(t)).sum(axis=0)
 
     return np.where((x >= 0) & (x <= 1), w, 0)
+
+
+def arPLS_1d(y, mask=None, lam=1e2, end_frac=1e-2, max_iter=1000):
+    r"""Use arPLS to estimate a signal baseline.
+
+    1D implementation of asymmetrically reweighted penalized least squares
+    (arPLS). Solves for a signal baseline in the presence of high power
+    outliers by heavily weighting values below the baseline estimate and
+    minimizing the weights of values above it.
+
+    Notes
+    -----
+    arPLS solves the following linear system given a signal :math:`\mathbf{y}`
+
+    .. math:: (W + \lambda D_{d}^{T} D_{d}) z = W y
+
+    where the weighting function is given by
+
+    .. math:: w_{i} = (1 + \exp(2 \sigma^{-1} (r_{i} - (2\sigma - \mu))))^{-1}
+
+    where :math:`r_{i}` is the difference :math:`y_{i} - z_{i}`, and
+    :math:`\mu` and :math:`\sigma` are the mean and standard deviation of the
+    negative values in :math:`\mathbf{r}`. The solver runs for at most
+    `max_iter` iterations, or until the relative change in the weights,
+    `norm(delta_w) / norm(w)`, is less than `end_frac`.
+
+    Reference:
+    https://www.sciencedirect.com/science/article/pii/S1090780706002266
+
+    Parameters
+    ----------
+    y : np.ndarray
+        1D signal array.
+    mask : np.ndarray, optional
+        1D boolean array of the same length as `y`. Samples where the mask
+        is True are excluded from the fit. Default is None.
+    lam : float, optional
+        Scaling parameter used to control the importance of smoothness vs.
+        fit. A high value prioritizes smoothness of the baseline estimate.
+        Default is 1e2.
+    end_frac : float, optional
+        Convergence threshold on the relative change in the weights,
+        `norm(delta_w) / norm(w)`. Default is 1e-2.
+    max_iter : int, optional
+        Maximum number of iterations to run, even if the convergence
+        criterion is not met. Default is 1000.
+
+    Returns
+    -------
+    z : np.ndarray
+        Baseline estimate of the same shape as `y`.
+    """
+    y = np.squeeze(y)
+    if y.ndim != 1:
+        raise ValueError(f"Expected 1D data array - got array with shape {y.shape}")
+
+    N = y.shape[0]
+
+    if mask is None:
+        mask = np.zeros(N, dtype=bool)
+    elif np.all(mask):
+        warnings.warn("Entire dataset is masked.")
+
+        return np.zeros_like(y)
+
+    mask = np.squeeze(mask)
+
+    if mask.ndim != 1:
+        raise ValueError(f"Expected 1D mask array - got array with shape {mask.shape}")
+
+    # Construct second-order difference matrix
+    D = np.array([[1, -2, 1]]).T.repeat(N - 1, axis=1)
+    D = dia_array((D, [-2, -1, 0]), shape=(N, N - 2))
+    Hp = lam * D @ D.T
+
+    # Create the banded smoothness matrix and weights matrix
+    H = np.ones((3, N), dtype=np.float64)
+    W = np.zeros_like(H)
+
+    # Fill the lower banded matrix
+    for i in range(H.shape[0]):
+        H[i, : N - i] = Hp.diagonal(i)
+
+    # Initialize weights to one
+    W[0] = 1.0
+
+    # Get the maximum exponential to avoid runtime warnings
+    maxpwr = np.log(np.finfo(y.dtype).max)
+
+    for _ in range(max_iter):
+        # Ignore masked values
+        W[:, mask] = 0.0
+        # Extract the actual weights. W is a 3xN matrix to match
+        # the banded shape of H, all off-diagonal elements are zero
+        w = W[0]
+
+        z = la.solveh_banded(H + W, w * y, lower=True)

+        # Get the difference between the signal and the baseline estimate,
+        # and compute the mean and std where unmasked data is less than zero
+        d = y - z
+        dn = d[(d < 0) & ~mask]
+        m = np.mean(dn)
+        s = np.std(dn)
+
+        # Adjust weights based on the criteria discussed in the paper
+        pwr = 2 * (d - ((2 * s) - m)) * invert_no_zero(s)
+        pwr = np.clip(pwr, -maxpwr, maxpwr)
+        wt = invert_no_zero(1 + np.exp(pwr))
+
+        # Check for convergence
+        if la.norm(w - wt) / la.norm(w) < end_frac:
+            break
+
+        # Update the weights
+        W[0] = wt
+
+    return z
+
+
+def taper_mask(mask, nwidth, outer=False):
+    """Taper a 2D mask along the last axis.
+
+    Parameters
+    ----------
+    mask : np.ndarray
+        Mask to taper.
+    nwidth : int
+        Number of samples on either side of the mask to taper.
+    outer : bool, optional
+        If True, expand the mask outwards (wider). Otherwise,
+        expand the mask inwards (narrower). Default is False.
+
+    Returns
+    -------
+    tapered_mask : np.ndarray[np.float64]
+        Mask convolved with the taper window.
+    """
+    mask = np.atleast_2d(mask)
+
+    width = 2 * nwidth - 1
+
+    taper = np.hanning(width)[np.newaxis]
+    taper /= np.sum(taper)
+
+    tapered_mask = np.zeros(
+        (mask.shape[0], mask.shape[-1] + 2 * width), dtype=np.float64
+    )
+    tapered_mask[:, width:-width] = mask.astype(np.float64)
+    # Extend the edges
+    tapered_mask[:, :width] = tapered_mask[:, width][:, np.newaxis]
+    tapered_mask[:, -width:] = tapered_mask[:, -width - 1][:, np.newaxis]
+
+    if outer:
+        tapered_mask = 1.0 - tapered_mask
+
+    # First convolution. This creates a copy of the original boolean
+    # mask with the edges extended by `nwidth`. The second convolution
+    # applies the taper to this extended boolean mask.
+    tapered_mask = np.isclose(
+        oaconvolve(tapered_mask, taper, axes=-1, mode="same"), 1.0
+    ).astype(np.float64)
+    tapered_mask = oaconvolve(tapered_mask, taper, axes=-1, mode="same")
+
+    if outer:
+        tapered_mask = 1.0 - tapered_mask
+
+    return tapered_mask[:, width:-width]
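
A minimal usage sketch for arPLS_1d, not part of the patch itself. It assumes
the patch has been applied so the function is importable from
draco.util.tools; the synthetic signal, the masked range, and the parameter
values are illustrative only.

import numpy as np

from draco.util.tools import arPLS_1d

# Build a synthetic signal: a smooth baseline plus a few bright outliers.
x = np.linspace(0.0, 1.0, 1024)
baseline = 5.0 + 2.0 * np.sin(2.0 * np.pi * x)
signal = baseline.copy()
signal[100:105] += 50.0
signal[600:603] += 120.0

# Optionally flag samples to exclude from the fit (True means masked).
mask = np.zeros(signal.shape, dtype=bool)
mask[900:910] = True

# Estimate the baseline. Larger `lam` gives a smoother estimate.
z = arPLS_1d(signal, mask=mask, lam=1e2)

# Away from the outliers the estimate should track the true baseline.
print(np.max(np.abs((z - baseline)[200:500])))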
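
Similarly, a sketch of how taper_mask might be called on a 2D boolean mask,
again assuming the patched draco.util.tools; the mask shape, the flagged
region, and the nwidth value are arbitrary illustrative choices.

import numpy as np

from draco.util.tools import taper_mask

# A 2D boolean mask, e.g. (freq, time), with one contiguous flagged block.
mask = np.zeros((4, 256), dtype=bool)
mask[:, 100:140] = True

# Taper over 16 samples around the mask edges. Per the docstring,
# outer=False rolls the taper off inside the flagged region (narrower),
# while outer=True extends it outwards (wider).
w_inner = taper_mask(mask, nwidth=16)
w_outer = taper_mask(mask, nwidth=16, outer=True)

# The result keeps the input shape and contains float weights in [0, 1],
# not booleans.
print(w_inner.shape, float(w_inner.min()), float(w_inner.max()))

Because the return value is a float64 array rather than a boolean mask, it is
presumably meant to be applied as a weight or apodisation along the tapered
axis rather than used for hard flagging.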