Merge pull request #27 from pnnl/fix-warnings
Fix warnings stemming from changes to pandas
smcolby authored Dec 17, 2024
2 parents ed8b280 + 017d866 commit 17391b8
Showing 27 changed files with 1,472 additions and 1,212 deletions.
141 changes: 80 additions & 61 deletions deimos/alignment.py
@@ -9,9 +9,14 @@
 import deimos
 
 
-def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
-          tol=[5E-6, 0.015, 0.3], relative=[True, True, False]):
-    '''
+def match(
+    a,
+    b,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[5e-6, 0.015, 0.3],
+    relative=[True, True, False],
+):
+    """
     Identify features in `b` within tolerance of those in `a`. Matches are
     bidirectionally one-to-one by highest intensity.
@@ -34,7 +39,7 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     Features matched within tolerances. E.g., `a[i..n]` and `b[i..n]` each
     represent matched features.
-    '''
+    """
 
     if a is None or b is None:
         return None, None
@@ -75,7 +80,7 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     # Compute normalized 3d distance
     v1 = a[dims].values / tol
     v2 = b[dims].values / tol
-    dist3d = scipy.spatial.distance.cdist(v1, v2, 'cityblock')
+    dist3d = scipy.spatial.distance.cdist(v1, v2, "cityblock")
     dist3d = np.multiply(dist3d, idx)
 
     # Normalize to 0-1
@@ -84,8 +89,7 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     dist3d = dist3d / dist3d.max()
 
     # Intensities
-    intensity = np.repeat(a['intensity'].values.reshape(-1, 1),
-                          b.shape[0], axis=1)
+    intensity = np.repeat(a["intensity"].values.reshape(-1, 1), b.shape[0], axis=1)
     intensity = np.multiply(intensity, idx)
 
     # Max over dims
@@ -113,9 +117,14 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     return a, b
 
 
-def tolerance(a, b, dims=['mz', 'drift_time', 'retention_time'],
-              tol=[5E-6, 0.025, 0.3], relative=[True, True, False]):
-    '''
+def tolerance(
+    a,
+    b,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[5e-6, 0.025, 0.3],
+    relative=[True, True, False],
+):
+    """
     Identify features in `b` within tolerance of those in `a`. Matches are
     potentially many-to-one.
@@ -138,7 +147,7 @@ def tolerance(a, b, dims=['mz', 'drift_time', 'retention_time'],
     Features matched within tolerances. E.g., `a[i..n]` and `b[i..n]` each
     represent matched features.
-    '''
+    """
 
     if a is None or b is None:
         return None, None
@@ -189,8 +198,8 @@ def tolerance(a, b, dims=['mz', 'drift_time', 'retention_time'],
     return a, b
 
 
-def fit_spline(a, b, align='retention_time', **kwargs):
-    '''
+def fit_spline(a, b, align="retention_time", **kwargs):
+    """
     Fit a support vector regressor to matched features.
 
     Parameters
@@ -210,7 +219,7 @@ def fit_spline(a, b, align='retention_time', **kwargs):
     :obj:`~scipy.interpolate.interp1d`
         Interpolated fit of the SVR result.
-    '''
+    """
 
     # Uniqueify
     x = a[align].values
@@ -219,16 +228,16 @@ def fit_spline(a, b, align='retention_time', **kwargs):
     arr = np.unique(arr, axis=0)
 
     # Check kwargs
-    if 'kernel' in kwargs:
-        kernel = kwargs.get('kernel')
+    if "kernel" in kwargs:
+        kernel = kwargs.get("kernel")
     else:
-        kernel = 'linear'
+        kernel = "linear"
 
     # Construct interpolation axis
     newx = np.linspace(arr[:, 0].min(), arr[:, 0].max(), 1000)
 
     # Linear kernel
-    if kernel == 'linear':
+    if kernel == "linear":
         reg = scipy.stats.linregress(x, y)
         newy = reg.slope * newx + reg.intercept
@@ -241,15 +250,18 @@ def fit_spline(a, b, align='retention_time', **kwargs):
         # Predict
         newy = svr.predict(newx.reshape(-1, 1))
 
-    return scipy.interpolate.interp1d(newx, newy,
-                                      kind='linear', fill_value='extrapolate')
+    return scipy.interpolate.interp1d(
+        newx, newy, kind="linear", fill_value="extrapolate"
+    )
 
 
-def agglomerative_clustering(features,
-                             dims=['mz', 'drift_time', 'retention_time'],
-                             tol=[20E-6, 0.03, 0.3],
-                             relative=[True, True, False]):
-    '''
+def agglomerative_clustering(
+    features,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[20e-6, 0.03, 0.3],
+    relative=[True, True, False],
+):
+    """
     Cluster features within provided linkage tolerances. Recursively merges
     the pair of clusters that minimally increases a given linkage distance.
     See :class:`sklearn.cluster.AgglomerativeClustering`.
@@ -271,7 +283,7 @@ def agglomerative_clustering(features,
     features : :obj:`~pandas.DataFrame`
         Features concatenated over samples with cluster labels.
-    '''
+    """
 
     if features is None:
         return None
@@ -288,10 +300,10 @@ def agglomerative_clustering(features,
     features = features.copy()
 
     # Connectivity
-    if 'sample_idx' not in features.columns:
+    if "sample_idx" not in features.columns:
         cmat = None
     else:
-        vals = features['sample_idx'].values.reshape(-1, 1)
+        vals = features["sample_idx"].values.reshape(-1, 1)
         cmat = cdist(vals, vals, metric=lambda x, y: x != y).astype(bool)
 
     # Compute inter-feature distances
@@ -310,8 +322,7 @@ def agglomerative_clustering(features,
         basis = np.where(basis == 0, fix, basis)
 
         # Divide
-        dist = np.divide(dist, basis, out=np.zeros_like(
-            basis), where=basis != 0)
+        dist = np.divide(dist, basis, out=np.zeros_like(basis), where=basis != 0)
 
         # Check tol
         distances.append(dist / tol[i])
@@ -324,25 +335,29 @@ def agglomerative_clustering(features,
 
     # Perform clustering
     try:
-        clustering = AgglomerativeClustering(n_clusters=None,
-                                             linkage='complete',
-                                             metric='precomputed',
-                                             distance_threshold=1,
-                                             connectivity=cmat).fit(distances)
-        features['cluster'] = clustering.labels_
+        clustering = AgglomerativeClustering(
+            n_clusters=None,
+            linkage="complete",
+            metric="precomputed",
+            distance_threshold=1,
+            connectivity=cmat,
+        ).fit(distances)
+        features["cluster"] = clustering.labels_
 
     # All data points are singleton clusters
     except:
-        features['cluster'] = np.arange(len(features.index))
+        features["cluster"] = np.arange(len(features.index))
 
     return features
 
 
-def merge_features(features,
-                   dims=['mz', 'drift_time', 'retention_time'],
-                   tol=[20E-6, 0.03, 0.3],
-                   relative=[True, True, False]):
-    '''
+def merge_features(
+    features,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[20e-6, 0.03, 0.3],
+    relative=[True, True, False],
+):
+    """
     Merge features within provided tolerances.
 
     Parameters
@@ -362,7 +377,7 @@ def merge_features(features,
     features : :obj:`~pandas.DataFrame`
         Features concatenated over samples with cluster labels.
-    '''
+    """
 
     if features is None:
         return None
@@ -377,40 +392,44 @@ def merge_features(features,
 
     # Copy input
     features = features.copy()
 
     # Sort
-    features = features.sort_values(by='intensity', ascending=False).reset_index(drop=True)
+    features = features.sort_values(by="intensity", ascending=False).reset_index(
+        drop=True
+    )
 
     # Compute inter-feature distances
     distances = None
     for i in range(len(dims)):
         # Construct k-d tree for drift time
         values = features[dims[i]].values
         tree = KDTree(values.reshape(-1, 1))
 
         max_tol = tol[i]
         if relative[i] is True:
             # Maximum absolute tolerance
             max_tol = tol[i] * values.max()
 
         # Compute sparse distance matrix
-        sdm = tree.sparse_distance_matrix(tree, max_tol, output_type='coo_matrix')
+        sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 
         # Only consider forward case, exclude diagonal
         sdm = sparse.triu(sdm, k=1)
 
         # Filter relative distances
         if relative[i] is True:
             # Compute relative distances
-            rel_dists = sdm.data / values[sdm.row] # or col?
+            rel_dists = sdm.data / values[sdm.row]  # or col?
 
             # Indices of relative distances less than tolerance
             idx = rel_dists <= tol[i]
 
             # Reconstruct sparse distance matrix
-            sdm = sparse.coo_matrix((rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
-                                    shape=(len(values), len(values)))
-
+            sdm = sparse.coo_matrix(
+                (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
+                shape=(len(values), len(values)),
+            )
 
         # Cast as binary matrix
         sdm.data = np.ones_like(sdm.data)
 
@@ -419,7 +438,7 @@ def merge_features(features,
             distances = sdm
         else:
             distances = distances.multiply(sdm)
-
+
     # Extract indices of within-tolerance points
     distances = distances.tocoo()
     pairs = np.stack((distances.row, distances.col), axis=1)
@@ -428,13 +447,13 @@ def merge_features(features,
     to_drop = []
     while len(pairs) > 0:
         # Find root parents and their children
-        root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
-        id_root_parents = np.isin(pairs[:, 0], root_parents)
-        children_of_roots = np.unique(pairs[id_root_parents, 1])
-        to_drop = np.append(to_drop, children_of_roots)
+        root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
+        id_root_parents = np.isin(pairs[:, 0], root_parents)
+        children_of_roots = np.unique(pairs[id_root_parents, 1])
+        to_drop = np.append(to_drop, children_of_roots)
 
-        # Set up pairs array for next iteration
+        # Set up pairs array for next iteration
         pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
-        pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
+        pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
 
     return features.reset_index(drop=True).drop(index=np.array(to_drop))
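
Note: the alignment.py changes above are formatting-only (double quotes, black-style signature and call layout, lowercase exponents); behavior is unchanged. For orientation, here is a minimal usage sketch of the functions this diff touches, based only on the signatures shown. The example DataFrames, their values, and the "retention_time_corrected" column name are illustrative assumptions, not code from the repository.

import pandas as pd

import deimos

# Two feature tables with the columns this module expects
# ("mz", "drift_time", "retention_time", "intensity"); values are made up.
a = pd.DataFrame(
    {
        "mz": [301.998139, 601.978977, 1033.988109],
        "drift_time": [22.5, 31.0, 40.2],
        "retention_time": [5.2, 7.9, 10.4],
        "intensity": [1500.0, 900.0, 450.0],
    }
)
b = pd.DataFrame(
    {
        "mz": [301.998201, 601.979100, 1033.988500],
        "drift_time": [22.51, 31.02, 40.21],
        "retention_time": [5.4, 8.1, 10.6],
        "intensity": [1400.0, 950.0, 500.0],
    }
)

# Bidirectional one-to-one matching within per-dimension tolerances
# (relative for m/z and drift time, absolute for retention time).
a_matched, b_matched = deimos.alignment.match(
    a,
    b,
    dims=["mz", "drift_time", "retention_time"],
    tol=[5e-6, 0.015, 0.3],
    relative=[True, True, False],
)

# Fit a retention-time correction from the matched features; the result is
# a scipy.interpolate.interp1d that maps a's retention times onto b's.
if a_matched is not None:
    spline = deimos.alignment.fit_spline(
        a_matched, b_matched, align="retention_time", kernel="linear"
    )
    a["retention_time_corrected"] = spline(a["retention_time"])

The same column and tolerance conventions carry over to agglomerative_clustering and merge_features, which additionally use a "sample_idx" column for cross-sample connectivity when present and sort on "intensity" before merging.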
2 changes: 1 addition & 1 deletion deimos/calibration.py
@@ -245,7 +245,7 @@ def tunemix_mz(
     features,
     mz=[112.985587, 301.998139, 601.978977, 1033.988109, 1333.968947, 1633.949786],
     mz_tol=200e-6,
-    method='apex'
+    method="apex"
     ):
     """
     Provided tune mix data with known calibration ions (i.e. known m/z),
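
The calibration.py change is the same quote-style normalization applied to tunemix_mz's default method argument. As a hedged sketch only, a call matching the signature shown in this hunk; the features table, its columns, and its values are placeholder assumptions (in practice the input would come from upstream DEIMoS feature detection on tune-mix data), not repository code.

import pandas as pd

import deimos

# Placeholder table of detected tune-mix features; columns are assumed.
features = pd.DataFrame(
    {
        "mz": [112.985601, 301.998150, 601.979005],
        "intensity": [2.0e4, 8.5e4, 3.1e4],
    }
)

# Calibrate m/z against the known tune-mix ions within 200 ppm.
calibrated = deimos.calibration.tunemix_mz(
    features,
    mz=[112.985587, 301.998139, 601.978977, 1033.988109, 1333.968947, 1633.949786],
    mz_tol=200e-6,
    method="apex",
)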
(Diffs for the remaining 25 changed files not shown.)
