Merge pull request #27 from pnnl/fix-warnings
Fix warnings stemming from changes to pandas
smcolby authored Dec 17, 2024
2 parents ed8b280 + 017d866 commit 17391b8
Showing 27 changed files with 1,472 additions and 1,212 deletions.
141 changes: 80 additions & 61 deletions deimos/alignment.py
@@ -9,9 +9,14 @@
 import deimos
 
 
-def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
-          tol=[5E-6, 0.015, 0.3], relative=[True, True, False]):
-    '''
+def match(
+    a,
+    b,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[5e-6, 0.015, 0.3],
+    relative=[True, True, False],
+):
+    """
     Identify features in `b` within tolerance of those in `a`. Matches are
     bidirectionally one-to-one by highest intensity.
@@ -34,7 +39,7 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     Features matched within tolerances. E.g., `a[i..n]` and `b[i..n]` each
     represent matched features.
-    '''
+    """
 
     if a is None or b is None:
         return None, None
@@ -75,7 +80,7 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     # Compute normalized 3d distance
     v1 = a[dims].values / tol
     v2 = b[dims].values / tol
-    dist3d = scipy.spatial.distance.cdist(v1, v2, 'cityblock')
+    dist3d = scipy.spatial.distance.cdist(v1, v2, "cityblock")
     dist3d = np.multiply(dist3d, idx)
 
     # Normalize to 0-1
@@ -84,8 +89,7 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     dist3d = dist3d / dist3d.max()
 
     # Intensities
-    intensity = np.repeat(a['intensity'].values.reshape(-1, 1),
-                          b.shape[0], axis=1)
+    intensity = np.repeat(a["intensity"].values.reshape(-1, 1), b.shape[0], axis=1)
     intensity = np.multiply(intensity, idx)
 
     # Max over dims
@@ -113,9 +117,14 @@ def match(a, b, dims=['mz', 'drift_time', 'retention_time'],
     return a, b
 
 
-def tolerance(a, b, dims=['mz', 'drift_time', 'retention_time'],
-              tol=[5E-6, 0.025, 0.3], relative=[True, True, False]):
-    '''
+def tolerance(
+    a,
+    b,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[5e-6, 0.025, 0.3],
+    relative=[True, True, False],
+):
+    """
     Identify features in `b` within tolerance of those in `a`. Matches are
     potentially many-to-one.
@@ -138,7 +147,7 @@ def tolerance(a, b, dims=['mz', 'drift_time', 'retention_time'],
     Features matched within tolerances. E.g., `a[i..n]` and `b[i..n]` each
     represent matched features.
-    '''
+    """
 
     if a is None or b is None:
         return None, None
@@ -189,8 +198,8 @@ def tolerance(a, b, dims=['mz', 'drift_time', 'retention_time'],
     return a, b
 
 
-def fit_spline(a, b, align='retention_time', **kwargs):
-    '''
+def fit_spline(a, b, align="retention_time", **kwargs):
+    """
     Fit a support vector regressor to matched features.
 
     Parameters
@@ -210,7 +219,7 @@ def fit_spline(a, b, align='retention_time', **kwargs):
     :obj:`~scipy.interpolate.interp1d`
         Interpolated fit of the SVR result.
-    '''
+    """
 
     # Uniqueify
     x = a[align].values
@@ -219,16 +228,16 @@ def fit_spline(a, b, align='retention_time', **kwargs):
     arr = np.unique(arr, axis=0)
 
     # Check kwargs
-    if 'kernel' in kwargs:
-        kernel = kwargs.get('kernel')
+    if "kernel" in kwargs:
+        kernel = kwargs.get("kernel")
     else:
-        kernel = 'linear'
+        kernel = "linear"
 
     # Construct interpolation axis
     newx = np.linspace(arr[:, 0].min(), arr[:, 0].max(), 1000)
 
     # Linear kernel
-    if kernel == 'linear':
+    if kernel == "linear":
         reg = scipy.stats.linregress(x, y)
         newy = reg.slope * newx + reg.intercept
@@ -241,15 +250,18 @@ def fit_spline(a, b, align='retention_time', **kwargs):
         # Predict
         newy = svr.predict(newx.reshape(-1, 1))
 
-    return scipy.interpolate.interp1d(newx, newy,
-                                      kind='linear', fill_value='extrapolate')
+    return scipy.interpolate.interp1d(
+        newx, newy, kind="linear", fill_value="extrapolate"
+    )
 
 
-def agglomerative_clustering(features,
-                             dims=['mz', 'drift_time', 'retention_time'],
-                             tol=[20E-6, 0.03, 0.3],
-                             relative=[True, True, False]):
-    '''
+def agglomerative_clustering(
+    features,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[20e-6, 0.03, 0.3],
+    relative=[True, True, False],
+):
+    """
     Cluster features within provided linkage tolerances. Recursively merges
     the pair of clusters that minimally increases a given linkage distance.
     See :class:`sklearn.cluster.AgglomerativeClustering`.
@@ -271,7 +283,7 @@ def agglomerative_clustering(features,
     features : :obj:`~pandas.DataFrame`
         Features concatenated over samples with cluster labels.
-    '''
+    """
 
     if features is None:
         return None
@@ -288,10 +300,10 @@ def agglomerative_clustering(features,
     features = features.copy()
 
     # Connectivity
-    if 'sample_idx' not in features.columns:
+    if "sample_idx" not in features.columns:
         cmat = None
     else:
-        vals = features['sample_idx'].values.reshape(-1, 1)
+        vals = features["sample_idx"].values.reshape(-1, 1)
         cmat = cdist(vals, vals, metric=lambda x, y: x != y).astype(bool)
 
     # Compute inter-feature distances
@@ -310,8 +322,7 @@ def agglomerative_clustering(features,
         basis = np.where(basis == 0, fix, basis)
 
         # Divide
-        dist = np.divide(dist, basis, out=np.zeros_like(
-            basis), where=basis != 0)
+        dist = np.divide(dist, basis, out=np.zeros_like(basis), where=basis != 0)
 
         # Check tol
         distances.append(dist / tol[i])
@@ -324,25 +335,29 @@ def agglomerative_clustering(features,
 
     # Perform clustering
     try:
-        clustering = AgglomerativeClustering(n_clusters=None,
-                                             linkage='complete',
-                                             metric='precomputed',
-                                             distance_threshold=1,
-                                             connectivity=cmat).fit(distances)
-        features['cluster'] = clustering.labels_
+        clustering = AgglomerativeClustering(
+            n_clusters=None,
+            linkage="complete",
+            metric="precomputed",
+            distance_threshold=1,
+            connectivity=cmat,
+        ).fit(distances)
+        features["cluster"] = clustering.labels_
 
     # All data points are singleton clusters
     except:
-        features['cluster'] = np.arange(len(features.index))
+        features["cluster"] = np.arange(len(features.index))
 
     return features
 
 
-def merge_features(features,
-                   dims=['mz', 'drift_time', 'retention_time'],
-                   tol=[20E-6, 0.03, 0.3],
-                   relative=[True, True, False]):
-    '''
+def merge_features(
+    features,
+    dims=["mz", "drift_time", "retention_time"],
+    tol=[20e-6, 0.03, 0.3],
+    relative=[True, True, False],
+):
+    """
     Merge features within provided tolerances.
 
     Parameters
@@ -362,7 +377,7 @@ def merge_features(features,
     features : :obj:`~pandas.DataFrame`
         Features concatenated over samples with cluster labels.
-    '''
+    """
 
     if features is None:
         return None
@@ -377,40 +392,44 @@ def merge_features(features,
 
     # Copy input
     features = features.copy()
 
     # Sort
-    features = features.sort_values(by='intensity', ascending=False).reset_index(drop=True)
+    features = features.sort_values(by="intensity", ascending=False).reset_index(
+        drop=True
+    )
 
     # Compute inter-feature distances
     distances = None
     for i in range(len(dims)):
         # Construct k-d tree for drift time
         values = features[dims[i]].values
         tree = KDTree(values.reshape(-1, 1))
 
         max_tol = tol[i]
         if relative[i] is True:
             # Maximum absolute tolerance
             max_tol = tol[i] * values.max()
 
         # Compute sparse distance matrix
-        sdm = tree.sparse_distance_matrix(tree, max_tol, output_type='coo_matrix')
+        sdm = tree.sparse_distance_matrix(tree, max_tol, output_type="coo_matrix")
 
         # Only consider forward case, exclude diagonal
         sdm = sparse.triu(sdm, k=1)
 
         # Filter relative distances
         if relative[i] is True:
             # Compute relative distances
-            rel_dists = sdm.data / values[sdm.row] # or col?
+            rel_dists = sdm.data / values[sdm.row]  # or col?
 
             # Indices of relative distances less than tolerance
             idx = rel_dists <= tol[i]
 
             # Reconstruct sparse distance matrix
-            sdm = sparse.coo_matrix((rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
-                                    shape=(len(values), len(values)))
-
+            sdm = sparse.coo_matrix(
+                (rel_dists[idx], (sdm.row[idx], sdm.col[idx])),
+                shape=(len(values), len(values)),
+            )
 
         # Cast as binary matrix
         sdm.data = np.ones_like(sdm.data)
 
@@ -419,7 +438,7 @@ def merge_features(features,
             distances = sdm
         else:
             distances = distances.multiply(sdm)
-
+
     # Extract indices of within-tolerance points
     distances = distances.tocoo()
     pairs = np.stack((distances.row, distances.col), axis=1)
@@ -428,13 +447,13 @@ def merge_features(features,
     to_drop = []
     while len(pairs) > 0:
         # Find root parents and their children
-        root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
-        id_root_parents = np.isin(pairs[:, 0], root_parents)
-        children_of_roots = np.unique(pairs[id_root_parents, 1])
-        to_drop = np.append(to_drop, children_of_roots)
+        root_parents = np.setdiff1d(np.unique(pairs[:, 0]), np.unique(pairs[:, 1]))
+        id_root_parents = np.isin(pairs[:, 0], root_parents)
+        children_of_roots = np.unique(pairs[id_root_parents, 1])
+        to_drop = np.append(to_drop, children_of_roots)
 
-        # Set up pairs array for next iteration
+        # Set up pairs array for next iteration
         pairs = pairs[~np.isin(pairs[:, 1], to_drop), :]
-        pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
+        pairs = pairs[~np.isin(pairs[:, 0], to_drop), :]
 
     return features.reset_index(drop=True).drop(index=np.array(to_drop))
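
Note: the alignment.py changes above are formatting-only (double quotes, black-style signature and call layout, lowercase exponents); behavior is unchanged. For orientation, here is a minimal usage sketch of the functions this diff touches, based only on the signatures shown. The example DataFrames, their values, and the "retention_time_corrected" column name are illustrative assumptions, not code from the repository.

import pandas as pd

import deimos

# Two feature tables with the columns this module expects
# ("mz", "drift_time", "retention_time", "intensity"); values are made up.
a = pd.DataFrame(
    {
        "mz": [301.998139, 601.978977, 1033.988109],
        "drift_time": [22.5, 31.0, 40.2],
        "retention_time": [5.2, 7.9, 10.4],
        "intensity": [1500.0, 900.0, 450.0],
    }
)
b = pd.DataFrame(
    {
        "mz": [301.998201, 601.979100, 1033.988500],
        "drift_time": [22.51, 31.02, 40.21],
        "retention_time": [5.4, 8.1, 10.6],
        "intensity": [1400.0, 950.0, 500.0],
    }
)

# Bidirectional one-to-one matching within per-dimension tolerances
# (relative for m/z and drift time, absolute for retention time).
a_matched, b_matched = deimos.alignment.match(
    a,
    b,
    dims=["mz", "drift_time", "retention_time"],
    tol=[5e-6, 0.015, 0.3],
    relative=[True, True, False],
)

# Fit a retention-time correction from the matched features; the result is
# a scipy.interpolate.interp1d that maps a's retention times onto b's.
if a_matched is not None:
    spline = deimos.alignment.fit_spline(
        a_matched, b_matched, align="retention_time", kernel="linear"
    )
    a["retention_time_corrected"] = spline(a["retention_time"])

The same column and tolerance conventions carry over to agglomerative_clustering and merge_features, which additionally use a "sample_idx" column for cross-sample connectivity when present and sort on "intensity" before merging.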
2 changes: 1 addition & 1 deletion deimos/calibration.py
@@ -245,7 +245,7 @@ def tunemix_mz(
     features,
     mz=[112.985587, 301.998139, 601.978977, 1033.988109, 1333.968947, 1633.949786],
     mz_tol=200e-6,
-    method='apex'
+    method="apex"
     ):
     """
     Provided tune mix data with known calibration ions (i.e. known m/z),
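
The calibration.py change is the same quote-style normalization applied to tunemix_mz's default method argument. As a hedged sketch only, a call matching the signature shown in this hunk; the features table, its columns, and its values are placeholder assumptions (in practice the input would come from upstream DEIMoS feature detection on tune-mix data), not repository code.

import pandas as pd

import deimos

# Placeholder table of detected tune-mix features; columns are assumed.
features = pd.DataFrame(
    {
        "mz": [112.985601, 301.998150, 601.979005],
        "intensity": [2.0e4, 8.5e4, 3.1e4],
    }
)

# Calibrate m/z against the known tune-mix ions within 200 ppm.
calibrated = deimos.calibration.tunemix_mz(
    features,
    mz=[112.985587, 301.998139, 601.978977, 1033.988109, 1333.968947, 1633.949786],
    mz_tol=200e-6,
    method="apex",
)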
(Diffs for the remaining 25 changed files not shown.)
