Skip to content

Commit

Permalink
Merge pull request #8 from IBM/v0.0.12
Browse files Browse the repository at this point in the history
V0.0.12
  • Loading branch information
RaulFD-creator authored Jul 17, 2024
2 parents 56f5d1a + fbea365 commit 1a830ed
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 125 deletions.
33 changes: 20 additions & 13 deletions hestia/dataset_generator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import gzip
import json
from multiprocessing import cpu_count
from typing import Callable, Union

import numpy as np
import pandas as pd
from sklearn.metrics import auc
from tqdm import tqdm
Expand Down Expand Up @@ -46,8 +48,8 @@ def __init__(
as well, defaults to None
:type df_target: pd.DataFrame, optional
:param data_type: Biochemical data_type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
Options: `protein`, `protein_structure`, `DNA`, `RNA`, or
`small_molecule`; defaults to 'protein'
:type data_type: str, optional
:param similarity_metric: Similarity function to use.
Options:
Expand Down Expand Up @@ -281,8 +283,8 @@ def calculate_partitions(
th_parts = partition_algorithm(
self.data,
label_name=label_name, test_size=test_size,
valid_size=valid_size, threshold=th / 100,
sim_df=self.sim_df
threshold=th / 100,
sim_df=self.sim_df, verbose=2
)
train_th_parts = random_partition(
self.data.iloc[th_parts[0]].reset_index(drop=True),
Expand Down Expand Up @@ -345,7 +347,7 @@ def generate_datasets(self, dataset_type: str, threshold: float) -> dict:
return ds

@staticmethod
def calculate_aboid(results: dict, metric: str) -> float:
def calculate_augood(results: dict, metric: str) -> float:
"""Calculate Area between the similarity-performance
curve (out-of-distribution) and the in-distribution performance.
Expand All @@ -364,11 +366,14 @@ def calculate_aboid(results: dict, metric: str) -> float:
if key == 'random':
continue
x.append(float(key))
y.append(float(results['random'][metric]) - float(value[metric]))
return auc(x, y)
y.append(float(value[metric]))
idxs = np.argsort(x)
x, y = np.array(x), np.array(y)
min_x, max_x = np.min(x), np.max(x)
return auc(x[idxs], y[idxs]) / (max_x - min_x)

@staticmethod
def plot_aboid(results: dict, metric: str):
def plot_good(results: dict, metric: str):
"""Plot the Area between the similarity-performance
curve (out-of-distribution) and the in-distribution performance.
Expand All @@ -387,10 +392,12 @@ def plot_aboid(results: dict, metric: str):
continue
x.append(float(key))
y.append(float(value[metric]))
plt.scatter(x, y)
plt.plot(x, [results['random'][metric] for _ in range(len(x))], 'r')
idxs = np.argsort(x)
x, y = np.array(x), np.array(y)
plt.plot(x[idxs], y[idxs])
# plt.plot(x[idxs], [results['random'][metric] for _ in range(len(x))], 'r')
plt.ylabel(f'Performance: {metric}')
plt.xlabel(f'Threshold similarity')
plt.legend(['SP', 'Random'])
plt.ylim(0, 1.1)
plt.show()
# plt.legend(['SP', 'Random'])
# plt.ylim(0, 1.1)
# plt.show()
244 changes: 133 additions & 111 deletions hestia/similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import shutil
import subprocess
import time
from typing import List, Union
from typing import Callable, List, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -52,7 +52,7 @@ def calculate_similarity(
df_query: pd.DataFrame,
df_target: pd.DataFrame = None,
data_type: str = 'protein',
similarity_metric: str = 'mmseqs+prefilter',
similarity_metric: Union[str, Callable] = 'mmseqs+prefilter',
field_name: str = 'sequence',
threshold: float = 0.3,
threads: int = cpu_count(),
Expand All @@ -64,7 +64,8 @@ def calculate_similarity(
radius: int = 2,
denominator: str = 'shortest',
representation: str = '3di+aa',
config: dict = None
config: dict = None,
**kwargs
) -> pd.DataFrame:
"""Calculate similarity between entities in
`df_query` and `df_target`. Entities can be
Expand All @@ -80,7 +81,7 @@ def calculate_similarity(
:param data_type: Biochemical data_type to which the data belongs.
Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
'protein'
:type data_type: str, optional
:type data_type: str, optional
:param similarity_metric: Similarity function to use.
Options:
- `protein`: `mmseqs` (local alignment),
Expand All @@ -93,6 +94,11 @@ def calculate_similarity(
scaffolds: either identical or not) or
`fingerprint` (Tanimoto distance between ECFP (extended connectivity
fingerprints))
- It can also be a custom-made function. It has to fulfill three requirements:
1) be symmetrical, 2) be normalised to the interval [0, 1], 3) satisfy f(x_i, x_i) = 1.
It should support all values within the SimilarityArguments object. If
it requires additional inputs, they can be passed to this wrapper function as
key=value options at the end.
Defaults to `mmseqs+prefilter`.
:type similarity_metric: Union[str, Callable], optional
:param field_name: Name of the field with the entity information
Expand Down Expand Up @@ -172,114 +178,130 @@ def calculate_similarity(
mssg += f'not implemented for data_type: {data_type}'
mssg2 = f'data_type: {data_type} not supported'

if data_type == 'protein':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
prefilter='prefilter' in similarity_metric,
denominator=denominator,
is_nucleotide=False,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'needle':
sim_df = _needle_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
is_nucleotide=False,
verbose=verbose,
config=config,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'foldseek':
sim_df = _foldseek_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
prefilter=False,
denominator=denominator,
representation=representation,
threads=threads,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
else:
mssg = f'Alignment method: {similarity_metric} '
mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
elif data_type.upper() == 'DNA' or data_type.upper() == 'RNA':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
prefilter='prefilter' in similarity_metric,
denominator=denominator,
is_nucleotide=True,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'needle':
sim_df = _needle_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
is_nucleotide=True,
verbose=verbose,
config=config,
save_alignment=save_alignment,
filename=filename
)
else:
mssg = f'Alignment method: {similarity_metric} '
mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
elif data_type == 'small_molecule' or data_type.lower() == 'smiles':
if similarity_metric == 'scaffold':
sim_df = _scaffold_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threads=threads,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'fingerprint':
sim_df = _fingerprint_alignment(
df_query=df_query,
df_target=df_target,
threshold=threshold,
field_name=field_name,
distance=distance,
threads=threads,
verbose=verbose,
bits=bits,
radius=radius,
save_alignment=save_alignment,
filename=filename
)
else:
mssg = f'Alignment method: {similarity_metric} '
mssg += f'not implemented for data_type: {data_type}'
if isinstance(similarity_metric, Callable):
sim_df = similarity_metric(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
prefilter=False,
denominator=denominator,
is_nucleotide=False,
verbose=verbose,
save_alignment=save_alignment,
filename=filename,
**kwargs
)
else:
raise NotImplementedError(mssg2)
if data_type == 'protein':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
prefilter='prefilter' in similarity_metric,
denominator=denominator,
is_nucleotide=False,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'needle':
sim_df = _needle_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
is_nucleotide=False,
verbose=verbose,
config=config,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'foldseek':
sim_df = _foldseek_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
prefilter=False,
denominator=denominator,
representation=representation,
threads=threads,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
else:
mssg = f'Alignment method: {similarity_metric} '
mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
elif data_type.upper() == 'DNA' or data_type.upper() == 'RNA':
if 'mmseqs' in similarity_metric:
sim_df = _mmseqs2_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
prefilter='prefilter' in similarity_metric,
denominator=denominator,
is_nucleotide=True,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'needle':
sim_df = _needle_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threshold=threshold,
threads=threads,
is_nucleotide=True,
verbose=verbose,
config=config,
save_alignment=save_alignment,
filename=filename
)
else:
mssg = f'Alignment method: {similarity_metric} '
mssg += f'not implemented for data_type: {data_type}'
raise NotImplementedError(mssg)
elif data_type == 'small_molecule' or data_type.lower() == 'smiles':
if similarity_metric == 'scaffold':
sim_df = _scaffold_alignment(
df_query=df_query,
df_target=df_target,
field_name=field_name,
threads=threads,
verbose=verbose,
save_alignment=save_alignment,
filename=filename
)
elif similarity_metric == 'fingerprint':
sim_df = _fingerprint_alignment(
df_query=df_query,
df_target=df_target,
threshold=threshold,
field_name=field_name,
distance=distance,
threads=threads,
verbose=verbose,
bits=bits,
radius=radius,
save_alignment=save_alignment,
filename=filename
)
else:
mssg = f'Alignment method: {similarity_metric} '
mssg += f'not implemented for data_type: {data_type}'
else:
raise NotImplementedError(mssg2)
return sim_df


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,6 @@
test_suite='tests',
tests_require=test_requirements,
url='https://github.com/IBM/Hestia-OOD',
version='0.0.11',
version='0.0.12',
zip_safe=False,
)

0 comments on commit 1a830ed

Please sign in to comment.