Merge pull request #8 from IBM/v0.0.12

V0.0.12
IBM · Jul 17, 2024 · 1a830ed · 1a830ed
2 parents 56f5d1a + fbea365
commit 1a830ed
Show file tree

Hide file tree

Showing 3 changed files with 154 additions and 125 deletions.
diff --git a/hestia/dataset_generator.py b/hestia/dataset_generator.py
@@ -1,7 +1,9 @@
 import gzip
 import json
 from multiprocessing import cpu_count
+from typing import Callable, Union
 
+import numpy as np
 import pandas as pd
 from sklearn.metrics import auc
 from tqdm import tqdm
@@ -46,8 +48,8 @@ def __init__(
         as well, defaults to None
         :type df_target: pd.DataFrame, optional
         :param data_type: Biochemical data_type to which the data belongs.
-        Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
-        'protein'
+        Options: `protein`, `protein_structure`, `DNA`, `RNA`, or
+        `small_molecule`; defaults to 'protein'
         :type data_type: str, optional
         :param similarity_metric: Similarity function to use.
         Options:
@@ -281,8 +283,8 @@ def calculate_partitions(
             th_parts = partition_algorithm(
                 self.data,
                 label_name=label_name, test_size=test_size,
-                valid_size=valid_size, threshold=th / 100,
-                sim_df=self.sim_df
+                threshold=th / 100,
+                sim_df=self.sim_df, verbose=2
             )
             train_th_parts = random_partition(
                 self.data.iloc[th_parts[0]].reset_index(drop=True),
@@ -345,7 +347,7 @@ def generate_datasets(self, dataset_type: str, threshold: float) -> dict:
             return ds
 
     @staticmethod
-    def calculate_aboid(results: dict, metric: str) -> float:
+    def calculate_augood(results: dict, metric: str) -> float:
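-        """Calculate Area between the similarity-performance
-        curve (out-of-distribution) and the in-distribution performance.
+        """Calculate the area under the similarity-performance
+        curve (out-of-distribution), normalised by the threshold range.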
 
@@ -364,11 +366,14 @@ def calculate_aboid(results: dict, metric: str) -> float:
             if key == 'random':
                 continue
             x.append(float(key))
-            y.append(float(results['random'][metric]) - float(value[metric]))
-        return auc(x, y)
+            y.append(float(value[metric]))
+        idxs = np.argsort(x)
+        x, y = np.array(x), np.array(y)
+        min_x, max_x = np.min(x), np.max(x)
+        return auc(x[idxs], y[idxs]) / (max_x - min_x)
 
     @staticmethod
-    def plot_aboid(results: dict, metric: str):
+    def plot_good(results: dict, metric: str):
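-        """Plot the Area between the similarity-performance
-        curve (out-of-distribution) and the in-distribution performance.
+        """Plot the similarity-performance curve
+        (out-of-distribution) as a line over the similarity thresholds.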
 
@@ -387,10 +392,12 @@ def plot_aboid(results: dict, metric: str):
                 continue
             x.append(float(key))
             y.append(float(value[metric]))
-        plt.scatter(x, y)
-        plt.plot(x, [results['random'][metric] for _ in range(len(x))], 'r')
+        idxs = np.argsort(x)
+        x, y = np.array(x), np.array(y)
+        plt.plot(x[idxs], y[idxs])
+        # plt.plot(x[idxs], [results['random'][metric] for _ in range(len(x))], 'r')
         plt.ylabel(f'Performance: {metric}')
         plt.xlabel(f'Threshold similarity')
-        plt.legend(['SP', 'Random'])
-        plt.ylim(0, 1.1)
-        plt.show()
+        # plt.legend(['SP', 'Random'])
+        # plt.ylim(0, 1.1)
+        # plt.show()
diff --git a/hestia/similarity.py b/hestia/similarity.py
@@ -3,7 +3,7 @@
 import shutil
 import subprocess
 import time
-from typing import List, Union
+from typing import Callable, List, Union
 
 import numpy as np
 import pandas as pd
@@ -52,7 +52,7 @@ def calculate_similarity(
     df_query: pd.DataFrame,
     df_target: pd.DataFrame = None,
     data_type: str = 'protein',
-    similarity_metric: str = 'mmseqs+prefilter',
+    similarity_metric: Union[str, Callable] = 'mmseqs+prefilter',
     field_name: str = 'sequence',
     threshold: float = 0.3,
     threads: int = cpu_count(),
@@ -64,7 +64,8 @@ def calculate_similarity(
     radius: int = 2,
     denominator: str = 'shortest',
     representation: str = '3di+aa',
-    config: dict = None
+    config: dict = None,
+    **kwargs
 ) -> pd.DataFrame:
     """Calculate similarity between entities in
     `df_query` and `df_target`. Entities can be
@@ -80,7 +81,7 @@ def calculate_similarity(
     :param data_type: Biochemical data_type to which the data belongs.
     Options: `protein`, `DNA`, `RNA`, or `small_molecule`; defaults to
     'protein'
     :type data_type: str, optional
     :param similarity_metric: Similarity function to use.
     Options:
         - `protein`: `mmseqs` (local alignment),
@@ -93,6 +94,11 @@ def calculate_similarity(
            scaffolds: either identical or not) or
           `fingerprint` (Tanimoto distance between ECFP (extended connectivity
            fingerprints))
+        - It can also be a custom-made function. It has to fulfill three requirements:
+          1) be symmetrical, 2) be normalised to the interval [0, 1], 3) f(x_i, x_i) = 1.
+          It should support all values within the SimilarityArguments object. If
+          it requires additional inputs, they can be passed to this wrapper function
+          as key=value options at the end.
     Defaults to `mmseqs+prefilter`.
-    :type similarity_metric: str, optional
+    :type similarity_metric: Union[str, Callable], optional
     :param field_name: Name of the field with the entity information
@@ -172,114 +178,130 @@ def calculate_similarity(
     mssg += f'not implemented for data_type: {data_type}'
     mssg2 = f'data_type: {data_type} not supported'
 
-    if data_type == 'protein':
-        if 'mmseqs' in similarity_metric:
-            sim_df = _mmseqs2_alignment(
-                df_query=df_query,
-                df_target=df_target,
-                field_name=field_name,
-                threshold=threshold,
-                threads=threads,
-                prefilter='prefilter' in similarity_metric,
-                denominator=denominator,
-                is_nucleotide=False,
-                verbose=verbose,
-                save_alignment=save_alignment,
-                filename=filename
-            )
-        elif similarity_metric == 'needle':
-            sim_df = _needle_alignment(
-                df_query=df_query,
-                df_target=df_target,
-                field_name=field_name,
-                threshold=threshold,
-                threads=threads,
-                is_nucleotide=False,
-                verbose=verbose,
-                config=config,
-                save_alignment=save_alignment,
-                filename=filename
-            )
-        elif similarity_metric == 'foldseek':
-            sim_df = _foldseek_alignment(
-                df_query=df_query,
-                df_target=df_target,
-                field_name=field_name,
-                threshold=threshold,
-                prefilter=False,
-                denominator=denominator,
-                representation=representation,
-                threads=threads,
-                verbose=verbose,
-                save_alignment=save_alignment,
-                filename=filename
-            )
-        else:
-            mssg = f'Alignment method: {similarity_metric} '
-            mssg += f'not implemented for data_type: {data_type}'
-            raise NotImplementedError(mssg)
-    elif data_type.upper() == 'DNA' or data_type.upper() == 'RNA':
-        if 'mmseqs' in similarity_metric:
-            sim_df = _mmseqs2_alignment(
-                df_query=df_query,
-                df_target=df_target,
-                field_name=field_name,
-                threshold=threshold,
-                threads=threads,
-                prefilter='prefilter' in similarity_metric,
-                denominator=denominator,
-                is_nucleotide=True,
-                verbose=verbose,
-                save_alignment=save_alignment,
-                filename=filename
-            )
-        elif similarity_metric == 'needle':
-            sim_df = _needle_alignment(
-                df_query=df_query,
-                df_target=df_target,
-                field_name=field_name,
-                threshold=threshold,
-                threads=threads,
-                is_nucleotide=True,
-                verbose=verbose,
-                config=config,
-                save_alignment=save_alignment,
-                filename=filename
-            )
-        else:
-            mssg = f'Alignment method: {similarity_metric} '
-            mssg += f'not implemented for data_type: {data_type}'
-            raise NotImplementedError(mssg)
-    elif data_type == 'small_molecule' or data_type.lower() == 'smiles':
-        if similarity_metric == 'scaffold':
-            sim_df = _scaffold_alignment(
-                df_query=df_query,
-                df_target=df_target,
-                field_name=field_name,
-                threads=threads,
-                verbose=verbose,
-                save_alignment=save_alignment,
-                filename=filename
-            )
-        elif similarity_metric == 'fingerprint':
-            sim_df = _fingerprint_alignment(
-                df_query=df_query,
-                df_target=df_target,
-                threshold=threshold,
-                field_name=field_name,
-                distance=distance,
-                threads=threads,
-                verbose=verbose,
-                bits=bits,
-                radius=radius,
-                save_alignment=save_alignment,
-                filename=filename
-            )
-        else:
-            mssg = f'Alignment method: {similarity_metric} '
-            mssg += f'not implemented for data_type: {data_type}'
+    if isinstance(similarity_metric, Callable):
+        sim_df = similarity_metric(
+            df_query=df_query,
+            df_target=df_target,
+            field_name=field_name,
+            threshold=threshold,
+            threads=threads,
+            prefilter=False,
+            denominator=denominator,
+            is_nucleotide=False,
+            verbose=verbose,
+            save_alignment=save_alignment,
+            filename=filename,
+            **kwargs
+        )
     else:
-        raise NotImplementedError(mssg2)
+        if data_type == 'protein':
+            if 'mmseqs' in similarity_metric:
+                sim_df = _mmseqs2_alignment(
+                    df_query=df_query,
+                    df_target=df_target,
+                    field_name=field_name,
+                    threshold=threshold,
+                    threads=threads,
+                    prefilter='prefilter' in similarity_metric,
+                    denominator=denominator,
+                    is_nucleotide=False,
+                    verbose=verbose,
+                    save_alignment=save_alignment,
+                    filename=filename
+                )
+            elif similarity_metric == 'needle':
+                sim_df = _needle_alignment(
+                    df_query=df_query,
+                    df_target=df_target,
+                    field_name=field_name,
+                    threshold=threshold,
+                    threads=threads,
+                    is_nucleotide=False,
+                    verbose=verbose,
+                    config=config,
+                    save_alignment=save_alignment,
+                    filename=filename
+                )
+            elif similarity_metric == 'foldseek':
+                sim_df = _foldseek_alignment(
+                    df_query=df_query,
+                    df_target=df_target,
+                    field_name=field_name,
+                    threshold=threshold,
+                    prefilter=False,
+                    denominator=denominator,
+                    representation=representation,
+                    threads=threads,
+                    verbose=verbose,
+                    save_alignment=save_alignment,
+                    filename=filename
+                )
+            else:
+                mssg = f'Alignment method: {similarity_metric} '
+                mssg += f'not implemented for data_type: {data_type}'
+                raise NotImplementedError(mssg)
+        elif data_type.upper() == 'DNA' or data_type.upper() == 'RNA':
+            if 'mmseqs' in similarity_metric:
+                sim_df = _mmseqs2_alignment(
+                    df_query=df_query,
+                    df_target=df_target,
+                    field_name=field_name,
+                    threshold=threshold,
+                    threads=threads,
+                    prefilter='prefilter' in similarity_metric,
+                    denominator=denominator,
+                    is_nucleotide=True,
+                    verbose=verbose,
+                    save_alignment=save_alignment,
+                    filename=filename
+                )
+            elif similarity_metric == 'needle':
+                sim_df = _needle_alignment(
+                    df_query=df_query,
+                    df_target=df_target,
+                    field_name=field_name,
+                    threshold=threshold,
+                    threads=threads,
+                    is_nucleotide=True,
+                    verbose=verbose,
+                    config=config,
+                    save_alignment=save_alignment,
+                    filename=filename
+                )
+            else:
+                mssg = f'Alignment method: {similarity_metric} '
+                mssg += f'not implemented for data_type: {data_type}'
+                raise NotImplementedError(mssg)
+        elif data_type == 'small_molecule' or data_type.lower() == 'smiles':
+            if similarity_metric == 'scaffold':
+                sim_df = _scaffold_alignment(
+                    df_query=df_query,
+                    df_target=df_target,
+                    field_name=field_name,
+                    threads=threads,
+                    verbose=verbose,
+                    save_alignment=save_alignment,
+                    filename=filename
+                )
+            elif similarity_metric == 'fingerprint':
+                sim_df = _fingerprint_alignment(
+                    df_query=df_query,
+                    df_target=df_target,
+                    threshold=threshold,
+                    field_name=field_name,
+                    distance=distance,
+                    threads=threads,
+                    verbose=verbose,
+                    bits=bits,
+                    radius=radius,
+                    save_alignment=save_alignment,
+                    filename=filename
+                )
+            else:
+                mssg = f'Alignment method: {similarity_metric} '
+                mssg += f'not implemented for data_type: {data_type}'
+        else:
+            raise NotImplementedError(mssg2)
     return sim_df
 
 

diff --git a/setup.py b/setup.py
@@ -48,6 +48,6 @@
     test_suite='tests',
     tests_require=test_requirements,
     url='https://github.com/IBM/Hestia-OOD',
-    version='0.0.11',
+    version='0.0.12',
     zip_safe=False,
 )