Skip to content

Commit

Permalink
Merge branch 'enhancement/interaction_fingerprints_alternate' into 'dev'
Browse files Browse the repository at this point in the history
Alternative implementation of joining on arbitrary columns

See merge request cdd/QSPRpred!178
  • Loading branch information
martin-sicho committed Mar 21, 2024
2 parents 7f63e85 + 6f84dd5 commit 4d37019
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 29 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Change Log

From v3.0.2 to v3.0.3
From v3.0.1 to v3.0.2

## Fixes

- Fixed a bug in `QSPRDataset` where property transformations were not applied.
- Fixed a bug where an attached standardizer would be refit when calling
`QSPRModel.predictMols` with `use_applicability_domain=True`.
- Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`.
Expand All @@ -14,6 +15,8 @@ None.

## New Features

- The `DataFrameDescriptorSet` class was extended to allow more flexibility when joining
custom descriptor sets.
- Added the `prepMols` method to `DescriptorSet` to allow separated customization of
molecule preparation before descriptor calculation.

Expand Down
85 changes: 77 additions & 8 deletions qsprpred/data/descriptors/sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,15 +177,66 @@ def getDescriptors(
class DataFrameDescriptorSet(DescriptorSet):
"""`DescriptorSet` that uses a `pandas.DataFrame` of precalculated descriptors."""

def __init__(self, df: pd.DataFrame):
@staticmethod
def setIndex(df: pd.DataFrame, cols: list[str]):
"""Create a multi-index from several columns of the data set.
Args:
df (pd.DataFrame): DataFrame to set index for.
cols (list[str]): List of columns to use as the new multi-index.
"""
df_index_tuples = df[cols].values
df_index_tuples = tuple(map(tuple, df_index_tuples))
df_index = pd.MultiIndex.from_tuples(df_index_tuples, names=cols)
df.index = df_index
return df

def __init__(
    self,
    df: pd.DataFrame,
    joining_cols: list[str] | None = None,
    suffix: str = "",
    source_is_multi_index: bool = False,
):
    """Initialize the descriptor set with a dataframe of descriptors.

    Args:
        df (pd.DataFrame):
            dataframe of descriptors
        joining_cols (list[str] | None):
            list of columns to use as joining index,
            properties of the same name must exist in the data set
            this descriptor is added to
        suffix (str):
            suffix to add to the descriptor name
        source_is_multi_index (bool):
            assume that a multi-index is already present in the supplied
            dataframe. If `True`, the `joining_cols` argument must
            also be specified to indicate which properties should
            be used to create the multi-index in the destination.

    Raises:
        ValueError: if `source_is_multi_index` is `True` but
            `joining_cols` was not supplied.
    """
    super().__init__()
    if source_is_multi_index and not joining_cols:
        raise ValueError(
            "When 'source_is_multi_index=True', 'joining_cols' must be specified."
        )
    self._df = df
    # NOTE(review): setIndex mutates the supplied dataframe in place
    # (its index is replaced) — callers passing a shared frame see this.
    if joining_cols and not source_is_multi_index:
        self._df = self.setIndex(self._df, joining_cols)
    self._cols = joining_cols
    # descriptor names are all dataframe columns except the joining columns
    self._descriptors = df.columns.tolist() if df is not None else []
    if joining_cols:
        self._descriptors = [
            col for col in self._descriptors if col not in joining_cols
        ]
    self.suffix = suffix

@property
def requiredProps(self) -> list[str]:
    """Return the required properties for the dataframe.

    Returns:
        list[str]: the parent class requirements plus any joining columns,
            deduplicated while preserving first-seen order.
    """
    prior = super().requiredProps
    new = prior + self._cols if self._cols is not None else prior
    # dict.fromkeys removes duplicates deterministically (insertion order),
    # unlike list(set(...)) whose ordering is arbitrary between runs
    return list(dict.fromkeys(new))

def getDF(self):
"""Return the dataframe of descriptors."""
Expand All @@ -195,6 +246,10 @@ def getIndex(self):
"""Return the index of the dataframe."""
return self._df.index if self._df is not None else None

def getIndexCols(self):
    """Return the index columns of the dataframe (``None`` if no dataframe is set)."""
    if self._df is None:
        return None
    return self._cols

def getDescriptors(
self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
) -> np.ndarray:
Expand All @@ -210,11 +265,25 @@ def getDescriptors(
Returns:
numpy array of descriptor values of shape (n_mols, n_descriptors)
"""
index = pd.Index(props[self.idProp], name=self.idProp)
if self._df is None:
raise ValueError("No dataframe set.")
ret = pd.DataFrame(index=index)
ret = ret.merge(self._df, how="left", left_index=True, right_index=True)
# create a return data frame with the desired columns as index
index_cols = self.getIndexCols()
if index_cols:
ret = pd.DataFrame(
# fetch the join columns from our required props
{col: props[col] for col in index_cols}
)
ret = self.setIndex(ret, index_cols) # set our multi-index
ret.drop(columns=index_cols, inplace=True) # only keep the index
else:
ret = pd.DataFrame(index=pd.Index(props[self.idProp], name=self.idProp))
ret = ret.join(
# join in our descriptors
# each molecule gets the correct descriptors from the data frame
self._df,
how="left",
on=index_cols,
)
# ret is in the same order as the input mols, so we can just return the values
return ret[self.descriptors].values

@property
Expand All @@ -226,7 +295,7 @@ def descriptors(self, value):
self._descriptors = value

def __str__(self):
    """Return the set's display name, prefixed with the suffix when one was given."""
    if self.suffix:
        return f"{self.suffix}_DataFrame"
    return "DataFrame"


class DrugExPhyschem(DescriptorSet):
Expand Down
85 changes: 75 additions & 10 deletions qsprpred/data/processing/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

import numpy as np
import pandas as pd
from mlchemad.applicability_domains import KNNApplicabilityDomain
from parameterized import parameterized
from rdkit.Chem import Mol
from mlchemad.applicability_domains import KNNApplicabilityDomain
from sklearn.preprocessing import StandardScaler

from .mol_processor import MolProcessor
Expand All @@ -30,6 +30,7 @@ class TestDataFilters(DataSetsPathMixIn, QSPRTestCase):
The tests here should be used to check for all their specific parameters and
edge cases."""

def setUp(self):
super().setUp()
self.setUpPaths()
Expand Down Expand Up @@ -102,10 +103,17 @@ def testConsistency(self):


class TestFeatureFilters(PathMixIn, QSPRTestCase):
"""Tests to check if the feature filters work on their own."""
"""Tests to check if the feature filters work on their own.
Note: This also tests the `DataFrameDescriptorSet`,
as it is used to add test descriptors.
"""

def setUp(self):
"""Set up the small test Dataframe."""
super().setUp()
self.nCPU = 2 # just to test parallel processing
self.chunkSize = 2
self.setUpPaths()
descriptors = [
"Descriptor_F1",
Expand Down Expand Up @@ -133,39 +141,94 @@ def setUp(self):
)
self.dataset = QSPRDataset(
"TestFeatureFilters",
target_props=[{
"name": "y",
"task": TargetTasks.REGRESSION
}],
target_props=[{"name": "y", "task": TargetTasks.REGRESSION}],
df=self.df,
store_dir=self.generatedPath,
n_jobs=self.nCPU,
chunk_size=self.chunkSize,
)
self.df_descriptors.index = self.dataset.df.index
self.df_descriptors["QSPRID"] = self.dataset.getProperty(
self.dataset.idProp
).values
self.df_descriptors.set_index("QSPRID", inplace=True, drop=True)
self.dataset.addDescriptors([DataFrameDescriptorSet(self.df_descriptors)])
self.descriptors = self.dataset.featureNames

def testLowVarianceFilter(self):
def recalculateWithMultiIndex(self):
    """Drop the attached descriptors and re-add them joined on a
    two-column multi-index derived from the molecule IDs."""
    self.dataset.dropDescriptors(self.dataset.descriptorSets)
    ids = self.dataset.getProperty(self.dataset.idProp)
    # split each "<prefix>_<number>" ID into two separate joining columns
    id_cols = ["ID_COL1", "ID_COL2"]
    for part, col in enumerate(id_cols):
        values = [mol_id.split("_")[part] for mol_id in ids]
        self.df_descriptors[col] = values
        self.dataset.addProperty(col, self.df_descriptors[col].values)
    self.dataset.addDescriptors(
        [
            DataFrameDescriptorSet(
                self.df_descriptors,
                id_cols,
            )
        ]
    )

# def testDefaultDescriptorAdd(self):
# """Test adding without index columns."""
# # TODO: issue 88 needs to be solved for this to work
# self.dataset.nJobs = 1
# df_new = self.dataset.getFeatures(concat=True).copy()
# calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc")
# self.dataset.addDescriptors([calc])

@parameterized.expand(
    [
        (True,),
        (False,),
    ]
)
def testLowVarianceFilter(self, use_index_cols):
    """Test the low variance filter, which drops features with a variance below
    a threshold."""
    if use_index_cols:
        self.recalculateWithMultiIndex()
    self.dataset.filterFeatures([LowVarianceFilter(0.01)])
    # only the first descriptor should be dropped; values must stay original
    expected = self.descriptors[1:]
    for surviving in (self.dataset.featureNames, self.dataset.X.columns):
        self.assertListEqual(list(surviving), expected)

def testHighCorrelationFilter(self):
@parameterized.expand(
    [
        (True,),
        (False,),
    ]
)
def testHighCorrelationFilter(self, use_index_cols):
    """Test the high correlation filter, which drops features with a correlation
    above a threshold."""
    if use_index_cols:
        self.recalculateWithMultiIndex()
    self.dataset.filterFeatures([HighCorrelationFilter(0.8)])
    # the descriptor at position 2 is expected to be dropped by the filter
    del self.descriptors[2]
    for surviving in (self.dataset.featureNames, self.dataset.X.columns):
        self.assertListEqual(list(surviving), self.descriptors)

def testBorutaFilter(self):
@parameterized.expand(
[
(True,),
(False,),
]
)
def testBorutaFilter(self, use_index_cols):
"""Test the Boruta filter, which removes the features which are statistically as
relevant as random features."""
if use_index_cols:
self.recalculateWithMultiIndex()
self.dataset.filterFeatures([BorutaFilter()])
# check if correct columns selected and values still original
self.assertListEqual(list(self.dataset.featureNames), self.descriptors[-1:])
Expand All @@ -174,6 +237,7 @@ def testBorutaFilter(self):

class TestFeatureStandardizer(DataSetsPathMixIn, QSPRTestCase):
"""Test the feature standardizer."""

def setUp(self):
"""Create a small test dataset with MorganFP descriptors."""
super().setUp()
Expand Down Expand Up @@ -277,6 +341,7 @@ def testMolProcess(self, _, n_jobs, chunk_size, props, add_rdkit, args, kwargs):

class testApplicabilityDomain(DataSetsPathMixIn, QSPRTestCase):
"""Test the applicability domain."""

def setUp(self):
"""Create a small test dataset with MorganFP descriptors."""
super().setUp()
Expand Down
21 changes: 11 additions & 10 deletions qsprpred/data/tables/mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,25 +779,26 @@ def addDescriptors(
df_descriptors = pd.concat(df_descriptors, axis=0)
df_descriptors[self.indexCols] = None
df_descriptors.loc[self.df.index, self.indexCols] = self.df[self.indexCols]
self.attachDescriptors(calculator, df_descriptors, self.indexCols)
self.attachDescriptors(calculator, df_descriptors, [self.idProp])

def getDescriptors(self):
    """Get the calculated descriptors as a pandas data frame.

    Returns:
        pd.DataFrame: Data frame containing only descriptors, with one row
            per molecule in the same order as `self.df` and indexed by the
            molecule IDs (`self.idProp`).
    """
    # Removed the commented-out pre-refactor join logic that referenced the
    # no-longer-existing `join_cols` variable — dead code left from the diff.
    # Start from an empty frame indexed by molecule ID so every descriptor
    # table can be left-joined in without changing row order.
    ret = pd.DataFrame(index=pd.Index(self.df.index.values, name=self.idProp))
    for descriptors in self.descriptors:
        # each descriptor table is aligned to the same ID index, so a plain
        # index join attaches the correct values to each molecule
        df_descriptors = descriptors.getDescriptors()
        ret = ret.join(df_descriptors, how="left")
    return ret

def getDescriptorNames(self):
Expand Down

0 comments on commit 4d37019

Please sign in to comment.