Skip to content

Commit

Permalink
Merge branch 'enhancement/interaction_fingerprints_alternate' into 'dev'
Browse files Browse the repository at this point in the history
Alternative implementation of joining on arbitrary columns

See merge request cdd/QSPRpred!178
  • Loading branch information
martin-sicho committed Mar 21, 2024
2 parents 7f63e85 + 6f84dd5 commit 4d37019
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 29 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Change Log

From v3.0.2 to v3.0.3
From v3.0.1 to v3.0.2

## Fixes

- Fixed a bug in `QSPRDataset` where property transformations were not applied.
- Fixed a bug where an attached standardizer would be refit when calling
`QSPRModel.predictMols` with `use_applicability_domain=True`.
- Fixed random seed not set in `FoldsFromDataSplit.iterFolds` for `ClusterSplit`.
Expand All @@ -14,6 +15,8 @@ None.

## New Features

- The `DataFrameDescriptorSet` class was extended to allow more flexibility when joining
custom descriptor sets.
- Added the `prepMols` method to `DescriptorSet` to allow separated customization of
molecule preparation before descriptor calculation.

Expand Down
85 changes: 77 additions & 8 deletions qsprpred/data/descriptors/sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,15 +177,66 @@ def getDescriptors(
class DataFrameDescriptorSet(DescriptorSet):
"""`DescriptorSet` that uses a `pandas.DataFrame` of precalculated descriptors."""

def __init__(self, df: pd.DataFrame):
@staticmethod
def setIndex(df: pd.DataFrame, cols: list[str]):
"""Create a multi-index from several columns of the data set.
Args:
df (pd.DataFrame): DataFrame to set index for.
cols (list[str]): List of columns to use as the new multi-index.
"""
df_index_tuples = df[cols].values
df_index_tuples = tuple(map(tuple, df_index_tuples))
df_index = pd.MultiIndex.from_tuples(df_index_tuples, names=cols)
df.index = df_index
return df

def __init__(
    self,
    df: pd.DataFrame,
    joining_cols: list[str] | None = None,
    suffix: str = "",
    source_is_multi_index: bool = False,
):
    """Initialize the descriptor set with a dataframe of descriptors.

    Args:
        df (pd.DataFrame):
            dataframe of descriptors
        joining_cols (list[str] | None):
            list of columns to use as joining index,
            properties of the same name must exist in the data set
            this descriptor is added to
        suffix (str):
            suffix to add to the descriptor name
        source_is_multi_index (bool):
            assume that a multi-index is already present in the supplied
            dataframe. If `True`, the `joining_cols` argument must
            also be specified to indicate which properties should
            be used to create the multi-index in the destination.

    Raises:
        ValueError: if `source_is_multi_index` is `True` but
            `joining_cols` was not supplied.
    """
    super().__init__()
    if source_is_multi_index and not joining_cols:
        raise ValueError(
            "When 'source_is_multi_index=True', 'joining_cols' must be specified."
        )
    self._df = df
    # NOTE(review): setIndex mutates the supplied dataframe in place
    # (its index is replaced) — callers passing a shared frame see this.
    if joining_cols and not source_is_multi_index:
        self._df = self.setIndex(self._df, joining_cols)
    self._cols = joining_cols
    # descriptor names are all dataframe columns except the joining columns
    self._descriptors = df.columns.tolist() if df is not None else []
    if joining_cols:
        self._descriptors = [
            col for col in self._descriptors if col not in joining_cols
        ]
    self.suffix = suffix

@property
def requiredProps(self) -> list[str]:
    """Return the required properties for the dataframe.

    Returns:
        list[str]: the parent class requirements plus any joining columns,
            deduplicated while preserving first-seen order.
    """
    prior = super().requiredProps
    new = prior + self._cols if self._cols is not None else prior
    # dict.fromkeys removes duplicates deterministically (insertion order),
    # unlike list(set(...)) whose ordering is arbitrary between runs
    return list(dict.fromkeys(new))

def getDF(self):
"""Return the dataframe of descriptors."""
Expand All @@ -195,6 +246,10 @@ def getIndex(self):
"""Return the index of the dataframe."""
return self._df.index if self._df is not None else None

def getIndexCols(self):
    """Return the index columns of the dataframe (``None`` if no dataframe is set)."""
    if self._df is None:
        return None
    return self._cols

def getDescriptors(
self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
) -> np.ndarray:
Expand All @@ -210,11 +265,25 @@ def getDescriptors(
Returns:
numpy array of descriptor values of shape (n_mols, n_descriptors)
"""
index = pd.Index(props[self.idProp], name=self.idProp)
if self._df is None:
raise ValueError("No dataframe set.")
ret = pd.DataFrame(index=index)
ret = ret.merge(self._df, how="left", left_index=True, right_index=True)
# create a return data frame with the desired columns as index
index_cols = self.getIndexCols()
if index_cols:
ret = pd.DataFrame(
# fetch the join columns from our required props
{col: props[col] for col in index_cols}
)
ret = self.setIndex(ret, index_cols) # set our multi-index
ret.drop(columns=index_cols, inplace=True) # only keep the index
else:
ret = pd.DataFrame(index=pd.Index(props[self.idProp], name=self.idProp))
ret = ret.join(
# join in our descriptors
# each molecule gets the correct descriptors from the data frame
self._df,
how="left",
on=index_cols,
)
# ret is in the same order as the input mols, so we can just return the values
return ret[self.descriptors].values

@property
Expand All @@ -226,7 +295,7 @@ def descriptors(self, value):
self._descriptors = value

def __str__(self):
    """Return the set's display name, prefixed with the suffix when one was given."""
    if self.suffix:
        return f"{self.suffix}_DataFrame"
    return "DataFrame"


class DrugExPhyschem(DescriptorSet):
Expand Down
85 changes: 75 additions & 10 deletions qsprpred/data/processing/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

import numpy as np
import pandas as pd
from mlchemad.applicability_domains import KNNApplicabilityDomain
from parameterized import parameterized
from rdkit.Chem import Mol
from mlchemad.applicability_domains import KNNApplicabilityDomain
from sklearn.preprocessing import StandardScaler

from .mol_processor import MolProcessor
Expand All @@ -30,6 +30,7 @@ class TestDataFilters(DataSetsPathMixIn, QSPRTestCase):
The tests here should be used to check for all their specific parameters and
edge cases."""

def setUp(self):
super().setUp()
self.setUpPaths()
Expand Down Expand Up @@ -102,10 +103,17 @@ def testConsistency(self):


class TestFeatureFilters(PathMixIn, QSPRTestCase):
"""Tests to check if the feature filters work on their own."""
"""Tests to check if the feature filters work on their own.
Note: This also tests the `DataFrameDescriptorSet`,
as it is used to add test descriptors.
"""

def setUp(self):
"""Set up the small test Dataframe."""
super().setUp()
self.nCPU = 2 # just to test parallel processing
self.chunkSize = 2
self.setUpPaths()
descriptors = [
"Descriptor_F1",
Expand Down Expand Up @@ -133,39 +141,94 @@ def setUp(self):
)
self.dataset = QSPRDataset(
"TestFeatureFilters",
target_props=[{
"name": "y",
"task": TargetTasks.REGRESSION
}],
target_props=[{"name": "y", "task": TargetTasks.REGRESSION}],
df=self.df,
store_dir=self.generatedPath,
n_jobs=self.nCPU,
chunk_size=self.chunkSize,
)
self.df_descriptors.index = self.dataset.df.index
self.df_descriptors["QSPRID"] = self.dataset.getProperty(
self.dataset.idProp
).values
self.df_descriptors.set_index("QSPRID", inplace=True, drop=True)
self.dataset.addDescriptors([DataFrameDescriptorSet(self.df_descriptors)])
self.descriptors = self.dataset.featureNames

def testLowVarianceFilter(self):
def recalculateWithMultiIndex(self):
    """Drop the attached descriptors and re-add them joined on a
    two-column multi-index derived from the molecule IDs."""
    self.dataset.dropDescriptors(self.dataset.descriptorSets)
    ids = self.dataset.getProperty(self.dataset.idProp)
    # split each "<prefix>_<number>" ID into two separate joining columns
    id_cols = ["ID_COL1", "ID_COL2"]
    for part, col in enumerate(id_cols):
        values = [mol_id.split("_")[part] for mol_id in ids]
        self.df_descriptors[col] = values
        self.dataset.addProperty(col, self.df_descriptors[col].values)
    self.dataset.addDescriptors(
        [
            DataFrameDescriptorSet(
                self.df_descriptors,
                id_cols,
            )
        ]
    )

# def testDefaultDescriptorAdd(self):
# """Test adding without index columns."""
# # TODO: issue 88 needs to be solved for this to work
# self.dataset.nJobs = 1
# df_new = self.dataset.getFeatures(concat=True).copy()
# calc = DataFrameDescriptorSet(df_new, suffix="new_df_desc")
# self.dataset.addDescriptors([calc])

@parameterized.expand(
    [
        (True,),
        (False,),
    ]
)
def testLowVarianceFilter(self, use_index_cols):
    """Test the low variance filter, which drops features with a variance below
    a threshold."""
    if use_index_cols:
        self.recalculateWithMultiIndex()
    self.dataset.filterFeatures([LowVarianceFilter(0.01)])
    # only the first descriptor should be dropped; values must stay original
    expected = self.descriptors[1:]
    for surviving in (self.dataset.featureNames, self.dataset.X.columns):
        self.assertListEqual(list(surviving), expected)

def testHighCorrelationFilter(self):
@parameterized.expand(
    [
        (True,),
        (False,),
    ]
)
def testHighCorrelationFilter(self, use_index_cols):
    """Test the high correlation filter, which drops features with a correlation
    above a threshold."""
    if use_index_cols:
        self.recalculateWithMultiIndex()
    self.dataset.filterFeatures([HighCorrelationFilter(0.8)])
    # the descriptor at position 2 is expected to be dropped by the filter
    del self.descriptors[2]
    for surviving in (self.dataset.featureNames, self.dataset.X.columns):
        self.assertListEqual(list(surviving), self.descriptors)

def testBorutaFilter(self):
@parameterized.expand(
[
(True,),
(False,),
]
)
def testBorutaFilter(self, use_index_cols):
"""Test the Boruta filter, which removes the features which are statistically as
relevant as random features."""
if use_index_cols:
self.recalculateWithMultiIndex()
self.dataset.filterFeatures([BorutaFilter()])
# check if correct columns selected and values still original
self.assertListEqual(list(self.dataset.featureNames), self.descriptors[-1:])
Expand All @@ -174,6 +237,7 @@ def testBorutaFilter(self):

class TestFeatureStandardizer(DataSetsPathMixIn, QSPRTestCase):
"""Test the feature standardizer."""

def setUp(self):
"""Create a small test dataset with MorganFP descriptors."""
super().setUp()
Expand Down Expand Up @@ -277,6 +341,7 @@ def testMolProcess(self, _, n_jobs, chunk_size, props, add_rdkit, args, kwargs):

class testApplicabilityDomain(DataSetsPathMixIn, QSPRTestCase):
"""Test the applicability domain."""

def setUp(self):
"""Create a small test dataset with MorganFP descriptors."""
super().setUp()
Expand Down
21 changes: 11 additions & 10 deletions qsprpred/data/tables/mol.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,25 +779,26 @@ def addDescriptors(
df_descriptors = pd.concat(df_descriptors, axis=0)
df_descriptors[self.indexCols] = None
df_descriptors.loc[self.df.index, self.indexCols] = self.df[self.indexCols]
self.attachDescriptors(calculator, df_descriptors, self.indexCols)
self.attachDescriptors(calculator, df_descriptors, [self.idProp])

def getDescriptors(self):
    """Get the calculated descriptors as a pandas data frame.

    Returns:
        pd.DataFrame: Data frame containing only descriptors, with one row
            per molecule in the same order as `self.df` and indexed by the
            molecule IDs (`self.idProp`).
    """
    # Removed the commented-out pre-refactor join logic that referenced the
    # no-longer-existing `join_cols` variable — dead code left from the diff.
    # Start from an empty frame indexed by molecule ID so every descriptor
    # table can be left-joined in without changing row order.
    ret = pd.DataFrame(index=pd.Index(self.df.index.values, name=self.idProp))
    for descriptors in self.descriptors:
        # each descriptor table is aligned to the same ID index, so a plain
        # index join attaches the correct values to each molecule
        df_descriptors = descriptors.getDescriptors()
        ret = ret.join(df_descriptors, how="left")
    return ret

def getDescriptorNames(self):
Expand Down

0 comments on commit 4d37019

Please sign in to comment.