Merge branch 'enhancement/descriptors/api_change' into 'dev'

Simplify Descriptors and Parallel Processing APIs See merge request cdd/QSPRpred!159
CDDLeiden · Jan 18, 2024 · 8b5fa9b · 8b5fa9b
2 parents b90b17d + c4a6dc6
commit 8b5fa9b
Show file tree

Hide file tree

Showing 69 changed files with 4,336 additions and 6,436 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -17,6 +17,7 @@ From v2.1.1 to v3.0.0
 - Add missing implementation of `QSPRDataset.removeProperty`
 - Improved behavior of the Papyrus data source (does not attempt to connect to the
   internet if the data set already exists).
+- It is now possible to define new descriptor sets outside the package without errors.
 
 ## Changes
 
@@ -55,8 +56,17 @@ From v2.1.1 to v3.0.0
 - The default log level for the package was changed from `INFO` to `WARNING`. A new
   tutorial
   was added to explain how to change the log level.
-- `RepeatsFilter` argument `year_name` renamed to `time_col` and arugment `additional_cols` added.
+- `RepeatsFilter` argument `year_name` renamed to `time_col` and
+  argument `additional_cols` added.
 - The `perc` argument of `BorutaPy` can now be set from the CLI.
+- Descriptor calculators (previously used to aggregate and manage descriptor sets) were
+  completely removed from the API and descriptor sets can now be added directly to the
+  molecule tables.
+- The rdkit-like descriptor and fingerprint retrieval functions were removed from the
+  API because they complicated implementation of customized descriptors.
+- The `apply` method was simplified and a new API was clearly defined for parallel
+  processing of properties over data sets. To improve molecule processing,
+  a `processMols` method was added to `MoleculeTable`.
 
 ## New Features
 
@@ -95,6 +105,9 @@ From v2.1.1 to v3.0.0
 - It is now possible to save `PandasDataTable`s to a CSV file instead of the default
   pickle format (slower, but more human-readable).
 - New `RegressionPlot` class  `WilliamsPlot` added to plot Williams plots.
+- Data sets can now be optionally stored in the `csv` format and not just as a pickle
+  file. This makes it easier to debug and share data sets, but it is slower to load and
+  save.
 
 ## Removed Features
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,24 +10,24 @@ readme = { file = "README.md", content-type = "text/markdown" }
 requires-python = ">=3.10"
 license = { file = "LICENSE" }
 keywords = ["qsar", "machine learning", "cheminformatics"]
-authors = [{name = "Helle van den Maagdenberg", email = "[email protected]"},
-           {name = "Linde Schoenmaker", email = "[email protected]"},
-           {name = "Martin Sicho", email = "[email protected]"},
-           {name = "David Araripe", email = "[email protected]"},
-           {name = "Sohvi Luukkonen", email = "[email protected]"},
-           {name = "Olivier Béquignon", email = "[email protected]"},
-           {name = "Marina Gorostiola Gonzalez", email = "[email protected]"},
-           {name = "Remco van den Broek", email = "[email protected]"},
-           {name = "Gerard van Westen", email = "[email protected]"}]
-maintainers = [{name = "Helle van den Maagdenberg", email = "[email protected]"},
-               {name = "Linde Schoenmaker", email = "[email protected]"}, 
-               {name = "Martin Sicho", email = "[email protected]"},
-               {name = "David Araripe", email = "[email protected]"},
-               {name = "Sohvi Luukkonen", email = "[email protected]"},
-               {name = "Olivier Béquignon", email = "[email protected]"}, 
-               {name = "Marina Gorostiola Gonzalez", email = "[email protected]"},
-               {name = "Remco van den Broek", email = "[email protected]"},
-               {name = "Gerard van Westen", email = "[email protected]"}]
+authors = [{ name = "Helle van den Maagdenberg", email = "[email protected]" },
+    { name = "Linde Schoenmaker", email = "[email protected]" },
+    { name = "Martin Sicho", email = "[email protected]" },
+    { name = "David Araripe", email = "[email protected]" },
+    { name = "Sohvi Luukkonen", email = "[email protected]" },
+    { name = "Olivier Béquignon", email = "[email protected]" },
+    { name = "Marina Gorostiola Gonzalez", email = "[email protected]" },
+    { name = "Remco van den Broek", email = "[email protected]" },
+    { name = "Gerard van Westen", email = "[email protected]" }]
+maintainers = [{ name = "Helle van den Maagdenberg", email = "[email protected]" },
+    { name = "Linde Schoenmaker", email = "[email protected]" },
+    { name = "Martin Sicho", email = "[email protected]" },
+    { name = "David Araripe", email = "[email protected]" },
+    { name = "Sohvi Luukkonen", email = "[email protected]" },
+    { name = "Olivier Béquignon", email = "[email protected]" },
+    { name = "Marina Gorostiola Gonzalez", email = "[email protected]" },
+    { name = "Remco van den Broek", email = "[email protected]" },
+    { name = "Gerard van Westen", email = "[email protected]" }]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Operating System :: OS Independent", # We tested it on Mac and Windows as well?
@@ -39,6 +39,7 @@ classifiers = [
 ]
 dependencies = [
     "parameterized",
+    "pebble",
     "numpy >= 1.19, <1.24.0",
     "scikit-learn >= 1.0.2",
     "pandas >= 1.2.2",
@@ -59,8 +60,8 @@ dependencies = [
 [project.optional-dependencies]
 
 extra = [
-  "mold2-pywrapper @ git+https://github.com/OlivierBeq/Mold2_pywrapper.git@master",
-  "padel-pywrapper >= 1.0.2.post1", "Mordred", "biopython", "prodec", "Signature-pywrapper",
+    "mold2-pywrapper @ git+https://github.com/OlivierBeq/Mold2_pywrapper.git@master",
+    "padel-pywrapper >= 1.0.2.post1", "Mordred", "biopython", "prodec", "Signature-pywrapper",
 ]
 pyboost = ["py-boost"]
 deep = ["torch >= 1.7.0", "chemprop >= 1.6.0"]
@@ -95,105 +96,105 @@ src_paths = ["qsprpred"]
 known_first_party = 'qsprpred'
 
 [tool.ruff]
- line-length = 88
- target-version = "py39"
- fix = true
- fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
- unfixable = []
+line-length = 88
+target-version = "py39"
+fix = true
+fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
+unfixable = []
 
- # inspired by the configuration used in the pandas project
- select = [
-   # pyflakes
-   "F",
-   # pycodestyle
-   "E", "W",
-   # flake8-2020
-   "YTT",
-   # flake8-bugbear
-   "B",
-   # flake8-quotes
-   "Q",
-   # flake8-debugger
-   "T10",
-   # flake8-gettext
-   "INT",
-   # pylint
-   "PLC", "PLE", "PLR", "PLW",
-   # misc lints
-   "PIE",
-   # flake8-pyi
-   "PYI",
-   # tidy imports
-   "TID",
-   # implicit string concatenation
-   "ISC",
-   # type-checking imports
-   "TCH",
-   # comprehensions
-   "C4",
-   # pygrep-hooks
-   "PGH",
-   # Ruff-specific rules
-   "RUF",
- ]
+# inspired by the configuration used in the pandas project
+select = [
+    # pyflakes
+    "F",
+    # pycodestyle
+    "E", "W",
+    # flake8-2020
+    "YTT",
+    # flake8-bugbear
+    "B",
+    # flake8-quotes
+    "Q",
+    # flake8-debugger
+    "T10",
+    # flake8-gettext
+    "INT",
+    # pylint
+    "PLC", "PLE", "PLR", "PLW",
+    # misc lints
+    "PIE",
+    # flake8-pyi
+    "PYI",
+    # tidy imports
+    "TID",
+    # implicit string concatenation
+    "ISC",
+    # type-checking imports
+    "TCH",
+    # comprehensions
+    "C4",
+    # pygrep-hooks
+    "PGH",
+    # Ruff-specific rules
+    "RUF",
+]
 
- ignore = [
-   # line length - somes lines are just too long
-   "E501",
-   # we decided on having relative rather than absolute imports
-   "TID252",
-   # module level import not at top of file
-   "E402",
-   # do not assign a lambda expression, use a def
-   "E731",
-   # controversial
-   "B006",
-   # controversial
-   "B007",
-   # controversial
-   "B008",
-   # tests use assert False
-   "B011",
-   # tests use comparisons but not their returned value
-   "B015",
-   # false positives
-   "B019",
-   # Loop control variable overrides iterable it iterates
-   "B020",
-   # Function definition does not bind loop variable
-   "B023",
-   # Only works with python >=3.10
-   "B905",
-   # Too many arguments to function call
-   "PLR0913",
-   # Too many returns
-   "PLR0911",
-   # Too many branches
-   "PLR0912",
-   # Too many statements
-   "PLR0915",
-   # Redefined loop name
-   "PLW2901",
-   # Global statements are discouraged
-   "PLW0603",
-   # Docstrings should not be included in stubs
-   "PYI021",
-   # No builtin `eval()` allowed
-   "PGH001",
-   # compare-to-empty-string
-   "PLC1901",
-   # pairwise-over-zipped (>=PY310 only)
-   "RUF007",
-   # Within an except clause, raise exceptions with ...
-   "B904",
-   # Use "collections.abc.*" instead of "typing.*" (PEP 585 syntax)
-   # "PYI027",  # not yet implemented
-   # while int | float can be shortened to float, the former is more explicit
-   # "PYI041",  # not yet implemented
- ]
+ignore = [
+    # line length - some lines are just too long
+    "E501",
+    # we decided on having relative rather than absolute imports
+    "TID252",
+    # module level import not at top of file
+    "E402",
+    # do not assign a lambda expression, use a def
+    "E731",
+    # controversial
+    "B006",
+    # controversial
+    "B007",
+    # controversial
+    "B008",
+    # tests use assert False
+    "B011",
+    # tests use comparisons but not their returned value
+    "B015",
+    # false positives
+    "B019",
+    # Loop control variable overrides iterable it iterates
+    "B020",
+    # Function definition does not bind loop variable
+    "B023",
+    # Only works with python >=3.10
+    "B905",
+    # Too many arguments to function call
+    "PLR0913",
+    # Too many returns
+    "PLR0911",
+    # Too many branches
+    "PLR0912",
+    # Too many statements
+    "PLR0915",
+    # Redefined loop name
+    "PLW2901",
+    # Global statements are discouraged
+    "PLW0603",
+    # Docstrings should not be included in stubs
+    "PYI021",
+    # No builtin `eval()` allowed
+    "PGH001",
+    # compare-to-empty-string
+    "PLC1901",
+    # pairwise-over-zipped (>=PY310 only)
+    "RUF007",
+    # Within an except clause, raise exceptions with ...
+    "B904",
+    # Use "collections.abc.*" instead of "typing.*" (PEP 585 syntax)
+    # "PYI027",  # not yet implemented
+    # while int | float can be shortened to float, the former is more explicit
+    # "PYI041",  # not yet implemented
+]
 
 # Esclude a variety of commonly ignored dictionaries
- exclude = [
+exclude = [
     "docs/*.py",
     ".bzr",
     ".direnv",

diff --git a/qsprpred/benchmarks/replica.py b/qsprpred/benchmarks/replica.py
@@ -7,7 +7,6 @@
 
 from .settings.benchmark import DataPrepSettings
 from ..data import QSPRDataset
-from ..data.descriptors.calculators import MoleculeDescriptorsCalculator
 from ..data.descriptors.sets import DescriptorSet
 from ..data.sources.data_source import DataSource
 from ..logs import logger
@@ -148,10 +147,7 @@ def addDescriptors(self, reload: bool = False):
             logger.info(f"Data set {self.ds.name} not yet found. It will be created.")
             # calculate descriptors if necessary
             logger.info(f"Calculating descriptors for {self.ds.name}.")
-            desc_calculator = MoleculeDescriptorsCalculator(
-                desc_sets=deepcopy(self.descriptors)
-            )
-            self.ds.addDescriptors(desc_calculator, recalculate=True)
+            self.ds.addDescriptors(deepcopy(self.descriptors), recalculate=True)
             self.ds.setTargetProperties(deepcopy(self.targetProps))
             self.ds.setRandomState(self.randomSeed)
             self.ds.save()

diff --git a/qsprpred/benchmarks/tests.py b/qsprpred/benchmarks/tests.py
@@ -1,5 +1,3 @@
-from unittest import TestCase
-
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import KFold
@@ -15,6 +13,7 @@
 from ..models.assessment_methods import CrossValAssessor, TestSetAssessor
 from ..models.scikit_learn import SklearnModel
 from ..utils.stringops import get_random_string
+from ..utils.testing.base import QSPRTestCase
 from ..utils.testing.path_mixins import DataSetsPathMixIn
 
 
@@ -42,7 +41,7 @@ def getDataSet(
         return self.createLargeTestDataSet(name, target_props=target_props)
 
 
-class BenchmarkingTest(DataSetsPathMixIn, TestCase):
+class BenchmarkingTest(DataSetsPathMixIn, QSPRTestCase):
     """Test benchmarking functionality on the test data set.
 
     Attributes:
@@ -56,7 +55,7 @@ def setUp(self):
         super().setUp()
         self.setUpPaths()
         prep = self.getDefaultPrep()
-        descriptors = prep["feature_calculators"][0].descSets
+        descriptors = prep["feature_calculators"]
         del prep["feature_calculators"]
         descriptors.append(RDKitDescs())
         self.seed = 42