Skip to content

Commit

Permalink
Merge branch 'enhancement/descriptors/api_change' into 'dev'
Browse files Browse the repository at this point in the history
Simplify Descriptors and Parallel Processing APIs

See merge request cdd/QSPRpred!159
  • Loading branch information
martin-sicho committed Jan 18, 2024
2 parents b90b17d + c4a6dc6 commit 8b5fa9b
Show file tree
Hide file tree
Showing 69 changed files with 4,336 additions and 6,436 deletions.
15 changes: 14 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ From v2.1.1 to v3.0.0
- Add missing implementation of `QSPRDataset.removeProperty`
- Improved behavior of the Papyrus data source (does not attempt to connect to the
internet if the data set already exists).
- It is now possible to define new descriptor sets outside the package without errors.

## Changes

Expand Down Expand Up @@ -55,8 +56,17 @@ From v2.1.1 to v3.0.0
- The default log level for the package was changed from `INFO` to `WARNING`. A new
tutorial
was added to explain how to change the log level.
- `RepeatsFilter` argument `year_name` renamed to `time_col` and argument `additional_cols` added.
- `RepeatsFilter` argument `year_name` renamed to `time_col` and
argument `additional_cols` added.
- The `perc` argument of `BorutaPy` can now be set from the CLI.
- Descriptor calculators (previously used to aggregate and manage descriptor sets) were
completely removed from the API and descriptor sets can now be added directly to the
molecule tables.
- The rdkit-like descriptor and fingerprint retrieval functions were removed from the
API because they complicated implementation of customized descriptors.
- The `apply` method was simplified and a new API was clearly defined for parallel
processing of properties over data sets. To improve molecule processing,
a `processMols` method was added to `MoleculeTable`.

## New Features

Expand Down Expand Up @@ -95,6 +105,9 @@ From v2.1.1 to v3.0.0
- It is now possible to save `PandasDataTable`s to a CSV file instead of the default
pickle format (slower, but more human-readable).
- New `RegressionPlot` class `WilliamsPlot` added to plot Williams plots.
- Data sets can now be optionally stored in the `csv` format and not just as a pickle
file. This makes it easier to debug and share data sets, but it is slower to load and
save.

## Removed Features

Expand Down
231 changes: 116 additions & 115 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,24 @@ readme = { file = "README.md", content-type = "text/markdown" }
requires-python = ">=3.10"
license = { file = "LICENSE" }
keywords = ["qsar", "machine learning", "cheminformatics"]
authors = [{name = "Helle van den Maagdenberg", email = "[email protected]"},
{name = "Linde Schoenmaker", email = "[email protected]"},
{name = "Martin Sicho", email = "[email protected]"},
{name = "David Araripe", email = "[email protected]"},
{name = "Sohvi Luukkonen", email = "[email protected]"},
{name = "Olivier Béquignon", email = "[email protected]"},
{name = "Marina Gorostiola Gonzalez", email = "[email protected]"},
{name = "Remco van den Broek", email = "[email protected]"},
{name = "Gerard van Westen", email = "[email protected]"}]
maintainers = [{name = "Helle van den Maagdenberg", email = "[email protected]"},
{name = "Linde Schoenmaker", email = "[email protected]"},
{name = "Martin Sicho", email = "[email protected]"},
{name = "David Araripe", email = "[email protected]"},
{name = "Sohvi Luukkonen", email = "[email protected]"},
{name = "Olivier Béquignon", email = "[email protected]"},
{name = "Marina Gorostiola Gonzalez", email = "[email protected]"},
{name = "Remco van den Broek", email = "[email protected]"},
{name = "Gerard van Westen", email = "[email protected]"}]
authors = [{ name = "Helle van den Maagdenberg", email = "[email protected]" },
{ name = "Linde Schoenmaker", email = "[email protected]" },
{ name = "Martin Sicho", email = "[email protected]" },
{ name = "David Araripe", email = "[email protected]" },
{ name = "Sohvi Luukkonen", email = "[email protected]" },
{ name = "Olivier Béquignon", email = "[email protected]" },
{ name = "Marina Gorostiola Gonzalez", email = "[email protected]" },
{ name = "Remco van den Broek", email = "[email protected]" },
{ name = "Gerard van Westen", email = "[email protected]" }]
maintainers = [{ name = "Helle van den Maagdenberg", email = "[email protected]" },
{ name = "Linde Schoenmaker", email = "[email protected]" },
{ name = "Martin Sicho", email = "[email protected]" },
{ name = "David Araripe", email = "[email protected]" },
{ name = "Sohvi Luukkonen", email = "[email protected]" },
{ name = "Olivier Béquignon", email = "[email protected]" },
{ name = "Marina Gorostiola Gonzalez", email = "[email protected]" },
{ name = "Remco van den Broek", email = "[email protected]" },
{ name = "Gerard van Westen", email = "[email protected]" }]
classifiers = [
"Development Status :: 4 - Beta",
"Operating System :: OS Independent", # We tested it on Mac and Windows as well?
Expand All @@ -39,6 +39,7 @@ classifiers = [
]
dependencies = [
"parameterized",
"pebble",
"numpy >= 1.19, <1.24.0",
"scikit-learn >= 1.0.2",
"pandas >= 1.2.2",
Expand All @@ -59,8 +60,8 @@ dependencies = [
[project.optional-dependencies]

extra = [
"mold2-pywrapper @ git+https://github.com/OlivierBeq/Mold2_pywrapper.git@master",
"padel-pywrapper >= 1.0.2.post1", "Mordred", "biopython", "prodec", "Signature-pywrapper",
"mold2-pywrapper @ git+https://github.com/OlivierBeq/Mold2_pywrapper.git@master",
"padel-pywrapper >= 1.0.2.post1", "Mordred", "biopython", "prodec", "Signature-pywrapper",
]
pyboost = ["py-boost"]
deep = ["torch >= 1.7.0", "chemprop >= 1.6.0"]
Expand Down Expand Up @@ -95,105 +96,105 @@ src_paths = ["qsprpred"]
known_first_party = 'qsprpred'

[tool.ruff]
line-length = 88
target-version = "py39"
fix = true
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
unfixable = []
line-length = 88
target-version = "py39"
fix = true
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
unfixable = []

# inspired by the configuration used in the pandas project
select = [
# pyflakes
"F",
# pycodestyle
"E", "W",
# flake8-2020
"YTT",
# flake8-bugbear
"B",
# flake8-quotes
"Q",
# flake8-debugger
"T10",
# flake8-gettext
"INT",
# pylint
"PLC", "PLE", "PLR", "PLW",
# misc lints
"PIE",
# flake8-pyi
"PYI",
# tidy imports
"TID",
# implicit string concatenation
"ISC",
# type-checking imports
"TCH",
# comprehensions
"C4",
# pygrep-hooks
"PGH",
# Ruff-specific rules
"RUF",
]
# inspired by the configuration used in the pandas project
select = [
# pyflakes
"F",
# pycodestyle
"E", "W",
# flake8-2020
"YTT",
# flake8-bugbear
"B",
# flake8-quotes
"Q",
# flake8-debugger
"T10",
# flake8-gettext
"INT",
# pylint
"PLC", "PLE", "PLR", "PLW",
# misc lints
"PIE",
# flake8-pyi
"PYI",
# tidy imports
"TID",
# implicit string concatenation
"ISC",
# type-checking imports
"TCH",
# comprehensions
"C4",
# pygrep-hooks
"PGH",
# Ruff-specific rules
"RUF",
]

ignore = [
# line length - some lines are just too long
"E501",
# we decided on having relative rather than absolute imports
"TID252",
# module level import not at top of file
"E402",
# do not assign a lambda expression, use a def
"E731",
# controversial
"B006",
# controversial
"B007",
# controversial
"B008",
# tests use assert False
"B011",
# tests use comparisons but not their returned value
"B015",
# false positives
"B019",
# Loop control variable overrides iterable it iterates
"B020",
# Function definition does not bind loop variable
"B023",
# Only works with python >=3.10
"B905",
# Too many arguments to function call
"PLR0913",
# Too many returns
"PLR0911",
# Too many branches
"PLR0912",
# Too many statements
"PLR0915",
# Redefined loop name
"PLW2901",
# Global statements are discouraged
"PLW0603",
# Docstrings should not be included in stubs
"PYI021",
# No builtin `eval()` allowed
"PGH001",
# compare-to-empty-string
"PLC1901",
# pairwise-over-zipped (>=PY310 only)
"RUF007",
# Within an except clause, raise exceptions with ...
"B904",
# Use "collections.abc.*" instead of "typing.*" (PEP 585 syntax)
# "PYI027", # not yet implemented
# while int | float can be shortened to float, the former is more explicit
# "PYI041", # not yet implemented
]
ignore = [
# line length - some lines are just too long
"E501",
# we decided on having relative rather than absolute imports
"TID252",
# module level import not at top of file
"E402",
# do not assign a lambda expression, use a def
"E731",
# controversial
"B006",
# controversial
"B007",
# controversial
"B008",
# tests use assert False
"B011",
# tests use comparisons but not their returned value
"B015",
# false positives
"B019",
# Loop control variable overrides iterable it iterates
"B020",
# Function definition does not bind loop variable
"B023",
# Only works with python >=3.10
"B905",
# Too many arguments to function call
"PLR0913",
# Too many returns
"PLR0911",
# Too many branches
"PLR0912",
# Too many statements
"PLR0915",
# Redefined loop name
"PLW2901",
# Global statements are discouraged
"PLW0603",
# Docstrings should not be included in stubs
"PYI021",
# No builtin `eval()` allowed
"PGH001",
# compare-to-empty-string
"PLC1901",
# pairwise-over-zipped (>=PY310 only)
"RUF007",
# Within an except clause, raise exceptions with ...
"B904",
# Use "collections.abc.*" instead of "typing.*" (PEP 585 syntax)
# "PYI027", # not yet implemented
# while int | float can be shortened to float, the former is more explicit
# "PYI041", # not yet implemented
]

# Exclude a variety of commonly ignored directories
exclude = [
exclude = [
"docs/*.py",
".bzr",
".direnv",
Expand Down
6 changes: 1 addition & 5 deletions qsprpred/benchmarks/replica.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from .settings.benchmark import DataPrepSettings
from ..data import QSPRDataset
from ..data.descriptors.calculators import MoleculeDescriptorsCalculator
from ..data.descriptors.sets import DescriptorSet
from ..data.sources.data_source import DataSource
from ..logs import logger
Expand Down Expand Up @@ -148,10 +147,7 @@ def addDescriptors(self, reload: bool = False):
logger.info(f"Data set {self.ds.name} not yet found. It will be created.")
# calculate descriptors if necessary
logger.info(f"Calculating descriptors for {self.ds.name}.")
desc_calculator = MoleculeDescriptorsCalculator(
desc_sets=deepcopy(self.descriptors)
)
self.ds.addDescriptors(desc_calculator, recalculate=True)
self.ds.addDescriptors(deepcopy(self.descriptors), recalculate=True)
self.ds.setTargetProperties(deepcopy(self.targetProps))
self.ds.setRandomState(self.randomSeed)
self.ds.save()
Expand Down
7 changes: 3 additions & 4 deletions qsprpred/benchmarks/tests.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from unittest import TestCase

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
Expand All @@ -15,6 +13,7 @@
from ..models.assessment_methods import CrossValAssessor, TestSetAssessor
from ..models.scikit_learn import SklearnModel
from ..utils.stringops import get_random_string
from ..utils.testing.base import QSPRTestCase
from ..utils.testing.path_mixins import DataSetsPathMixIn


Expand Down Expand Up @@ -42,7 +41,7 @@ def getDataSet(
return self.createLargeTestDataSet(name, target_props=target_props)


class BenchmarkingTest(DataSetsPathMixIn, TestCase):
class BenchmarkingTest(DataSetsPathMixIn, QSPRTestCase):
"""Test benchmarking functionality on the test data set.
Attributes:
Expand All @@ -56,7 +55,7 @@ def setUp(self):
super().setUp()
self.setUpPaths()
prep = self.getDefaultPrep()
descriptors = prep["feature_calculators"][0].descSets
descriptors = prep["feature_calculators"]
del prep["feature_calculators"]
descriptors.append(RDKitDescs())
self.seed = 42
Expand Down
Loading

0 comments on commit 8b5fa9b

Please sign in to comment.