diff --git a/qsprpred/data/chem/clustering.py b/qsprpred/data/chem/clustering.py index 1d850f5f..cad3c663 100644 --- a/qsprpred/data/chem/clustering.py +++ b/qsprpred/data/chem/clustering.py @@ -6,7 +6,7 @@ from rdkit import Chem, DataStructs from rdkit.SimDivFilters import rdSimDivPickers -from .scaffolds import Murcko, Scaffold +from .scaffolds import BemisMurckoRDKit, Scaffold from .. import MoleculeTable from ..descriptors.fingerprints import Fingerprint, MorganFP from ...logs import logger @@ -89,7 +89,7 @@ class ScaffoldClusters(MoleculeClusters): scaffold (Scaffold): scaffold generator """ - def __init__(self, scaffold: Scaffold = Murcko()): + def __init__(self, scaffold: Scaffold = BemisMurckoRDKit()): super().__init__() self.scaffold = scaffold diff --git a/qsprpred/data/chem/scaffolds.py b/qsprpred/data/chem/scaffolds.py index 60b60ce8..e27b0533 100644 --- a/qsprpred/data/chem/scaffolds.py +++ b/qsprpred/data/chem/scaffolds.py @@ -33,20 +33,14 @@ def supportsParallel(self) -> bool: return True -class Murcko(Scaffold): - """Class for calculating Murcko scaffolds of a given molecule.""" +class BemisMurckoRDKit(Scaffold): + """Class for calculating Murcko scaffolds of a given molecule + using the default implementation in RDKit. If you want an implementation + closer to the original paper, see the `BemisMurcko` class. - def __call__(self, mols, props, *args, **kwargs): - """ - Calculate the Murcko scaffold for a molecule as implemented - in RDKit. - - Args: - mol: SMILES as `str` or an instance of `Mol` + """ - Returns: - SMILES of the Murcko scaffold as `str` - """ + def __call__(self, mols, props, *args, **kwargs): res = [] for mol in mols: mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol @@ -55,7 +49,7 @@ def __call__(self, mols, props, *args, **kwargs): return pd.Series(res, index=props[self.idProp]) def __str__(self): - return "Murcko" + return "BemisMurckoRDKit" class BemisMurcko(Scaffold): @@ -114,15 +108,6 @@ def findTerminalAtoms(mol): return res def __call__(self, mols, props, *args, **kwargs): - """ - Calculate the Bemis-Murcko scaffold for a molecule. - - Args: - mol: SMILES as `str` or an instance of `Mol` - - Returns: - SMILES of the Bemis-Murcko scaffold as `str` - """ res = [] for mol in mols: mol = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol diff --git a/qsprpred/data/chem/tests.py b/qsprpred/data/chem/tests.py index b2ecb887..29d59e7f 100644 --- a/qsprpred/data/chem/tests.py +++ b/qsprpred/data/chem/tests.py @@ -3,7 +3,7 @@ from ...
import TargetTasks from ...data import QSPRDataset -from ...data.chem.scaffolds import Murcko, BemisMurcko +from ...data.chem.scaffolds import BemisMurckoRDKit, BemisMurcko from ...utils.testing.base import QSPRTestCase from ...utils.testing.path_mixins import DataSetsPathMixIn @@ -19,7 +19,7 @@ def setUp(self): @parameterized.expand( [ - ("Murcko", Murcko()), + ("Murcko", BemisMurckoRDKit()), ("BemisMurcko", BemisMurcko()), ("BemisMurckoCSK", BemisMurcko(True, True)), ("BemisMurckoJustCSK", BemisMurcko(False, True)), diff --git a/qsprpred/data/sampling/splits.py b/qsprpred/data/sampling/splits.py index ebdaaa75..ed2cbdc2 100644 --- a/qsprpred/data/sampling/splits.py +++ b/qsprpred/data/sampling/splits.py @@ -18,7 +18,7 @@ RandomClusters, ScaffoldClusters, ) -from ...data.chem.scaffolds import Murcko, Scaffold +from ...data.chem.scaffolds import BemisMurckoRDKit, Scaffold from ...data.tables.base import MoleculeDataTable, DataSetDependant from ...data.tables.qspr import QSPRDataset from ...logs import logger @@ -476,7 +476,7 @@ class ScaffoldSplit(GBMTDataSplit): def __init__( self, dataset: QSPRDataset | None = None, - scaffold: Scaffold = Murcko(), + scaffold: Scaffold = BemisMurckoRDKit(), test_fraction: float = 0.1, n_folds: int = 1, custom_test_list: list | None = None, @@ -552,10 +552,10 @@ def setSeed(self, seed: int | None): self.seed = seed if hasattr(self.clustering, "seed"): self.clustering.seed = seed - + def getSeed(self): """Get the seed for this instance. - + Returns: int: the seed for this instance or None if no seed is set. """ diff --git a/qsprpred/data/sampling/tests.py b/qsprpred/data/sampling/tests.py index dc3b30fa..2a04a905 100644 --- a/qsprpred/data/sampling/tests.py +++ b/qsprpred/data/sampling/tests.py @@ -16,7 +16,7 @@ FPSimilarityLeaderPickerClusters, FPSimilarityMaxMinClusters, ) -from ...data.chem.scaffolds import Murcko, BemisMurcko +from ...data.chem.scaffolds import BemisMurckoRDKit, BemisMurcko from ...data.sampling.folds import FoldsFromDataSplit from ...data.sampling.splits import ManualSplit from ...utils.testing.base import QSPRTestCase @@ -109,13 +109,13 @@ def testTemporalSplit(self, multitask): @parameterized.expand( [ - (False, Murcko(), None), + (False, BemisMurckoRDKit(), None), ( False, BemisMurcko(use_csk=True), ["ScaffoldSplit_000", "ScaffoldSplit_001"], ), - (True, Murcko(), None), + (True, BemisMurckoRDKit(), None), ] ) def testScaffoldSplit(self, multitask, scaffold, custom_test_list): diff --git a/qsprpred/data_CLI.py b/qsprpred/data_CLI.py index 9de10a89..effb7d0d 100644 --- a/qsprpred/data_CLI.py +++ b/qsprpred/data_CLI.py @@ -15,6 +15,10 @@ from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler +from qsprpred.data.chem.clustering import ( + FPSimilarityMaxMinClusters, + FPSimilarityLeaderPickerClusters, +) from qsprpred.data.descriptors.fingerprints import ( MorganFP, RDKitMACCSFP, @@ -24,10 +28,6 @@ RDKitFP, AvalonFP, ) -from qsprpred.data.chem.clustering import ( - FPSimilarityMaxMinClusters, - FPSimilarityLeaderPickerClusters -) from qsprpred.data.descriptors.sets import ( DrugExPhyschem, PredictorDesc, @@ -49,7 +49,7 @@ ) from qsprpred.data.tables.qspr import QSPRDataset from qsprpred.tasks import TargetTasks -from .data.chem.scaffolds import Murcko +from .data.chem.scaffolds import BemisMurckoRDKit from .extra.gpu.models.dnn import DNNModel from .logs.utils import backup_files, enable_file_logger from .models.scikit_learn import SklearnModel @@ -363,7 +363,7 @@ def QSPR_dataprep(args): else None, 
"imputer": SimpleImputer(strategy=args.imputation[prop]) if prop in args.imputation - else None + else None, } ) dataset_name = ( @@ -391,7 +391,7 @@ def QSPR_dataprep(args): if args.split == "scaffold": split = ScaffoldSplit( test_fraction=args.split_fraction, - scaffold=Murcko(), + scaffold=BemisMurckoRDKit(), dataset=mydataset, ) elif args.split == "time": @@ -525,12 +525,15 @@ def QSPR_dataprep(args): os.makedirs(args.output_dir) # get a list of all the folders in the output directory - folders = [f for f in os.listdir(args.output_dir) if os.path.isdir(f"{args.output_dir}/{f}")] + folders = [ + f + for f in os.listdir(args.output_dir) + if os.path.isdir(f"{args.output_dir}/{f}") + ] # remove folders that start with backup folders = [f for f in folders if not f.startswith("backup")] - if not args.skip_backup: backup_msg = backup_files( args.output_dir, diff --git a/tutorials/advanced/data/parallelization.ipynb b/tutorials/advanced/data/parallelization.ipynb index 29aa965e..1d10eb5d 100644 --- a/tutorials/advanced/data/parallelization.ipynb +++ b/tutorials/advanced/data/parallelization.ipynb @@ -4,10 +4,7 @@ "cell_type": "markdown", "id": "9fedcee856268b35", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "# Code Parallelization\n", @@ -77,10 +74,7 @@ "cell_type": "markdown", "id": "8f9ffda3a4b8202f", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "## Setting `nJobs` and `chunkSize`\n", @@ -117,10 +111,7 @@ "end_time": "2024-01-16T16:30:51.361058064Z", "start_time": "2024-01-16T16:30:47.131517756Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -161,10 +152,7 @@ "cell_type": "markdown", "id": "9357f12c0516b989", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This calculation is done on one CPU by default:" @@ -179,10 +167,7 @@ "end_time": "2024-01-16T16:30:51.368391209Z", "start_time": "2024-01-16T16:30:51.363595085Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -204,10 +189,7 @@ "cell_type": "markdown", "id": "e7e51a9829413df0", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "and the whole data set supplied as one chunk:" @@ -222,10 +204,7 @@ "end_time": "2024-01-16T16:30:51.372183338Z", "start_time": "2024-01-16T16:30:51.367032511Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -247,10 +226,7 @@ "cell_type": "markdown", "id": "d28c75dc19273bed", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can now try running this calculation in parallel on 2 CPUs:" @@ -265,10 +241,7 @@ "end_time": "2024-01-16T16:30:51.379969255Z", "start_time": "2024-01-16T16:30:51.375227876Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [], "source": [ @@ -279,10 +252,7 @@ "cell_type": "markdown", "id": "6bc6ee9045cc5f12", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "The chunk size will automatically be adjusted to 25% of the data set size so that each portion of the data set is processed on a separate CPU:" @@ -297,10 +267,7 @@ "end_time": 
"2024-01-16T16:30:51.411732902Z", "start_time": "2024-01-16T16:30:51.378238063Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -322,10 +289,7 @@ "cell_type": "markdown", "id": "2e21998b62ee78bf", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can see how this affects the time taken to run the calculation:" @@ -340,10 +304,7 @@ "end_time": "2024-01-16T16:30:53.084658845Z", "start_time": "2024-01-16T16:30:51.383586975Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -364,10 +325,7 @@ "cell_type": "markdown", "id": "bc5243c149010a23", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This was faster, but not by a factor of 4. This is because there is some overhead associated with parallelization and the calculation of fingerprints is very fast by itself so the overhead affects our runtime more. In such cases, be careful about setting the chunk size manually:" @@ -382,10 +340,7 @@ "end_time": "2024-01-16T16:31:10.073558913Z", "start_time": "2024-01-16T16:30:53.083216365Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -407,10 +362,7 @@ "cell_type": "markdown", "id": "c9fdc32aa83072e6", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "This was slower than even the single CPU calculation!" @@ -420,10 +372,7 @@ "cell_type": "markdown", "id": "7c2367dd655da9c8", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "## Custom Operations\n", @@ -440,10 +389,7 @@ "end_time": "2024-01-16T16:31:10.082418114Z", "start_time": "2024-01-16T16:31:10.077838705Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -474,10 +420,7 @@ "cell_type": "markdown", "id": "3ada92396624b990", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "As you can see, this gives us a generator object. In order to run the function on each chunk and get the results, we need to iterate over the generator and collect results:" @@ -492,10 +435,7 @@ "end_time": "2024-01-16T16:31:10.175831497Z", "start_time": "2024-01-16T16:31:10.081098696Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -539,10 +479,7 @@ "cell_type": "markdown", "id": "a5f2d451e08ec155", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "The results in this case are just four `None` values since our function doesn't return anything:" @@ -557,10 +494,7 @@ "end_time": "2024-01-16T16:31:10.223479222Z", "start_time": "2024-01-16T16:31:10.180906772Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -582,10 +516,7 @@ "cell_type": "markdown", "id": "84a590acb0626ee9", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "We can also instruct the `apply` method to pass a `DataFrame` instead of a dictionary of properties to the function. 
This is useful if you want to use the `pandas.DataFrame` API to process the data:" @@ -600,10 +531,7 @@ "end_time": "2024-01-16T16:31:10.254595551Z", "start_time": "2024-01-16T16:31:10.227714969Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -633,10 +561,7 @@ "cell_type": "markdown", "id": "a14646b3cc04daee", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "**WARNING:** The `apply` method does not guarantee that the results will be returned in the same order as the chunks were processed. This is because the chunks are processed in parallel and the order depends on the order in which the parallel processes finish." @@ -646,10 +571,7 @@ "cell_type": "markdown", "id": "39fcfa580de331", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "### Molecule Processors\n", @@ -666,10 +588,7 @@ "end_time": "2024-01-16T16:31:10.307074944Z", "start_time": "2024-01-16T16:31:10.228216373Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -734,10 +653,7 @@ "cell_type": "markdown", "id": "d4a679c7ec23c64a", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "With `processMols`, we can also automatically convert the molecules to RDKit molecules before passing them to the processor:" @@ -752,10 +668,7 @@ "end_time": "2024-01-16T16:31:10.955175012Z", "start_time": "2024-01-16T16:31:10.278782050Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { @@ -803,10 +716,7 @@ "cell_type": "markdown", "id": "4927b7b9fe7bfa4c", "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "source": [ "You can also derive from `MolProcessorWithID` if you want to access the molecule IDs provided by the data set in your processor. This is useful to overcome the issue that the order in which chunks are processed is not guaranteed:" @@ -821,10 +731,7 @@ "end_time": "2024-01-16T16:31:12.843689806Z", "start_time": "2024-01-16T16:31:10.956455648Z" }, - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } + "collapsed": false }, "outputs": [ { diff --git a/tutorials/basics/data/data_splitting.ipynb b/tutorials/basics/data/data_splitting.ipynb index d73a2820..a3e66187 100644 --- a/tutorials/basics/data/data_splitting.ipynb +++ b/tutorials/basics/data/data_splitting.ipynb @@ -1299,10 +1299,10 @@ } ], "source": [ - "from qsprpred.data.chem.scaffolds import Murcko\n", + "from qsprpred.data.chem.scaffolds import BemisMurckoRDKit\n", "from qsprpred.data import ScaffoldSplit\n", "\n", - "split = ScaffoldSplit(n_folds=10, scaffold=Murcko())\n", + "split = ScaffoldSplit(n_folds=10, scaffold=BemisMurckoRDKit())\n", "for fold, (X_train, X_test, y_train, y_test, train_index, test_index) in enumerate(\n", " dataset.iterFolds(split)):\n", " print_cv_split(fold, X_train, X_test, y_train, y_test, train_index, test_index)" @@ -1805,7 +1805,7 @@ "source": [ "CrossValAssessor(\n", " scoring=\"roc_auc\",\n", - " split=ScaffoldSplit(n_folds=10, scaffold=Murcko())\n", + " split=ScaffoldSplit(n_folds=10, scaffold=BemisMurckoRDKit())\n", ")(model, dataset)" ] },
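Reviewer note on the rename: `BemisMurckoRDKit` is the former `Murcko` class unchanged apart from its name and docstring, and per the new docstring it uses RDKit's default Murcko scaffold implementation, while `BemisMurcko` stays closer to the original Bemis-Murcko paper. The sketch below shows the stock RDKit routine that docstring refers to (presumably what the class applies per molecule); the example SMILES and the direct call are illustrative only and not part of this change.

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold  # RDKit's default Murcko scaffold routine

# Arbitrary example molecule (aspirin); any valid SMILES would do here.
mol = Chem.MolFromSmiles("CC(=O)Oc1ccccc1C(=O)O")

# Strip side chains, keep ring systems and linkers (the default RDKit
# behaviour the BemisMurckoRDKit docstring points to).
scaffold = MurckoScaffold.GetScaffoldForMol(mol)
print(Chem.MolToSmiles(scaffold))  # prints the benzene core: c1ccccc1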