FIX: update scaler/pca file (#8)
* DEV: rollback to old rdkit

* FIX: update link to the pca file, add cell numbering

* FIX: update pca_fname
anmorgunov authored Jul 22, 2024
1 parent 4d7eb2e commit ecf19ec
Showing 4 changed files with 57 additions and 60 deletions.
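In practical terms, the change users will notice is that the notebook's sampling step now loads the v2 scaler/PCA weights file. A minimal sketch of the updated call, assembled from the diff below (it assumes the `config` object built earlier in the notebook via `Configuration.Config` in Cell 5):

# Sketch of the updated sampling configuration (Cell 11 in the diff below).
# Assumes `config` is the Configuration.Config object created in Cell 5.
config.set_sampling_parameters(
    n_clusters=10,
    samples_per_cluster=2,
    pca_fname="scaler_pca_combined_n120_v2.pkl",  # was: scaler_pca_combined_n120.pkl
)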
108 changes: 52 additions & 56 deletions ChemSpaceAL.ipynb
@@ -37,7 +37,7 @@
"outputs": [],
"source": [
"%%capture\n",
"#@title Set Up Notebook\n",
"#@title Set Up Notebook (Cell 1)\n",
"#@markdown Press the *Play* button to install ChemSpaceAL and its dependencies\n",
"\n",
"!rm -r ChemSpaceAL\n",
@@ -94,7 +94,7 @@
},
"outputs": [],
"source": [
"# @title Specify (base) path for storing results\n",
"# @title Specify (base) path for storing results (Cell 2)\n",
"# @markdown make sure your path ends with a \"/\"\n",
"base_path = \"/content/drive/MyDrive/ChemSpaceAL-runs/\" # @param {type:\"string\"}\n",
"\n",
@@ -139,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {
@@ -148,17 +148,9 @@
"id": "bm3jRgP-w0SX",
"outputId": "0dfaa0c2-0528-4202-f6eb-b09c85425993"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" will create folders at base_path='/Users/morgunov/batista/Summer/ChemSpaceAL/runs/'\n"
]
}
],
"outputs": [],
"source": [
"# @title create subfolders\n",
"# @title create subfolders (Cell 3)\n",
"# @markdown By default, the following folder structure will be created\n",
"if base_path is None:\n",
" base_path = os.getcwd() + \"/runs/\"\n",
@@ -200,7 +192,7 @@
}
],
"source": [
"#@title Download (if you want) dataset/weights\n",
"#@title Download (if you want) dataset/weights (Cell 4)\n",
"#@markdown note these files will be placed into appropriate folders created above\n",
"downloadDataset = True # @param {type:\"boolean\"}\n",
"downloadModelWeights = True # @param {type:\"boolean\"}\n",
@@ -219,7 +211,7 @@
" script += f\"curl -o {base_path}{f1} {remote_source}{f1}\\n\"\n",
" script += f\"curl -o {base_path}{f2} {remote_source}{f2}\\n\"\n",
"if downloadPCAweights:\n",
" f1 = \"3_Sampling/pca_weights/scaler_pca_combined_n120.pkl\"\n",
" f1 = \"3_Sampling/pca_weights/scaler_pca_combined_n120_v2.pkl\"\n",
" script += f\"curl -o {base_path}{f1} {remote_source}{f1}\\n\"\n",
"with open(\"fetch.bash\", \"w\") as f:\n",
" f.write(script)\n",
@@ -280,6 +272,7 @@
}
],
"source": [
"# Cell 5\n",
"config = Configuration.Config(\n",
" base_path=base_path,\n",
" cycle_prefix=\"model0\",\n",
@@ -310,20 +303,6 @@
"## Pretraining"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "gizQ7Vow-1Nd"
},
"outputs": [],
"source": [
"# mode can be set to \"Pretraining\" or \"Active Learning\".\n",
"# In \"Pretraining\", an output is a list of two dataset objects corrresponding to (train, valid) partitions\n",
"# In \"Active Learning\", an output is a single dataset object corresponding to an AL training set\n",
"datasets = Dataset.load_data(config=config, mode=\"Pretraining\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
@@ -350,11 +329,27 @@
}
],
"source": [
"# Cell 6\n",
"# You can also overwrite `learning_rate`, `lr_warmup` (a boolean of whether to do lr warmup),\n",
"# For a full list of available parameters run help(config.set_training_parameters)\n",
"config.set_training_parameters(mode=\"Pretraining\", epochs=10)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "gizQ7Vow-1Nd"
},
"outputs": [],
"source": [
"# Cell 7\n",
"# mode can be set to \"Pretraining\" or \"Active Learning\".\n",
"# In \"Pretraining\", an output is a list of two dataset objects corrresponding to (train, valid) partitions\n",
"# In \"Active Learning\", an output is a single dataset object corresponding to an AL training set\n",
"datasets = Dataset.load_data(config=config, mode=\"Pretraining\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
@@ -384,6 +379,7 @@
}
],
"source": [
"# Cell 8\n",
"# model objects and trainer objects are returned in case you want to do something with them\n",
"model, trainer = Training.train_GPT(\n",
" config=config, training_dataset=datasets[0], validation_dataset=datasets[1]\n",
@@ -445,10 +441,11 @@
}
],
"source": [
"# Cell 9\n",
"config.set_generation_parameters(\n",
" target_criterion=\"force_number_filtered\", # or you could choose `force_number_unique` or `force_number_completions`\n",
" force_filters=\"ADMET+FGs\", # could choose `ADMET` for no restriction on functional groups or simply remove this parameter\n",
" target_number=1,\n",
" target_number=100_000,\n",
")"
]
},
@@ -460,6 +457,7 @@
},
"outputs": [],
"source": [
"# Cell 10\n",
"Generation.generate_smiles(config) # this runs generation of SMILES\n",
"Generation.characterize_generated_molecules(config) # this runs an analysis of # unique, valid, and novel molecules"
]
@@ -501,10 +499,11 @@
}
],
"source": [
"# Cell 11\n",
"config.set_sampling_parameters(\n",
" n_clusters=10,\n",
" samples_per_cluster=2,\n",
" pca_fname=\"scaler_pca_combined_n120.pkl\",\n",
" pca_fname=\"scaler_pca_combined_n120_v2.pkl\",\n",
")"
]
},
@@ -516,7 +515,17 @@
},
"outputs": [],
"source": [
"Sampling.calculate_descriptors(config)\n",
"# Cell 12\n",
"Sampling.calculate_descriptors(config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cell 13\n",
"mols = Sampling.project_into_pca_space(config)\n",
"Sampling.cluster_and_sample(mols=mols, config=config, n_iter=1)"
]
@@ -540,7 +549,7 @@
"outputs": [],
"source": [
"%%capture\n",
"#@title Install Docking Software (DiffDock)\n",
"#@title Install Docking Software (DiffDock) (Cell 14)\n",
"#@markdown diffdock is pretty heavy and has a lot of dependencies, so we only install it when we need it (and we don't during pretraining, for example)\n",
"\n",
"import torch\n",
@@ -586,29 +595,7 @@
"import shutil\n",
"import os\n",
"import pandas as pd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oT-N3o145UdV",
"outputId": "23b1379e-9f8d-40fe-8084-6eada4d38480"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.1.0+cu121\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"from ChemSpaceAL.Docking import get_top_poses"
]
},
@@ -641,6 +628,7 @@
}
],
"source": [
"# Cell 15\n",
"config.set_scoring_parameters(\n",
" protein_path=\"HNH_processed.pdb\",\n",
")"
@@ -675,6 +663,7 @@
}
],
"source": [
"# Cell 16\n",
"get_top_poses(\n",
" ligands_csv=config.cycle_temp_params[\"path_to_sampled\"],\n",
" protein_pdb_path=config.cycle_temp_params[\"path_to_protein\"],\n",
@@ -711,6 +700,7 @@
}
],
"source": [
"# Cell 17\n",
"from ChemSpaceAL import Scoring\n",
"ligand_scores = Scoring.score_ligands(config)"
]
@@ -723,6 +713,7 @@
},
"outputs": [],
"source": [
"# Cell 18\n",
"Scoring.parse_and_prepare_diffdock_data(\n",
" ligand_scores=ligand_scores,\n",
" config=config\n",
@@ -764,6 +755,7 @@
}
],
"source": [
"# Cell 19\n",
"config.set_active_learning_parameters(\n",
" selection_mode=\"threshold\", probability_mode=\"linear\", threshold=11, training_size=10\n",
")"
@@ -777,6 +769,7 @@
},
"outputs": [],
"source": [
"# Cell 20\n",
"ALConstruction.construct_al_training_set(config=config, do_sampling=True)"
]
},
@@ -800,6 +793,7 @@
}
],
"source": [
"# Cell 21\n",
"al_ds = Dataset.load_data(config=config, mode=\"Active Learning\")"
]
},
@@ -830,6 +824,7 @@
}
],
"source": [
"# Cell 22\n",
"config.set_training_parameters(mode=\"Active Learning\", epochs=1)"
]
},
@@ -841,6 +836,7 @@
},
"outputs": [],
"source": [
"# Cell 23\n",
"model, trainer = Training.train_GPT(\n",
" config=config,\n",
" training_dataset=al_ds,\n",
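For reference, the download step (Cell 4) now fetches the v2 weights as well. The notebook builds a curl script for this; an equivalent pure-Python sketch is below. Note that `remote_source` is defined earlier in the notebook and does not appear in this diff, so the URL here is a hypothetical placeholder:

# Pure-Python equivalent of the curl line Cell 4 now emits for the PCA weights.
# `remote_source` is defined earlier in the notebook (not shown in this diff);
# the value below is a placeholder assumption, not the project's real endpoint.
import urllib.request

base_path = "/content/drive/MyDrive/ChemSpaceAL-runs/"  # default from Cell 2
remote_source = "https://example.com/ChemSpaceAL/"      # hypothetical placeholder
f1 = "3_Sampling/pca_weights/scaler_pca_combined_n120_v2.pkl"
urllib.request.urlretrieve(remote_source + f1, base_path + f1)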
1 change: 1 addition & 0 deletions README.md
@@ -6,6 +6,7 @@
[![codecov](https://codecov.io/gh/batistagroup/ChemSpaceAL/graph/badge.svg?token=ROJSISYJWC)](https://codecov.io/gh/batistagroup/ChemSpaceAL)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/batistagroup/ChemSpaceAL/blob/main/LICENSE)
[![image](https://img.shields.io/pypi/v/ChemSpaceAL.svg)](https://pypi.org/project/ChemSpaceAL/)
[![arXiv](https://img.shields.io/badge/arXiv-2309.05853.svg)](https://arxiv.org/abs/2309.05853)
<a target="_blank" href="https://colab.research.google.com/github/batistagroup/ChemSpaceAL/blob/main/ChemSpaceAL.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
prolif==2.0.1
pandas==1.5.3
numpy
rdkit
numpy==1.25.2
rdkit==2023.03.3
torch
PyYAML
scikit_learn
4 changes: 2 additions & 2 deletions setup.py
@@ -31,8 +31,8 @@
install_requires=[
"prolif==2.0.1",
"pandas==1.5.3",
"numpy",
"rdkit",
"numpy==1.25.2",
"rdkit==2023.03.3",
"torch",
"PyYAML",
"scikit_learn",
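Since requirements.txt and setup.py now pin numpy and rdkit (the "rollback to old rdkit" from the commit message), here is a small sketch for checking that an environment matches these pins. It uses the stdlib importlib.metadata plus packaging, whose Version comparison handles PEP 440 normalization (2023.03.3 vs 2023.3.3):

# Sketch: verify installed package versions against the pins from this commit.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

pins = {"prolif": "2.0.1", "pandas": "1.5.3", "numpy": "1.25.2", "rdkit": "2023.03.3"}
for pkg, pinned in pins.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed (pinned {pinned})")
        continue
    ok = Version(installed) == Version(pinned)
    print(f"{pkg}: installed {installed}, pinned {pinned} -> {'OK' if ok else 'MISMATCH'}")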
