FIX: update scaler/pca file (#8)
* DEV: rollback to old rdkit

* FIX: update link to the pca file, add cell numbering

* FIX: update pca_fname
anmorgunov authored Jul 22, 2024
1 parent 4d7eb2e commit ecf19ec
Showing 4 changed files with 57 additions and 60 deletions.
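In practical terms, the change users will notice is that the notebook's sampling step now loads the v2 scaler/PCA weights file. A minimal sketch of the updated call, assembled from the diff below (it assumes the `config` object built earlier in the notebook via `Configuration.Config` in Cell 5):

# Sketch of the updated sampling configuration (Cell 11 in the diff below).
# Assumes `config` is the Configuration.Config object created in Cell 5.
config.set_sampling_parameters(
    n_clusters=10,
    samples_per_cluster=2,
    pca_fname="scaler_pca_combined_n120_v2.pkl",  # was: scaler_pca_combined_n120.pkl
)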
108 changes: 52 additions & 56 deletions ChemSpaceAL.ipynb
@@ -37,7 +37,7 @@
"outputs": [],
"source": [
"%%capture\n",
"#@title Set Up Notebook\n",
"#@title Set Up Notebook (Cell 1)\n",
"#@markdown Press the *Play* button to install ChemSpaceAL and its dependencies\n",
"\n",
"!rm -r ChemSpaceAL\n",
@@ -94,7 +94,7 @@
},
"outputs": [],
"source": [
"# @title Specify (base) path for storing results\n",
"# @title Specify (base) path for storing results (Cell 2)\n",
"# @markdown make sure your path ends with a \"/\"\n",
"base_path = \"/content/drive/MyDrive/ChemSpaceAL-runs/\" # @param {type:\"string\"}\n",
"\n",
@@ -139,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"cellView": "form",
"colab": {
@@ -148,17 +148,9 @@
"id": "bm3jRgP-w0SX",
"outputId": "0dfaa0c2-0528-4202-f6eb-b09c85425993"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" will create folders at base_path='/Users/morgunov/batista/Summer/ChemSpaceAL/runs/'\n"
]
}
],
"outputs": [],
"source": [
"# @title create subfolders\n",
"# @title create subfolders (Cell 3)\n",
"# @markdown By default, the following folder structure will be created\n",
"if base_path is None:\n",
" base_path = os.getcwd() + \"/runs/\"\n",
@@ -200,7 +192,7 @@
}
],
"source": [
"#@title Download (if you want) dataset/weights\n",
"#@title Download (if you want) dataset/weights (Cell 4)\n",
"#@markdown note these files will be placed into appropriate folders created above\n",
"downloadDataset = True # @param {type:\"boolean\"}\n",
"downloadModelWeights = True # @param {type:\"boolean\"}\n",
@@ -219,7 +211,7 @@
" script += f\"curl -o {base_path}{f1} {remote_source}{f1}\\n\"\n",
" script += f\"curl -o {base_path}{f2} {remote_source}{f2}\\n\"\n",
"if downloadPCAweights:\n",
" f1 = \"3_Sampling/pca_weights/scaler_pca_combined_n120.pkl\"\n",
" f1 = \"3_Sampling/pca_weights/scaler_pca_combined_n120_v2.pkl\"\n",
" script += f\"curl -o {base_path}{f1} {remote_source}{f1}\\n\"\n",
"with open(\"fetch.bash\", \"w\") as f:\n",
" f.write(script)\n",
@@ -280,6 +272,7 @@
}
],
"source": [
"# Cell 5\n",
"config = Configuration.Config(\n",
" base_path=base_path,\n",
" cycle_prefix=\"model0\",\n",
@@ -310,20 +303,6 @@
"## Pretraining"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "gizQ7Vow-1Nd"
},
"outputs": [],
"source": [
"# mode can be set to \"Pretraining\" or \"Active Learning\".\n",
"# In \"Pretraining\", an output is a list of two dataset objects corrresponding to (train, valid) partitions\n",
"# In \"Active Learning\", an output is a single dataset object corresponding to an AL training set\n",
"datasets = Dataset.load_data(config=config, mode=\"Pretraining\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
@@ -350,11 +329,27 @@
}
],
"source": [
"# Cell 6\n",
"# You can also overwrite `learning_rate`, `lr_warmup` (a boolean of whether to do lr warmup),\n",
"# For a full list of available parameters run help(config.set_training_parameters)\n",
"config.set_training_parameters(mode=\"Pretraining\", epochs=10)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"id": "gizQ7Vow-1Nd"
},
"outputs": [],
"source": [
"# Cell 7\n",
"# mode can be set to \"Pretraining\" or \"Active Learning\".\n",
"# In \"Pretraining\", an output is a list of two dataset objects corrresponding to (train, valid) partitions\n",
"# In \"Active Learning\", an output is a single dataset object corresponding to an AL training set\n",
"datasets = Dataset.load_data(config=config, mode=\"Pretraining\")"
]
},
{
"cell_type": "code",
"execution_count": 21,
@@ -384,6 +379,7 @@
}
],
"source": [
"# Cell 8\n",
"# model objects and trainer objects are returned in case you want to do something with them\n",
"model, trainer = Training.train_GPT(\n",
" config=config, training_dataset=datasets[0], validation_dataset=datasets[1]\n",
@@ -445,10 +441,11 @@
}
],
"source": [
"# Cell 9\n",
"config.set_generation_parameters(\n",
" target_criterion=\"force_number_filtered\", # or you could choose `force_number_unique` or `force_number_completions`\n",
" force_filters=\"ADMET+FGs\", # could choose `ADMET` for no restriction on functional groups or simply remove this parameter\n",
" target_number=1,\n",
" target_number=100_000,\n",
")"
]
},
@@ -460,6 +457,7 @@
},
"outputs": [],
"source": [
"# Cell 10\n",
"Generation.generate_smiles(config) # this runs generation of SMILES\n",
"Generation.characterize_generated_molecules(config) # this runs an analysis of # unique, valid, and novel molecules"
]
@@ -501,10 +499,11 @@
}
],
"source": [
"# Cell 11\n",
"config.set_sampling_parameters(\n",
" n_clusters=10,\n",
" samples_per_cluster=2,\n",
" pca_fname=\"scaler_pca_combined_n120.pkl\",\n",
" pca_fname=\"scaler_pca_combined_n120_v2.pkl\",\n",
")"
]
},
@@ -516,7 +515,17 @@
},
"outputs": [],
"source": [
"Sampling.calculate_descriptors(config)\n",
"# Cell 12\n",
"Sampling.calculate_descriptors(config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Cell 13\n",
"mols = Sampling.project_into_pca_space(config)\n",
"Sampling.cluster_and_sample(mols=mols, config=config, n_iter=1)"
]
@@ -540,7 +549,7 @@
"outputs": [],
"source": [
"%%capture\n",
"#@title Install Docking Software (DiffDock)\n",
"#@title Install Docking Software (DiffDock) (Cell 14)\n",
"#@markdown diffdock is pretty heavy and has a lot of dependencies, so we only install it when we need it (and we don't during pretraining, for example)\n",
"\n",
"import torch\n",
@@ -586,29 +595,7 @@
"import shutil\n",
"import os\n",
"import pandas as pd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oT-N3o145UdV",
"outputId": "23b1379e-9f8d-40fe-8084-6eada4d38480"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2.1.0+cu121\n"
]
}
],
"source": [
"from tqdm import tqdm\n",
"from ChemSpaceAL.Docking import get_top_poses"
]
},
@@ -641,6 +628,7 @@
}
],
"source": [
"# Cell 15\n",
"config.set_scoring_parameters(\n",
" protein_path=\"HNH_processed.pdb\",\n",
")"
@@ -675,6 +663,7 @@
}
],
"source": [
"# Cell 16\n",
"get_top_poses(\n",
" ligands_csv=config.cycle_temp_params[\"path_to_sampled\"],\n",
" protein_pdb_path=config.cycle_temp_params[\"path_to_protein\"],\n",
@@ -711,6 +700,7 @@
}
],
"source": [
"# Cell 17\n",
"from ChemSpaceAL import Scoring\n",
"ligand_scores = Scoring.score_ligands(config)"
]
@@ -723,6 +713,7 @@
},
"outputs": [],
"source": [
"# Cell 18\n",
"Scoring.parse_and_prepare_diffdock_data(\n",
" ligand_scores=ligand_scores,\n",
" config=config\n",
@@ -764,6 +755,7 @@
}
],
"source": [
"# Cell 19\n",
"config.set_active_learning_parameters(\n",
" selection_mode=\"threshold\", probability_mode=\"linear\", threshold=11, training_size=10\n",
")"
@@ -777,6 +769,7 @@
},
"outputs": [],
"source": [
"# Cell 20\n",
"ALConstruction.construct_al_training_set(config=config, do_sampling=True)"
]
},
@@ -800,6 +793,7 @@
}
],
"source": [
"# Cell 21\n",
"al_ds = Dataset.load_data(config=config, mode=\"Active Learning\")"
]
},
@@ -830,6 +824,7 @@
}
],
"source": [
"# Cell 22\n",
"config.set_training_parameters(mode=\"Active Learning\", epochs=1)"
]
},
@@ -841,6 +836,7 @@
},
"outputs": [],
"source": [
"# Cell 23\n",
"model, trainer = Training.train_GPT(\n",
" config=config,\n",
" training_dataset=al_ds,\n",
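For reference, the download step (Cell 4) now fetches the v2 weights as well. The notebook builds a curl script for this; an equivalent pure-Python sketch is below. Note that `remote_source` is defined earlier in the notebook and does not appear in this diff, so the URL here is a hypothetical placeholder:

# Pure-Python equivalent of the curl line Cell 4 now emits for the PCA weights.
# `remote_source` is defined earlier in the notebook (not shown in this diff);
# the value below is a placeholder assumption, not the project's real endpoint.
import urllib.request

base_path = "/content/drive/MyDrive/ChemSpaceAL-runs/"  # default from Cell 2
remote_source = "https://example.com/ChemSpaceAL/"      # hypothetical placeholder
f1 = "3_Sampling/pca_weights/scaler_pca_combined_n120_v2.pkl"
urllib.request.urlretrieve(remote_source + f1, base_path + f1)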
1 change: 1 addition & 0 deletions README.md
@@ -6,6 +6,7 @@
[![codecov](https://codecov.io/gh/batistagroup/ChemSpaceAL/graph/badge.svg?token=ROJSISYJWC)](https://codecov.io/gh/batistagroup/ChemSpaceAL)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/batistagroup/ChemSpaceAL/blob/main/LICENSE)
[![image](https://img.shields.io/pypi/v/ChemSpaceAL.svg)](https://pypi.org/project/ChemSpaceAL/)
[![arXiv](https://img.shields.io/badge/arXiv-2309.05853.svg)](https://arxiv.org/abs/2309.05853)
<a target="_blank" href="https://colab.research.google.com/github/batistagroup/ChemSpaceAL/blob/main/ChemSpaceAL.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
prolif==2.0.1
pandas==1.5.3
numpy
rdkit
numpy==1.25.2
rdkit==2023.03.3
torch
PyYAML
scikit_learn
4 changes: 2 additions & 2 deletions setup.py
@@ -31,8 +31,8 @@
install_requires=[
"prolif==2.0.1",
"pandas==1.5.3",
"numpy",
"rdkit",
"numpy==1.25.2",
"rdkit==2023.03.3",
"torch",
"PyYAML",
"scikit_learn",
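Since requirements.txt and setup.py now pin numpy and rdkit (the "rollback to old rdkit" from the commit message), here is a small sketch for checking that an environment matches these pins. It uses the stdlib importlib.metadata plus packaging, whose Version comparison handles PEP 440 normalization (2023.03.3 vs 2023.3.3):

# Sketch: verify installed package versions against the pins from this commit.
from importlib.metadata import PackageNotFoundError, version
from packaging.version import Version

pins = {"prolif": "2.0.1", "pandas": "1.5.3", "numpy": "1.25.2", "rdkit": "2023.03.3"}
for pkg, pinned in pins.items():
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f"{pkg}: not installed (pinned {pinned})")
        continue
    ok = Version(installed) == Version(pinned)
    print(f"{pkg}: installed {installed}, pinned {pinned} -> {'OK' if ok else 'MISMATCH'}")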
