From 4b139926e7d0d5c959d5743ab7bf0cbea4f3e499 Mon Sep 17 00:00:00 2001 From: "Francesco Patane, MSc" Date: Fri, 16 Sep 2022 21:23:30 +0200 Subject: [PATCH] Creato con Colaboratory --- ML_3.ipynb | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 ML_3.ipynb diff --git a/ML_3.ipynb b/ML_3.ipynb new file mode 100644 index 0000000..8e63f36 --- /dev/null +++ b/ML_3.ipynb @@ -0,0 +1,310 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyM1dCOzGN9zMcL1Nj+fE3Vk", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Unwanted substructures:\n", + "substructures can be reactive or toxic or they can interfere with certain assays. Filtering unwanted substructures can support assembling more efficient screening libraries, which can save time and resources.\n", + "\n", + "Examples of such unwanted features are nitro groups (mutagenic), sulfates and phosphates (likely resulting in unfavorable pharmacokinetic properties), 2-halopyridines and thiols (reactive). \n", + "\n", + "Pan Assay Interference Compounds (PAINS):\n", + "PAINS are compounds that often occur as hits in HTS even though they actually are false positives. PAINS show activity at numerous targets rather than one specific target." + ], + "metadata": { + "id": "pXWQZuaa19_j" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install rdkit" + ], + "metadata": { + "id": "5Ebl1uvs2Vzz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eYx9Zu8n0-5g" + }, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from tqdm.auto import tqdm\n", + "from rdkit import Chem\n", + "from rdkit.Chem import PandasTools\n", + "from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams" + ] + }, + { + "cell_type": "code", + "source": [ + "# load data from Talktorial T2\n", + "TNFB_data = pd.read_csv(\n", + " \"TNFB_compounds_lipinski.csv\",\n", + " index_col=0,\n", + ")\n", + "# Drop unnecessary information\n", + "print(\"Dataframe shape:\", TNFB_data.shape)\n", + "TNFB_data.drop(columns=[\"molecular_weight\", \"n_hbd\", \"n_hba\", \"logp\"], inplace=True)\n", + "TNFB_data.head()" + ], + "metadata": { + "id": "5wN1gfYL2fEr" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Add molecule column\n", + "PandasTools.AddMoleculeColumnToFrame(TNFB_data, smilesCol=\"smiles\")\n", + "# Draw first 3 molecules\n", + "Chem.Draw.MolsToGridImage(\n", + " list(TNFB_data.head(3).ROMol),\n", + " legends=list(TNFB_data.head(3).molecule_chembl_id),\n", + ")" + ], + "metadata": { + "id": "rzsqw1ha3Ha8" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Filter for PAINS" + ], + "metadata": { + "id": "u2dRkR2R3epc" + } + }, + { + "cell_type": "code", + "source": [ + "# initialize filter\n", + "params = FilterCatalogParams()\n", + "params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)\n", + "catalog = FilterCatalog(params)" + ], + "metadata": { + "id": "mtlZeOxU3hJj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# search for PAINS\n", + "matches = []\n", + "clean = []\n", + "for index, row in tqdm(TNFB_data.iterrows(), total=TNFB_data.shape[0]):\n", + " molecule = Chem.MolFromSmiles(row.smiles)\n", + " entry = catalog.GetFirstMatch(molecule) # Get the first matching PAINS\n", + " if entry is not None:\n", + " # store PAINS information\n", + " matches.append(\n", + " {\n", + " \"chembl_id\": row.molecule_chembl_id,\n", + " \"rdkit_molecule\": molecule,\n", + " \"pains\": entry.GetDescription().capitalize(),\n", + " }\n", + " )\n", + " else:\n", + " # collect indices of molecules without PAINS\n", + " clean.append(index)\n", + "\n", + "matches = pd.DataFrame(matches)\n", + "TNFB_data = TNFB_data.loc[clean] # keep molecules without PAINS" + ], + "metadata": { + "id": "X2kiOCOo3oe7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(f\"Number of compounds with PAINS: {len(matches)}\")\n", + "print(f\"Number of compounds without PAINS: {len(TNFB_data)}\")" + ], + "metadata": { + "id": "msNTp6h_3_g6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "Chem.Draw.MolsToGridImage(\n", + " list(matches.head(5).rdkit_molecule),\n", + " legends=list(matches.head(5)[\"pains\"]),\n", + ")" + ], + "metadata": { + "id": "zAsR5xdo4Hy7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Filter and highlight unwanted substructures" + ], + "metadata": { + "id": "hNbb8xcZ4ZCa" + } + }, + { + "cell_type": "code", + "source": [ + "substructures = pd.read_csv(\"unwantedSubstructures.csv\", sep=\"\\s+\")\n", + "substructures[\"rdkit_molecule\"] = substructures.smart.apply(Chem.MolFromSmarts)\n", + "print(\"Number of unwanted substructures in collection:\", len(substructures))" + ], + "metadata": { + "id": "Zl3Lpz_Q4b3K" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "Chem.Draw.MolsToGridImage(\n", + " mols=substructures.rdkit_molecule.tolist()[2:5],\n", + " \n", + ")" + ], + "metadata": { + "id": "8XY7inLq7j5L" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# search for unwanted substructure\n", + "matches = []\n", + "clean = []\n", + "for index, row in tqdm(TNFB_data.iterrows(), total=TNFB_data.shape[0]):\n", + " molecule = Chem.MolFromSmiles(row.smiles)\n", + " match = False\n", + " for _, substructure in substructures.iterrows():\n", + " if molecule.HasSubstructMatch(substructure.rdkit_molecule):\n", + " matches.append(\n", + " {\n", + " \"chembl_id\": row.molecule_chembl_id,\n", + " \"rdkit_molecule\": molecule,\n", + " \"substructure\": substructure.rdkit_molecule,\n", + " \n", + " }\n", + " )\n", + " match = True\n", + " if not match:\n", + " clean.append(index)\n", + "\n", + "matches = pd.DataFrame(matches)\n", + "TNFB_data = TNFB_data.loc[clean]" + ], + "metadata": { + "id": "pvqK_Sgq946c" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(f\"Number of found unwanted substructure: {len(matches)}\")\n", + "print(f\"Number of compounds without unwanted substructure: {len(TNFB_data)}\")" + ], + "metadata": { + "id": "0AP-wfa4-RxL" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "to_highlight = [\n", + " row.rdkit_molecule.GetSubstructMatch(row.substructure) for _, row in matches.head(3).iterrows()\n", + "]\n", + "Chem.Draw.MolsToGridImage(\n", + " list(matches.head(3).rdkit_molecule),\n", + " highlightAtomLists=to_highlight,\n", + " \n", + ")" + ], + "metadata": { + "id": "46RQc5Mg-cSS" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Substructure statistics" + ], + "metadata": { + "id": "vHbNd3uc-may" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "group_frequencies = groups.size()\n", + "group_frequencies.sort_values(ascending=False, inplace=True)\n", + "group_frequencies.head(10)" + ], + "metadata": { + "id": "XEZb6VBj-myq" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file