From 724f8139018c4a6831ebc4dc1c873915d6ea5655 Mon Sep 17 00:00:00 2001 From: Brandon Duane Walker Date: Thu, 9 May 2024 14:52:19 -0400 Subject: [PATCH] generate-conformers --- .../.bumpversion.cfg | 29 +++ .../.gitattributes | 5 + .../CHANGELOG.md | 5 + .../Dockerfile | 23 +++ .../README.md | 22 ++ .../pdbbind-generate-conformers-tool/VERSION | 1 + .../build-docker.sh | 4 + .../environment.yml | 10 + .../pdbbind-generate-conformers-tool/ict.yml | 123 ++++++++++++ .../ligand_0.sdf | 3 + .../pdbbind_generate_conformers_0@1@0.cwl | 188 ++++++++++++++++++ .../pyproject.toml | 33 +++ .../pdbbind_generate_conformers/__init__.py | 7 + .../pdbbind_generate_conformers/__main__.py | 90 +++++++++ .../pdbbind_generate_conformers.py | 163 +++++++++++++++ .../tests/__init__.py | 1 + .../tests/ncats_target_based_curated.xlsx | 3 + .../tests/test_pdbbind_generate_conformers.py | 65 ++++++ 18 files changed, 775 insertions(+) create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/environment.yml create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf create mode 100644 
utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__init__.py create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__main__.py create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/pdbbind_generate_conformers.py create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/__init__.py create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/ncats_target_based_curated.xlsx create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/test_pdbbind_generate_conformers.py diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg new file mode 100644 index 00000000..def09b3b --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.0 +commit = False +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
+serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:plugin.json] + +[bumpversion:file:src/polus/mm/utils/pdbbind_generate_conformers/__init__.py] diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes new file mode 100644 index 00000000..07fedc1e --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes @@ -0,0 +1,5 @@ +*.pdb filter=lfs diff=lfs merge=lfs -text +*.pdbqt filter=lfs diff=lfs merge=lfs -text +*.mol2 filter=lfs diff=lfs merge=lfs -text +*.xlsx filter=lfs diff=lfs merge=lfs -text +*.sdf filter=lfs diff=lfs merge=lfs -text diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md new file mode 100644 index 00000000..b67793f7 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md @@ -0,0 +1,5 @@ +# CHANGELOG + +## 0.1.0 + +Initial release. diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile new file mode 100644 index 00000000..79ccaab7 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile @@ -0,0 +1,23 @@ +# docker build -f Dockerfile -t polusai/pdbbind-generate-conformers-tool . 
+FROM condaforge/mambaforge + +ENV EXEC_DIR="/opt/executables" +ENV POLUS_LOG="INFO" +RUN mkdir -p ${EXEC_DIR} + +# Work directory defined in the base container +# WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY CHANGELOG.md ${EXEC_DIR} + +# Install needed packages here + +COPY src ${EXEC_DIR}/src +ADD Dockerfile . + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +CMD ["--help"] diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md new file mode 100644 index 00000000..c0d20c55 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md @@ -0,0 +1,22 @@ +# pdbbind_generate_conformers (0.1.0) + +Download the PDBbind refined database and generate conformers from SMILES + +## Options + +This plugin takes 9 input arguments and 3 output arguments: + +| Name | Description | I/O | Type | Default | +|---------------|-------------------------|--------|--------|---------| +| input_excel_path | Path to the input xlsx file | Input | File | File | +| query | query str to search the dataset, Type: string, File type: input, Accepted formats: txt | Input | string | string | +| output_txt_path | Path to the text dataset file, Type: string, File type: output, Accepted formats: txt | Input | string | string | +| output_sdf_path | Path to the input file, Type: string, File type: input, Accepted formats: sdf | Input | string | string | +| min_row | The row min index, Type: int | Input | int | int | +| max_row | The row max index, Type: int | Input | int | int | +| smiles_column | The name of the smiles column, Type: string, File type: output, Accepted formats: txt | Input | string | string | +| binding_data_column | The name of the binding data column, Type: string, File type: output, Accepted formats: txt | Input | string | string | +| convert_Kd_dG | If this is set to true, dG will be calculated | Input |
boolean | boolean | +| output_txt_path | Path to the txt file | Output | File | File | +| output_sdf_path | Path to the input file, Type: string, File type: input, Accepted formats: sdf | Output | File[] | File[] | +| experimental_dGs | Experimental Free Energies of Binding | Output | float[] | float[] | diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh new file mode 100644 index 00000000..15e1d9f2 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(=3.9 + - pandas==2.2.2 + - rdkit==2024.03.2 + - openpyxl==3.1.2 + - xorg-libxrender==0.9.11 + - pytest==8.2.0 diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml new file mode 100644 index 00000000..beca1deb --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml @@ -0,0 +1,123 @@ +specVersion: "0.1.0" +name: pdbbind_generate_conformers +version: 0.1.0 +container: generate-conformers-tool +entrypoint: +title: pdbbind_generate_conformers +description: Download the PDBbind refined database and generate conformers from SMILES +author: Brandon Walker, Nazanin Donyapour +contact: brandon.walker@axleinfo.com, nazanin.donyapour@nih.gov +repository: +documentation: +citation: + +inputs: + - name: input_excel_path + required: true + description: + type: File + format: + uri: edam:format_3620 + - name: query + required: true + description: query str to 
search the dataset, Type string, File type input, Accepted formats txt + type: string + format: + uri: edam:format_2330 + - name: output_txt_path + required: true + description: Path to the text dataset file, Type string, File type output, Accepted formats txt + type: string + defaultValue: system.log + format: + uri: edam:format_2330 + - name: output_sdf_path + required: true + description: Path to the input file, Type string, File type input, Accepted formats sdf + type: string + format: + uri: edam:format_3814 + - name: min_row + required: true + description: The row min inex, Type int + type: int + format: + uri: edam:format_2330 + - name: max_row + required: true + description: The row max inex, Type int + type: int + format: + uri: edam:format_2330 + - name: smiles_column + required: true + description: The name of the smiles column, Type string, File type output, Accepted formats txt + type: string + format: + uri: edam:format_2330 + - name: binding_data_column + required: true + description: The name of the binding data column, Type string, File type output, Accepted formats txt + type: string + format: + uri: edam:format_2330 + - name: convert_Kd_dG + required: true + description: If this is set to true, dG will be calculated + type: boolean + format: + uri: edam:format_2330 +outputs: + - name: output_txt_path + required: true + description: Path to the txt file + type: File + format: + uri: edam:format_2330 + - name: output_sdf_path + required: true + description: Path to the input file, Type string, File type input, Accepted formats sdf + type: File[] + format: + uri: edam:format_3814 + - name: experimental_dGs + required: true + description: Experimental Free Energies of Binding + type: float[] +ui: + - key: inputs.input_excel_path + title: "input_excel_path: " + description: "" + type: File + - key: inputs.query + title: "query: " + description: "query str to search the dataset, Type string, File type input, Accepted formats txt" + type: string + - 
key: inputs.output_txt_path + title: "output_txt_path: " + description: "Path to the text dataset file, Type string, File type output, Accepted formats txt" + type: string + - key: inputs.output_sdf_path + title: "output_sdf_path: " + description: "Path to the input file, Type string, File type input, Accepted formats sdf" + type: string + - key: inputs.min_row + title: "min_row: " + description: "The row min inex, Type int" + type: int + - key: inputs.max_row + title: "max_row: " + description: "The row max inex, Type int" + type: int + - key: inputs.smiles_column + title: "smiles_column: " + description: "The name of the smiles column, Type string, File type output, Accepted formats txt" + type: string + - key: inputs.binding_data_column + title: "binding_data_column: " + description: "The name of the binding data column, Type string, File type output, Accepted formats txt" + type: string + - key: inputs.convert_Kd_dG + title: "convert_Kd_dG: " + description: "If this is set to true, dG will be calculated" + type: checkbox diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf new file mode 100644 index 00000000..4b06a341 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbfdb1517bf21136e47fcee2247396d0ecc53b2e7c90a9462ebe0d92934f38a2 +size 2427 diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl new file mode 100644 index 00000000..4497a3a6 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl @@ -0,0 +1,188 @@ +#!/usr/bin/env cwl-runner +cwlVersion: v1.0 + +class: CommandLineTool + +label: 
Download the PDBbind refined database and generate conformers from SMILES + +doc: |- + Download the PDBbind refined database and generate conformers from SMILES + +baseCommand: ["python", "-m", "polus.mm.utils.pdbbind_generate_conformers"] + +hints: + DockerRequirement: + dockerPull: polusai/pdbbind-generate-conformers-tool@sha256:f688a0e8fa1a2e909c62501d4039e984d75dd43dcdc21f80d4d102a04e0b02d7 + +requirements: + InlineJavascriptRequirement: {} + +inputs: + input_excel_path: + label: Path to the input xlsx file + type: File + format: edam:format_3620 + inputBinding: + prefix: --input_excel_path + + query: + label: query str to search the dataset + doc: |- + query str to search the dataset + Type: string + File type: input + Accepted formats: txt + type: string + format: edam:format_2330 + inputBinding: + prefix: --query + + output_txt_path: + label: Path to the text dataset file + doc: |- + Path to the text dataset file + Type: string + File type: output + Accepted formats: txt + type: string + format: edam:format_2330 + inputBinding: + prefix: --output_txt_path + default: system.log + + output_sdf_path: + label: Path to the input file + doc: |- + Path to the input file + Type: string + File type: input + Accepted formats: sdf + type: string? + format: edam:format_3814 # sdf + + min_row: + label: The row min index + doc: |- + The row min inex + Type: int + type: int? + format: edam:format_2330 + inputBinding: + prefix: --min_row + + max_row: + label: The row max index + doc: |- + The row max inex + Type: int + type: int? 
+ format: edam:format_2330 + inputBinding: + prefix: --max_row + + smiles_column: + label: The name of the smiles column + doc: |- + The name of the smiles column + Type: string + File type: output + Accepted formats: txt + type: string + format: edam:format_2330 + inputBinding: + prefix: --smiles_column + + binding_data_column: + label: The name of the binding data column + doc: |- + The name of the binding data column + Type: string + File type: output + Accepted formats: txt + type: string + format: edam:format_2330 + inputBinding: + prefix: --binding_data_column + + convert_kd_dg: + label: If this is set to true, dG will be calculated + doc: If this is set to true, dG will be calculated + type: boolean + format: edam:format_2330 + inputBinding: + prefix: --convert_kd_dg + default: False + +outputs: + output_txt_path: + label: Path to the txt file + doc: |- + Path to the txt file + type: File + outputBinding: + glob: $(inputs.output_txt_path) + format: edam:format_2330 + + output_sdf_path: + label: Path to the input file + doc: |- + Path to the input file + Type: string + File type: input + Accepted formats: sdf + type: File[] + outputBinding: + # NOTE: Do NOT just use glob: ./*.sdf !!! This will return an array sorted by filenames. + # We want the order of output_sdf_paths to match the order of experimental_dGs, etc + # Because we need to compare experimental ΔGs with predicted values. + glob: $(inputs.output_txt_path) + loadContents: true + outputEval: | + ${ + var lines = self[0].contents.split("\n"); + var sdfs = []; + for (var idx = 0; idx < lines.length; idx++) { + var words = lines[idx].split(" "); + var sdffile = {"class": "File", "path": "ligand_" + idx + ".sdf"}; + sdfs.push(sdffile); + } + + return sdfs; + } + format: edam:format_3814 + + experimental_dGs: + label: Experimental Free Energies of Binding + doc: |- + Experimental Free Energies of Binding + type: float[] + outputBinding: + # NOTE: Do NOT just use $(inputs.output_txt_path) !!! 
This will return an array sorted by filenames. + # We want the order of output_sdf_paths to match the order of experimental_dGs, etc + # Because we need to compare experimental ΔGs with predicted values. + glob: $(inputs.output_txt_path) + loadContents: true + outputEval: | + ${ + var lines = self[0].contents.split("\n"); + var experimental_dGs = []; + for (var i = 0; i < lines.length; i++) { + var words = lines[i].split(" "); + if (words.length > 2) { + var experimental_dG = parseFloat(words[2]); + experimental_dGs.push(experimental_dG); + } + } + + if (experimental_dGs.length == 0) { + throw new Error("Error! Experimental dGs are empty!"); + } else { + return experimental_dGs; + } + } + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml new file mode 100644 index 00000000..2bb307ab --- /dev/null +++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml @@ -0,0 +1,33 @@ +[tool.poetry] +name = "polus-mm-utils-generate-conformers" +version = "0.1.0" +description = "Download the PDBbind refined database" +authors = ["Data Scientist "] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7.0" +sophios = "0.1.1" +openpyxl = "3.1.5" +pandas = "2.2.2" +rdkit = "2024.3.3" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pytest = "^7.4" +pytest-sugar = "^0.9.6" +pre-commit = "^3.2.1" +black = "^23.3.0" +mypy = "^1.1.1" +ruff = "^0.0.270" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = [ + "." 
"""Package entrypoint for the pdbbind_generate_conformers package."""

# Base packages
import logging
from os import environ
from pathlib import Path

import typer
from polus.mm.utils.pdbbind_generate_conformers.pdbbind_generate_conformers import (
    pdbbind_generate_conformers,
)

logging.basicConfig(
    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)
# The log level is taken from the POLUS_LOG environment variable (default INFO).
POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO"))
logger = logging.getLogger("polus.mm.utils.pdbbind_generate_conformers.")
logger.setLevel(POLUS_LOG)

app = typer.Typer(help="pdbbind_generate_conformers.")


# NOTE: min_row, max_row and convert_kd_dg carry defaults (1, -1, False) that
# mirror the defaults of load_data().  The CWL wrapper declares the row bounds
# as optional and passes the boolean prefix only when it is true, so requiring
# these options made such invocations fail with "Missing option".
@app.command()
def main(  # noqa: PLR0913
    input_excel_path: Path = typer.Option(
        ...,
        "--input_excel_path",
        help="Path to the input xlsx file",
    ),
    query: str = typer.Option(
        ...,
        "--query",
        help="query str to search the dataset",
    ),
    output_txt_path: str = typer.Option(
        ...,
        "--output_txt_path",
        help="Path to the text dataset file",
    ),
    min_row: int = typer.Option(
        1,
        "--min_row",
        help="The row min index, Type int",
    ),
    max_row: int = typer.Option(
        -1,
        "--max_row",
        help="The row max index, Type int",
    ),
    smiles_column: str = typer.Option(
        ...,
        "--smiles_column",
        help="The name of the smiles column",
    ),
    binding_data_column: str = typer.Option(
        ...,
        "--binding_data_column",
        help="The name of the binding data column",
    ),
    convert_kd_dg: bool = typer.Option(
        False,
        "--convert_kd_dg",
        help="If this is set to true, dG will be calculated",
    ),
) -> None:
    """pdbbind_generate_conformers."""
    # Echo every argument so a failed run can be reproduced from the log.
    logger.info("input_excel_path: %s", input_excel_path)
    logger.info("query: %s", query)
    logger.info("output_txt_path: %s", output_txt_path)
    logger.info("min_row: %s", min_row)
    logger.info("max_row: %s", max_row)
    logger.info("smiles_column: %s", smiles_column)
    logger.info("binding_data_column: %s", binding_data_column)
    logger.info("convert_kd_dg: %s", convert_kd_dg)

    pdbbind_generate_conformers(
        input_excel_path=input_excel_path,
        query=query,
        output_txt_path=output_txt_path,
        min_row=min_row,
        max_row=max_row,
        smiles_column=smiles_column,
        binding_data_column=binding_data_column,
        convert_kd_dg=convert_kd_dg,
    )


if __name__ == "__main__":
    app()
"""Generate conformers for a dataset of ligands and binding data."""
import math
from pathlib import Path

import pandas


def pdbbind_generate_conformers(  # noqa: PLR0913
    input_excel_path: Path,
    query: str,
    output_txt_path: str,
    min_row: int,
    max_row: int,
    smiles_column: str,
    binding_data_column: str,
    convert_kd_dg: bool,
) -> None:
    """Run the conformer-generation pipeline; thin wrapper around load_data.

    Args:
        input_excel_path: Path to the input xlsx file
        query: query str to search the dataset
        output_txt_path: Path to the text dataset file
        min_row: The row min index
        max_row: The row max index
        smiles_column: The name of the smiles column
        binding_data_column: The name of the binding data column
        convert_kd_dg: If this is set to true, dG will be calculated
    Returns:
        None
    """
    # load_data takes its parameters in a different order, so pass by keyword.
    load_data(
        input_excel_path=input_excel_path,
        query=query,
        smiles_column=smiles_column,
        binding_data_column=binding_data_column,
        output_txt_path=output_txt_path,
        min_row=min_row,
        max_row=max_row,
        convert_kd_dg=convert_kd_dg,
    )


def calculate_dg(kd: float) -> float:
    """Calculate the binding free energy from kd.

    See https://en.wikipedia.org/wiki/Binding_constant.

    Args:
        kd (float): The binding affinity of the protein-ligand complex,
            in mol / L.

    Returns:
        float: The binding free energy in kcal / mol.

    Raises:
        ValueError: If kd is not strictly positive (its logarithm is
            undefined).
    """
    if kd <= 0:
        msg = f"kd must be strictly positive to compute dG, got {kd!r}"
        raise ValueError(msg)

    ideal_gas_constant = 8.31446261815324  # J/(Mol*K)
    joules_per_kcal = 4184  # dividing R by this converts J to kcal
    # NOTE: Unfortunately, the temperature at which experimental kd binding
    # data was taken is often not recorded. Thus, we are forced to guess. The
    # two standard guesses are physiological body temperature (310K) or room
    # temperature (298K).
    temperature = 298
    rt = (ideal_gas_constant / joules_per_kcal) * temperature
    # NOTE: For performance, simulations are often done in a very small unit
    # cell, and thus at a very high concentration. The size of the unit cell
    # bounds the volume. For shorter simulations where the ligand has not
    # explored the entire box, it may be less. See the Yank paper for a method
    # of calculating the correct volumes.
    standard_concentration = 1  # Units of mol / L, but see comment above.
    return rt * math.log(kd / standard_concentration)


def _slice_rows(
    df: pandas.DataFrame,
    min_row: int,
    max_row: int,
) -> pandas.DataFrame:
    """Return rows min_row..max_row (1-based, inclusive); max_row -1 = to end."""
    start = int(min_row) - 1  # convert to a zero-based lower bound
    # -1 is the "no upper limit" sentinel.  Using it directly as a slice stop
    # would silently drop the last row whenever min_row != 1.
    stop = None if int(max_row) == -1 else int(max_row)
    return df.iloc[start:stop]


def _write_conformer_sdf(smiles: str, filename: str) -> None:
    """Embed and MMFF-optimise a 3D conformer for smiles; write it as SDF."""
    # Imported lazily so the rest of this module (e.g. calculate_dg) can be
    # used without the rdkit native extension being importable.
    from rdkit import Chem
    from rdkit.Chem import AllChem

    # See https://www.rdkit.org/docs/GettingStartedInPython.html#working-with-3d-molecules
    mol_2d = Chem.MolFromSmiles(smiles)
    if mol_2d is None:  # RDKit signals an unparsable SMILES by returning None
        msg = f"RDKit could not parse SMILES {smiles!r}"
        raise ValueError(msg)
    AllChem.Compute2DCoords(mol_2d)

    # rdkit.Chem.rdmolops.AddHs
    # NOTE: "Much of the code assumes that Hs are not included in the
    # molecular topology, so be very careful with the molecule that comes back
    # from this function."
    mol_3d = Chem.AddHs(mol_2d)
    # EmbedMolecule returns the new conformer id, or -1 when embedding failed.
    if AllChem.EmbedMolecule(mol_3d) < 0:
        msg = f"RDKit could not embed a 3D conformer for SMILES {smiles!r}"
        raise ValueError(msg)
    AllChem.MMFFOptimizeMolecule(mol_3d)

    writer = Chem.SDWriter(filename)
    try:
        writer.write(mol_3d)
    finally:
        writer.close()  # flush and release the file even if write() raises


def load_data(  # noqa: PLR0913
    input_excel_path: Path,
    query: str,
    smiles_column: str,
    binding_data_column: str,
    output_txt_path: str,
    min_row: int = 1,
    max_row: int = -1,
    convert_kd_dg: bool = False,
) -> None:
    """Read SMILES and binding data from an Excel sheet and write conformers.

    The second worksheet of the spreadsheet is filtered with a Pandas query,
    optionally restricted to a row range, and reduced to the SMILES and
    binding-data columns (rows with missing values are dropped).  For every
    remaining row a 3D conformer is written to ``ligand_<idx>.sdf`` in the
    current working directory, and one line
    ``<smiles> <binding datum> [<dG>]`` is written to ``output_txt_path``.

    Args:
        input_excel_path: (Path): Path to the input xlsx file
        query (str): The Query to perform
        smiles_column (str): The name of smiles column
        binding_data_column (str): The name of the binding data column
        output_txt_path (str): The output text file
        min_row (int): min index of rows (1-based, inclusive). Defaults to 1.
        max_row (int): max index of rows (1-based, inclusive); -1 means no
            upper limit. Defaults to -1.
        convert_kd_dg (bool): If this set to True,
            The dG will be calculated. Defaults to False.

    Raises:
        ValueError: If a SMILES string cannot be parsed or embedded in 3D, or
            if convert_kd_dg is set and a binding datum is not positive.
    """
    df = pandas.read_excel(str(input_excel_path), sheet_name=1)  # Requires openpyxl
    # Values observed in the dataset columns:
    # Standard Type ['IC50', 'EC90', 'Inhibition', 'Kd', 'EC50', 'Activity', 'Ki']
    # Standard Relation [nan, "<'", "<='", ">'", "='", ">='"]
    # Standard Units [nan, 'uM', '%']

    df = df.query(query)

    # Perform row slicing (if any)
    df = _slice_rows(df, min_row, max_row)

    # Now restrict to the columns we actually care about.
    df = df[[smiles_column, binding_data_column]].dropna()

    # NOTE(review): every binding datum is assumed to be in uM; the units
    # column is not consulted here - confirm that the query guarantees this.
    micromolar = 0.000001  # uM

    smiles_binding_data: list[str] = []
    for idx, (smiles, binding_datum) in enumerate(df.values):
        binding_datum = binding_datum * micromolar

        if convert_kd_dg:
            dg = calculate_dg(binding_datum)
            smiles_binding_data.append(f"{smiles} {binding_datum} {dg}")
        else:
            smiles_binding_data.append(f"{smiles} {binding_datum}")

        # The running index (not an identifier from the sheet) names the file:
        # chemblid is NOT unique!
        _write_conformer_sdf(smiles, f"ligand_{idx}.sdf")

    # Deliberately no trailing newline: the CWL wrapper derives the list of
    # ligand_<idx>.sdf outputs from the number of lines in this file.
    with Path(output_txt_path).open(mode="w", encoding="utf-8") as f:
        f.write("\n".join(smiles_binding_data))
"""Tests for pdbbind_generate_conformers."""
from pathlib import Path

from polus.mm.utils.pdbbind_generate_conformers.pdbbind_generate_conformers import (
    pdbbind_generate_conformers,
)
from sophios.api.pythonapi import Step
from sophios.api.pythonapi import Workflow

# Resolve fixtures relative to this file so the tests do not depend on the
# directory pytest happens to be started from.
TESTS_DIR = Path(__file__).resolve().parent
EXCEL_PATH = TESTS_DIR / "ncats_target_based_curated.xlsx"
CWL_PATH = TESTS_DIR.parent / "pdbbind_generate_conformers_0@1@0.cwl"
QUERY = "`Standard Type` == 'Kd' and `duplicate-type-classifier` == 'unique'"


def test_pdbbind_generate_conformers(tmp_path: Path, monkeypatch) -> None:
    """Test pdbbind_generate_conformers."""
    # The tool writes ligand_<idx>.sdf and the text file into the current
    # working directory; run inside pytest's tmp_path so that no artefacts
    # are left behind in the source tree.
    monkeypatch.chdir(tmp_path)

    pdbbind_generate_conformers(
        input_excel_path=EXCEL_PATH,
        query=QUERY,
        output_txt_path="binding_data.txt",
        min_row=1,
        max_row=1,
        smiles_column="SMILES",
        binding_data_column="Standard Value",
        convert_kd_dg=True,
    )

    assert Path("binding_data.txt").exists()
    # Rows 1..1 select a single ligand, so exactly this conformer is expected.
    assert Path("ligand_0.sdf").exists()


def test_pdbbind_generate_conformers_cwl() -> None:
    """Test pdbbind_generate_conformers CWL."""
    step = Step(clt_path=CWL_PATH)
    step.input_excel_path = str(EXCEL_PATH)
    step.query = QUERY
    step.smiles_column = "SMILES"
    step.binding_data_column = "Standard Value"
    step.convert_kd_dg = True
    step.min_row = 1
    step.max_row = 1
    step.output_txt_path = "system.log"

    workflow = Workflow([step], "pdbbind_generate_conformers")
    workflow.run()

    outdir = Path("outdir")
    files = list(outdir.rglob("ligand_0.sdf"))

    assert (
        files
    ), f"The file 'ligand_0.sdf' does not exist in any subdirectory of '{outdir}'."