From 724f8139018c4a6831ebc4dc1c873915d6ea5655 Mon Sep 17 00:00:00 2001
From: Brandon Duane Walker <walkerbd@UPDATEME.HOSTNAME.COM>
Date: Thu, 9 May 2024 14:52:19 -0400
Subject: [PATCH] generate-conformers

---
 .../.bumpversion.cfg                          |  29 +++
 .../.gitattributes                            |   5 +
 .../CHANGELOG.md                              |   5 +
 .../Dockerfile                                |  23 +++
 .../README.md                                 |  22 ++
 .../pdbbind-generate-conformers-tool/VERSION  |   1 +
 .../build-docker.sh                           |   4 +
 .../environment.yml                           |  10 +
 .../pdbbind-generate-conformers-tool/ict.yml  | 123 ++++++++++++
 .../ligand_0.sdf                              |   3 +
 .../pdbbind_generate_conformers_0@1@0.cwl     | 188 ++++++++++++++++++
 .../pyproject.toml                            |  33 +++
 .../pdbbind_generate_conformers/__init__.py   |   7 +
 .../pdbbind_generate_conformers/__main__.py   |  90 +++++++++
 .../pdbbind_generate_conformers.py            | 163 +++++++++++++++
 .../tests/__init__.py                         |   1 +
 .../tests/ncats_target_based_curated.xlsx     |   3 +
 .../tests/test_pdbbind_generate_conformers.py |  65 ++++++
 18 files changed, 775 insertions(+)
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/environment.yml
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__init__.py
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__main__.py
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/pdbbind_generate_conformers.py
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/__init__.py
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/ncats_target_based_curated.xlsx
 create mode 100644 utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/test_pdbbind_generate_conformers.py
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg
new file mode 100644
index 00000000..def09b3b
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/pdbbind_generate_conformers/__init__.py]
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes
new file mode 100644
index 00000000..07fedc1e
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/.gitattributes
@@ -0,0 +1,5 @@
+*.pdb filter=lfs diff=lfs merge=lfs -text
+*.pdbqt filter=lfs diff=lfs merge=lfs -text
+*.mol2 filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.sdf filter=lfs diff=lfs merge=lfs -text
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md
new file mode 100644
index 00000000..b67793f7
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile
new file mode 100644
index 00000000..79ccaab7
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/Dockerfile
@@ -0,0 +1,23 @@
+# docker build -f Dockerfile -t polusai/pdbbind-generate-conformers-tool .
+FROM condaforge/mambaforge
+
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+RUN mkdir -p ${EXEC_DIR}
+
+# Work directory defined in the base container
+# WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY CHANGELOG.md ${EXEC_DIR}
+
+# Install needed packages here
+
+COPY src ${EXEC_DIR}/src
+ADD Dockerfile .
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+CMD ["--help"]
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md
new file mode 100644
index 00000000..c0d20c55
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/README.md
@@ -0,0 +1,22 @@
+# pdbbind_generate_conformers (0.1.0)
+
+Download the PDBbind refined database and generate conformers from SMILES
+
+## Options
+
+This plugin takes 9 input arguments and 3 output arguments:
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| input_excel_path |  | Input | File | File |
+| query | query str to search the dataset, Type: string, File type: input, Accepted formats: txt | Input | string | string |
+| output_txt_path | Path to the text dataset file, Type: string, File type: output, Accepted formats: txt | Input | string | string |
+| output_sdf_path | Path to the input file, Type: string, File type: input, Accepted formats: sdf | Input | string | string |
+| min_row | The row min index, Type: int | Input | int | int |
+| max_row | The row max index, Type: int | Input | int | int |
+| smiles_column | The name of the smiles column, Type: string, File type: output, Accepted formats: txt | Input | string | string |
+| binding_data_column | The name of the binding data column, Type: string, File type: output, Accepted formats: txt | Input | string | string |
+| convert_Kd_dG | If this is set to true, dG will be calculated | Input | boolean | boolean |
+| output_txt_path | Path to the txt file | Output | File | File |
+| output_sdf_path | Paths to the generated ligand SDF files (ligand_0.sdf, ligand_1.sdf, ...), Accepted formats: sdf | Output | File[] | File[] |
+| experimental_dGs | Experimental Free Energies of Binding | Output | float[] | float[] |
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION
new file mode 100644
index 00000000..6e8bf73a
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh
new file mode 100644
index 00000000..15e1d9f2
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/pdbbind-generate-conformers-tool:${version}
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/environment.yml b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/environment.yml
new file mode 100644
index 00000000..077ec3e9
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/environment.yml
@@ -0,0 +1,10 @@
+name: project_env
+channels:
+  - conda-forge
+dependencies:
+  - python>=3.9,<3.12
+  - pandas==2.2.2
+  - rdkit==2024.03.2
+  - openpyxl==3.1.2
+  - xorg-libxrender==0.9.11
+  - pytest==8.2.0
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml
new file mode 100644
index 00000000..beca1deb
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ict.yml
@@ -0,0 +1,123 @@
+specVersion: "0.1.0"
+name: pdbbind_generate_conformers
+version: 0.1.0
+container: generate-conformers-tool
+entrypoint:
+title: pdbbind_generate_conformers
+description: Download the PDBbind refined database and generate conformers from SMILES
+author: Brandon Walker, Nazanin Donyapour
+contact: brandon.walker@axleinfo.com, nazanin.donyapour@nih.gov
+repository:
+documentation:
+citation:
+
+inputs:
+  - name: input_excel_path
+    required: true
+    description: Path to the input xlsx file
+    type: File
+    format:
+      uri: edam:format_3620
+  - name: query
+    required: true
+    description: query str to search the dataset, Type string, File type input, Accepted formats txt
+    type: string
+    format:
+      uri: edam:format_2330
+  - name: output_txt_path
+    required: true
+    description: Path to the text dataset file, Type string, File type output, Accepted formats txt
+    type: string
+    defaultValue: system.log
+    format:
+      uri: edam:format_2330
+  - name: output_sdf_path
+    required: true
+    description: Path to the input file, Type string, File type input, Accepted formats sdf
+    type: string
+    format:
+      uri: edam:format_3814
+  - name: min_row
+    required: true
+    description: The row min index, Type int
+    type: int
+    format:
+      uri: edam:format_2330
+  - name: max_row
+    required: true
+    description: The row max index, Type int
+    type: int
+    format:
+      uri: edam:format_2330
+  - name: smiles_column
+    required: true
+    description: The name of the smiles column, Type string, File type output, Accepted formats txt
+    type: string
+    format:
+      uri: edam:format_2330
+  - name: binding_data_column
+    required: true
+    description: The name of the binding data column, Type string, File type output, Accepted formats txt
+    type: string
+    format:
+      uri: edam:format_2330
+  - name: convert_Kd_dG
+    required: true
+    description: If this is set to true, dG will be calculated
+    type: boolean
+    format:
+      uri: edam:format_2330
+outputs:
+  - name: output_txt_path
+    required: true
+    description: Path to the txt file
+    type: File
+    format:
+      uri: edam:format_2330
+  - name: output_sdf_path
+    required: true
+    description: Paths to the generated ligand SDF files, Accepted formats sdf
+    type: File[]
+    format:
+      uri: edam:format_3814
+  - name: experimental_dGs
+    required: true
+    description: Experimental Free Energies of Binding
+    type: float[]
+ui:
+  - key: inputs.input_excel_path
+    title: "input_excel_path: "
+    description: "Path to the input xlsx file"
+    type: File
+  - key: inputs.query
+    title: "query: "
+    description: "query str to search the dataset, Type string, File type input, Accepted formats txt"
+    type: string
+  - key: inputs.output_txt_path
+    title: "output_txt_path: "
+    description: "Path to the text dataset file, Type string, File type output, Accepted formats txt"
+    type: string
+  - key: inputs.output_sdf_path
+    title: "output_sdf_path: "
+    description: "Path to the input file, Type string, File type input, Accepted formats sdf"
+    type: string
+  - key: inputs.min_row
+    title: "min_row: "
+    description: "The row min index, Type int"
+    type: int
+  - key: inputs.max_row
+    title: "max_row: "
+    description: "The row max index, Type int"
+    type: int
+  - key: inputs.smiles_column
+    title: "smiles_column: "
+    description: "The name of the smiles column, Type string, File type output, Accepted formats txt"
+    type: string
+  - key: inputs.binding_data_column
+    title: "binding_data_column: "
+    description: "The name of the binding data column, Type string, File type output, Accepted formats txt"
+    type: string
+  - key: inputs.convert_Kd_dG
+    title: "convert_Kd_dG: "
+    description: "If this is set to true, dG will be calculated"
+    type: checkbox
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf
new file mode 100644
index 00000000..4b06a341
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/ligand_0.sdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbfdb1517bf21136e47fcee2247396d0ecc53b2e7c90a9462ebe0d92934f38a2
+size 2427
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl
new file mode 100644
index 00000000..4497a3a6
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pdbbind_generate_conformers_0@1@0.cwl
@@ -0,0 +1,188 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: Download the PDBbind refined database and generate conformers from SMILES
+
+doc: |-
+  Download the PDBbind refined database and generate conformers from SMILES
+
+baseCommand: ["python", "-m", "polus.mm.utils.pdbbind_generate_conformers"]
+
+hints:
+  DockerRequirement:
+    dockerPull: polusai/pdbbind-generate-conformers-tool@sha256:f688a0e8fa1a2e909c62501d4039e984d75dd43dcdc21f80d4d102a04e0b02d7
+
+requirements:
+  InlineJavascriptRequirement: {}
+
+inputs:
+  input_excel_path:
+    label: Path to the input xlsx file
+    type: File
+    format: edam:format_3620
+    inputBinding:
+      prefix: --input_excel_path
+
+  query:
+    label: query str to search the dataset
+    doc: |-
+      query str to search the dataset
+      Type: string
+      File type: input
+      Accepted formats: txt
+    type: string
+    format: edam:format_2330
+    inputBinding:
+      prefix: --query
+
+  output_txt_path:
+    label: Path to the text dataset file
+    doc: |-
+      Path to the text dataset file
+      Type: string
+      File type: output
+      Accepted formats: txt
+    type: string
+    format: edam:format_2330
+    inputBinding:
+      prefix: --output_txt_path
+    default: system.log
+
+  output_sdf_path:
+    label: Path to the input file
+    doc: |-
+      Path to the input file
+      Type: string
+      File type: input
+      Accepted formats: sdf
+    type: string?
+    format: edam:format_3814 # sdf
+
+  min_row:
+    label: The row min index
+    doc: |-
+      The row min index
+      Type: int
+    type: int?
+    format: edam:format_2330
+    inputBinding:
+      prefix: --min_row
+
+  max_row:
+    label: The row max index
+    doc: |-
+      The row max index
+      Type: int
+    type: int?
+    format: edam:format_2330
+    inputBinding:
+      prefix: --max_row
+
+  smiles_column:
+    label: The name of the smiles column
+    doc: |-
+      The name of the smiles column
+      Type: string
+      File type: output
+      Accepted formats: txt
+    type: string
+    format: edam:format_2330
+    inputBinding:
+      prefix: --smiles_column
+
+  binding_data_column:
+    label: The name of the binding data column
+    doc: |-
+      The name of the binding data column
+      Type: string
+      File type: output
+      Accepted formats: txt
+    type: string
+    format: edam:format_2330
+    inputBinding:
+      prefix: --binding_data_column
+
+  convert_kd_dg:
+    label: If this is set to true, dG will be calculated
+    doc: If this is set to true, dG will be calculated
+    type: boolean
+    format: edam:format_2330
+    inputBinding:
+      prefix: --convert_kd_dg
+    default: False
+
+outputs:
+  output_txt_path:
+    label: Path to the txt file
+    doc: |-
+      Path to the txt file
+    type: File
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+    format: edam:format_2330
+
+  output_sdf_path:
+    label: Path to the input file
+    doc: |-
+      Path to the input file
+      Type: string
+      File type: input
+      Accepted formats: sdf
+    type: File[]
+    outputBinding:
+      # NOTE: Do NOT just use glob: ./*.sdf !!! This will return an array sorted by filenames.
+      # We want the order of output_sdf_paths to match the order of experimental_dGs, etc
+      # Because we need to compare experimental ΔGs with predicted values.
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          var lines = self[0].contents.split("\n");
+          var sdfs = [];
+          for (var idx = 0; idx < lines.length; idx++) {
+            var words = lines[idx].split(" ");
+            var sdffile = {"class": "File", "path": "ligand_" + idx + ".sdf"};
+            sdfs.push(sdffile);
+            }
+
+          return sdfs;
+        }
+    format: edam:format_3814
+
+  experimental_dGs:
+    label: Experimental Free Energies of Binding
+    doc: |-
+      Experimental Free Energies of Binding
+    type: float[]
+    outputBinding:
+      # NOTE: The dGs are parsed from the text file line by line, so they stay in row order.
+      # We want the order of output_sdf_paths to match the order of experimental_dGs, etc
+      # Because we need to compare experimental ΔGs with predicted values.
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          var lines = self[0].contents.split("\n");
+          var experimental_dGs = [];
+          for (var i = 0; i < lines.length; i++) {
+            var words = lines[i].split(" ");
+            if (words.length > 2) {
+              var experimental_dG = parseFloat(words[2]);
+              experimental_dGs.push(experimental_dG);
+            }
+          }
+
+          if (experimental_dGs.length == 0) {
+            throw new Error("Error! Experimental dGs are empty!");
+          } else {
+            return experimental_dGs;
+          }
+        }
+
+$namespaces:
+  edam: https://edamontology.org/
+
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml
new file mode 100644
index 00000000..2bb307ab
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/pyproject.toml
@@ -0,0 +1,33 @@
+[tool.poetry]
+name = "polus-mm-utils-generate-conformers"
+version = "0.1.0"
+description = "Download the PDBbind refined database"
+authors = ["Data Scientist <data.scientist@labshare.org>"]
+readme = "README.md"
+packages = [{include = "polus", from = "src"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+typer = "^0.7.0"
+sophios = "0.1.1"
+openpyxl = "3.1.5"
+pandas = "2.2.2"
+rdkit = "2024.3.3"
+
+[tool.poetry.group.dev.dependencies]
+bump2version = "^1.0.1"
+pytest = "^7.4"
+pytest-sugar = "^0.9.6"
+pre-commit = "^3.2.1"
+black = "^23.3.0"
+mypy = "^1.1.1"
+ruff = "^0.0.270"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+pythonpath = [
+  "."
+]
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__init__.py b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__init__.py
new file mode 100644
index 00000000..e84b7b9b
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__init__.py
@@ -0,0 +1,7 @@
+"""pdbbind_generate_conformers."""
+
+__version__ = "0.1.0"
+
+from polus.mm.utils.pdbbind_generate_conformers.pdbbind_generate_conformers import (  # noqa # pylint: disable=unused-import
+    pdbbind_generate_conformers,
+)
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__main__.py b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__main__.py
new file mode 100644
index 00000000..79396bf2
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/__main__.py
@@ -0,0 +1,90 @@
+"""Package entrypoint for the pdbbind_generate_conformers package."""
+
+# Base packages
+import logging
+from os import environ
+from pathlib import Path
+
+import typer
+from polus.mm.utils.pdbbind_generate_conformers.pdbbind_generate_conformers import (
+    pdbbind_generate_conformers,
+)
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
+    datefmt="%d-%b-%y %H:%M:%S",
+)
+POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO"))
+logger = logging.getLogger("polus.mm.utils.pdbbind_generate_conformers.")
+logger.setLevel(POLUS_LOG)
+
+app = typer.Typer(help="pdbbind_generate_conformers.")
+
+
+@app.command()
+def main(  # noqa: PLR0913
+    input_excel_path: Path = typer.Option(
+        ...,
+        "--input_excel_path",
+        help="",
+    ),
+    query: str = typer.Option(
+        ...,
+        "--query",
+        help="query str to search the dataset",
+    ),
+    output_txt_path: str = typer.Option(
+        ...,
+        "--output_txt_path",
+        help="Path to the text dataset file",
+    ),
+    min_row: int = typer.Option(
+        ...,
+        "--min_row",
+        help="The row min inex, Type int",
+    ),
+    max_row: int = typer.Option(
+        ...,
+        "--max_row",
+        help="The row max inex, Type int",
+    ),
+    smiles_column: str = typer.Option(
+        ...,
+        "--smiles_column",
+        help="The name of the smiles column",
+    ),
+    binding_data_column: str = typer.Option(
+        ...,
+        "--binding_data_column",
+        help="The name of the binding data column",
+    ),
+    convert_kd_dg: bool = typer.Option(
+        ...,
+        "--convert_kd_dg",
+        help="If this is set to true, dG will be calculated",
+    ),
+) -> None:
+    """pdbbind_generate_conformers."""
+    logger.info(f"input_excel_path: {input_excel_path}")
+    logger.info(f"query: {query}")
+    logger.info(f"output_txt_path: {output_txt_path}")
+    logger.info(f"min_row: {min_row}")
+    logger.info(f"max_row: {max_row}")
+    logger.info(f"smiles_column: {smiles_column}")
+    logger.info(f"binding_data_column: {binding_data_column}")
+    logger.info(f"convert_kd_dg: {convert_kd_dg}")
+
+    pdbbind_generate_conformers(
+        input_excel_path=input_excel_path,
+        query=query,
+        output_txt_path=output_txt_path,
+        min_row=min_row,
+        max_row=max_row,
+        smiles_column=smiles_column,
+        binding_data_column=binding_data_column,
+        convert_kd_dg=convert_kd_dg,
+    )
+
+
+if __name__ == "__main__":
+    app()
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/pdbbind_generate_conformers.py b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/pdbbind_generate_conformers.py
new file mode 100644
index 00000000..a6886313
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/src/polus/mm/utils/pdbbind_generate_conformers/pdbbind_generate_conformers.py
@@ -0,0 +1,163 @@
+"""Generate conformers for a dataset of ligands and binding data."""
+import math
+from pathlib import Path
+
+import pandas
+import rdkit
+from rdkit import Chem
+from rdkit.Chem import AllChem
+
+
+def pdbbind_generate_conformers(  # noqa: PLR0913
+    input_excel_path: Path,
+    query: str,
+    output_txt_path: str,
+    min_row: int,
+    max_row: int,
+    smiles_column: str,
+    binding_data_column: str,
+    convert_kd_dg: bool,
+) -> None:
+    """pdbbind_generate_conformers.
+
+    Args:
+        input_excel_path: Path to the input xlsx file
+        query: query str to search the dataset
+        output_txt_path: Path to the text dataset file
+        min_row: The row min index
+        max_row: The row max index
+        smiles_column: The name of the smiles column
+        binding_data_column: The name of the binding data column
+        convert_kd_dg: If this is set to true, dG will be calculated
+    Returns:
+        None
+    """
+    load_data(
+        input_excel_path,
+        query,
+        smiles_column,
+        binding_data_column,
+        output_txt_path,
+        min_row,
+        max_row,
+        convert_kd_dg,
+    )
+
+
+def calculate_dg(kd: float) -> float:
+    """Calculates binding free energy from kd, See https://en.wikipedia.org/wiki/Binding_constant.
+
+    Args:
+        kd (float): The binding affinity of the protein-ligand complex
+
+    Returns:
+        float: The binding free energy
+    """
+    # Calculate the binding free energy from kd so we can make the correlation plots.
+    # See https://en.wikipedia.org/wiki/Binding_constant
+    ideal_gas_constant = 8.31446261815324  # J/(Mol*K)
+    kcal_per_joule = 4184
+    # NOTE: Unfortunately, the temperature at which experimental kd
+    # binding data was taken
+    # is often not recorded. Thus, we are forced to guess. The two standard guesses are
+    # physiological body temperature (310K) or room temperature (298K).
+    temperature = 298
+    rt = (ideal_gas_constant / kcal_per_joule) * temperature
+    # NOTE: For performance, simulations are often done in a very small unit cell, and
+    # thus at a very high concentration. The size of the unit cell bounds the volume.
+    # For shorter simulations where the ligand has not explored the entire box, it may
+    # be less. See the Yank paper for a method of calculating the correct volumes.
+    standard_concentration = 1  # Units of mol / L, but see comment above.
+    return rt * math.log(kd / standard_concentration)
+
+
+def load_data(  # noqa: PLR0913
+    input_excel_path: Path,
+    query: str,
+    smiles_column: str,
+    binding_data_column: str,
+    output_txt_path: str,
+    min_row: int = 1,
+    max_row: int = -1,
+    convert_kd_dg: bool = False,
+) -> None:
+    """Reads SMILES strings and numerical binding affinity data.
+
+    from the given Excel spreadsheet using a Pandas query.
+
+    Args:
+        input_excel_path: (Path): Path to the input xlsx file
+        query (str): The Query to perform
+        min_row (int): min index of rows. Defaults to 1.
+        max_row (int): max index of rows. Defaults to -1.
+        smiles_column (str): The name of smiles column
+        binding_data_column (str): The name of the binding data column
+        convert_kd_dg (bool): If this set to True,
+        The dG will be calculated. Defaults to False.
+        output_txt_path (str): The output text file
+    """
+    df = pandas.read_excel(str(input_excel_path), sheet_name=1)  # Requires openpyxl
+    # 'Focus_Reduction_Assay', 'Proliferation', 'Antigen_Expression', \
+    # 'Staining_Based', \
+    # 'Flourescence', 'Viral_Titer', 'Cell_Viability_By_Neutral_Red_Uptake', \
+    # 'eGFP_Reduction', \
+    # 'Viral_Infection', 'Microscopy', 'Immunodetection', 'Replicon_Assay', \
+    # 'Antigen_Synthesis', \
+    # 'Luciferase_Reporter_Assay', 'Viral_Entry', 'MTT_Assay', 'RT-PCR', 'Cytopathy', \
+    # 'Flow_Cytometry', \
+    # 'Colorimetric', 'Luciferase_Reporter_Gene', 'Cell_Titer', 'Western_Blot', \
+    # 'Cytotoxicity',\
+    # 'SDS-PAGE', \
+    # 'Fluorescence', 'Image-Based', \
+    # 'Crystal_Violet_Staining_Assay', \
+    # 'Viral_Reduction_Assay']
+    # Standard Type ['IC50', 'EC90', 'Inhibition', 'Kd', 'EC50', 'Activity', 'Ki']
+    # Standard Relation [nan, "<'", "<='", ">'", "='", ">='"]
+    # Standard Units [nan, 'uM', '%']
+
+    df = df.query(query)
+
+    # Perform row slicing (if any)
+    if int(min_row) != 1 or int(max_row) != -1:
+        # We want to convert to zero-based indices and we also want
+        # the upper index to be inclusive (i.e. <=) so -1 lower index.
+        df = df[(int(min_row) - 1) : int(max_row)]
+
+    # Now restrict to the columns we actually care about.
+    columns = [smiles_column, binding_data_column]
+    df = df[columns].dropna()
+
+    # Generate 2D and/or 3D conformers
+    smiles_binding_data: list[str] = []
+    for idx, row in enumerate(df.values):
+        (smiles, binding_datum) = row
+        micromolar = 0.000001  # uM
+        binding_datum = binding_datum * micromolar
+
+        if convert_kd_dg:
+            dg = calculate_dg(binding_datum)
+            smiles_binding_data.append(f"{smiles} {binding_datum} {dg}")
+        else:
+            smiles_binding_data.append(f"{smiles} {binding_datum}")
+
+        # See https://www.rdkit.org/docs/GettingStartedInPython.html#working-with-3d-molecules
+        mol_2d: rdkit.Chem.rdchem.Mol = Chem.MolFromSmiles(
+            smiles,
+        )  # pylint: disable=c-extension-no-member,no-member
+        AllChem.Compute2DCoords(mol_2d)  # pylint: disable=no-member
+
+        # rdkit.Chem.rdmolops.AddHs
+        # NOTE: "Much of the code assumes that Hs are not included in
+        # the molecular topology,
+        # so be very careful with the molecule that comes back from this function."
+        mol_3d = Chem.AddHs(mol_2d)  # pylint: disable=no-member
+        AllChem.EmbedMolecule(mol_3d)  # pylint: disable=no-member
+        AllChem.MMFFOptimizeMolecule(mol_3d)  # pylint: disable=no-member
+
+        filename = f"ligand_{idx}.sdf"  # chemblid is NOT unique!
+        writer = Chem.SDWriter(filename)  # pylint: disable=no-member
+        writer.write(mol_3d)
+        writer.close()
+
+    with Path(output_txt_path).open(mode="w", encoding="utf-8") as f:
+        f.write("\n".join(smiles_binding_data))
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/__init__.py b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/__init__.py
new file mode 100644
index 00000000..29576a11
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for pdbbind_generate_conformers."""
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/ncats_target_based_curated.xlsx b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/ncats_target_based_curated.xlsx
new file mode 100644
index 00000000..8468c2a6
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/ncats_target_based_curated.xlsx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:66b8381a79a0f0930bd455558715afd16b68302497a09a0c18cd02fe0d3fa3c7
+size 5336902
diff --git a/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/test_pdbbind_generate_conformers.py b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/test_pdbbind_generate_conformers.py
new file mode 100644
index 00000000..9ca00cf4
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind-generate-conformers-tool/tests/test_pdbbind_generate_conformers.py
@@ -0,0 +1,65 @@
+"""Tests for pdbbind_generate_conformers."""
+from pathlib import Path
+
+from polus.mm.utils.pdbbind_generate_conformers.pdbbind_generate_conformers import (
+    pdbbind_generate_conformers,
+)
+from sophios.api.pythonapi import Step
+from sophios.api.pythonapi import Workflow
+
+
+def test_pdbbind_generate_conformers() -> None:
+    """Test pdbbind_generate_conformers."""
+    input_excel_path = "ncats_target_based_curated.xlsx"
+    path = Path(__file__).resolve().parent / Path(input_excel_path)
+    query = "`Standard Type` == 'Kd' and `duplicate-type-classifier` == 'unique'"
+    output_txt_path = "binding_data.txt"
+    min_row = 1
+    max_row = 1
+    smiles_column = "SMILES"
+    binding_data_column = "Standard Value"
+    convert_kd_dg = True
+
+    pdbbind_generate_conformers(
+        path,
+        query,
+        output_txt_path,
+        min_row,
+        max_row,
+        smiles_column,
+        binding_data_column,
+        convert_kd_dg,
+    )
+    assert Path("binding_data.txt").exists()
+
+
+def test_pdbbind_generate_conformers_cwl() -> None:
+    """Test pdbbind_generate_conformers CWL."""
+    cwl_file = Path("pdbbind_generate_conformers_0@1@0.cwl")
+
+    pdbbind_generate_conformers_step = Step(clt_path=cwl_file)
+    pdbbind_generate_conformers_step.input_excel_path = str(
+        Path(__file__).resolve().parent / Path("ncats_target_based_curated.xlsx"),
+    )
+    pdbbind_generate_conformers_step.query = (
+        "`Standard Type` == 'Kd' and `duplicate-type-classifier` == 'unique'"
+    )
+    pdbbind_generate_conformers_step.smiles_column = "SMILES"
+    pdbbind_generate_conformers_step.binding_data_column = "Standard Value"
+    pdbbind_generate_conformers_step.convert_kd_dg = True
+    pdbbind_generate_conformers_step.min_row = 1
+    pdbbind_generate_conformers_step.max_row = 1
+    pdbbind_generate_conformers_step.output_txt_path = "system.log"
+
+    steps = [pdbbind_generate_conformers_step]
+    filename = "pdbbind_generate_conformers"
+    workflow = Workflow(steps, filename)
+
+    workflow.run()
+
+    outdir = Path("outdir")
+    files = list(outdir.rglob("ligand_0.sdf"))
+
+    assert (
+        files
+    ), f"The file 'ligand_0.sdf' does not exist in any subdirectory of '{outdir}'."