From 9e613840169997e1a6715e6f4c4029509bb85570 Mon Sep 17 00:00:00 2001 From: Brandon Duane Walker Date: Fri, 2 Feb 2024 16:01:01 -0500 Subject: [PATCH] sanitize_ligand --- utils/sanitize-ligand-plugin/.bumpversion.cfg | 29 +++ utils/sanitize-ligand-plugin/.dockerignore | 4 + utils/sanitize-ligand-plugin/CHANGELOG.md | 5 + utils/sanitize-ligand-plugin/Dockerfile | 27 +++ utils/sanitize-ligand-plugin/README.md | 18 ++ utils/sanitize-ligand-plugin/VERSION | 1 + utils/sanitize-ligand-plugin/build-docker.sh | 4 + utils/sanitize-ligand-plugin/plugin.json | 36 ++++ utils/sanitize-ligand-plugin/pyproject.toml | 30 +++ utils/sanitize-ligand-plugin/run-plugin.sh | 13 ++ .../src/polus/mm/utils/__init__.py | 7 + .../src/polus/mm/utils/__main__.py | 63 ++++++ .../src/polus/mm/utils/sanitize_ligand.py | 197 ++++++++++++++++++ .../sanitize-ligand-plugin/tests/__init__.py | 1 + .../tests/test_sanitize.py | 26 +++ 15 files changed, 461 insertions(+) create mode 100644 utils/sanitize-ligand-plugin/.bumpversion.cfg create mode 100644 utils/sanitize-ligand-plugin/.dockerignore create mode 100644 utils/sanitize-ligand-plugin/CHANGELOG.md create mode 100644 utils/sanitize-ligand-plugin/Dockerfile create mode 100644 utils/sanitize-ligand-plugin/README.md create mode 100644 utils/sanitize-ligand-plugin/VERSION create mode 100755 utils/sanitize-ligand-plugin/build-docker.sh create mode 100644 utils/sanitize-ligand-plugin/plugin.json create mode 100644 utils/sanitize-ligand-plugin/pyproject.toml create mode 100755 utils/sanitize-ligand-plugin/run-plugin.sh create mode 100644 utils/sanitize-ligand-plugin/src/polus/mm/utils/__init__.py create mode 100644 utils/sanitize-ligand-plugin/src/polus/mm/utils/__main__.py create mode 100644 utils/sanitize-ligand-plugin/src/polus/mm/utils/sanitize_ligand.py create mode 100644 utils/sanitize-ligand-plugin/tests/__init__.py create mode 100644 utils/sanitize-ligand-plugin/tests/test_sanitize.py diff --git a/utils/sanitize-ligand-plugin/.bumpversion.cfg 
b/utils/sanitize-ligand-plugin/.bumpversion.cfg new file mode 100644 index 00000000..8897a253 --- /dev/null +++ b/utils/sanitize-ligand-plugin/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.1-dev1 +commit = False +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:plugin.json] + +[bumpversion:file:src/polus/mm/utils/__init__.py] diff --git a/utils/sanitize-ligand-plugin/.dockerignore b/utils/sanitize-ligand-plugin/.dockerignore new file mode 100644 index 00000000..7c603f81 --- /dev/null +++ b/utils/sanitize-ligand-plugin/.dockerignore @@ -0,0 +1,4 @@ +.venv +out +tests +__pycache__ diff --git a/utils/sanitize-ligand-plugin/CHANGELOG.md b/utils/sanitize-ligand-plugin/CHANGELOG.md new file mode 100644 index 00000000..b67793f7 --- /dev/null +++ b/utils/sanitize-ligand-plugin/CHANGELOG.md @@ -0,0 +1,5 @@ +# CHANGELOG + +## 0.1.0 + +Initial release. 
diff --git a/utils/sanitize-ligand-plugin/Dockerfile b/utils/sanitize-ligand-plugin/Dockerfile new file mode 100644 index 00000000..731c9690 --- /dev/null +++ b/utils/sanitize-ligand-plugin/Dockerfile @@ -0,0 +1,27 @@ +FROM condaforge/mambaforge + +# environment variables defined +ENV EXEC_DIR="/opt/executables" + +RUN mkdir -p ${EXEC_DIR} +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY CHANGELOG.md ${EXEC_DIR} + +RUN conda install -c conda-forge rdkit --yes + +RUN pip install filepattern + +RUN conda init bash + +RUN mamba clean --all --yes + +ADD Dockerfile ${EXEC_DIR} + +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir +# Default command. Additional arguments are provided through the command line +ENTRYPOINT ["python3", "-m", "polus.mm.utils"] +CMD ["--help"] diff --git a/utils/sanitize-ligand-plugin/README.md b/utils/sanitize-ligand-plugin/README.md new file mode 100644 index 00000000..cb70b45e --- /dev/null +++ b/utils/sanitize-ligand-plugin/README.md @@ -0,0 +1,18 @@ +# Sanitize Ligand (0.1.1-dev1) + +Handle molecules with rdkit errors gracefully. + +## Building + +To build the Docker image for the sanitize-ligand plugin, run `./build-docker.sh`. +To install the plugin, paste the contents of `plugin.json` into the pop-up window and submit. 
+ +## Options + +This plugin takes 2 input arguments and 1 output argument: + +| Name | Description | I/O | Type | +|------------------------|----------------------------------|--------|--------| +| pattern | Filepattern to parse files | Input | string +| indir | Input collection | Input | collection +| outdir | Output collection | Output | collection diff --git a/utils/sanitize-ligand-plugin/VERSION b/utils/sanitize-ligand-plugin/VERSION new file mode 100644 index 00000000..916e7d19 --- /dev/null +++ b/utils/sanitize-ligand-plugin/VERSION @@ -0,0 +1 @@ +0.1.1-dev1 diff --git a/utils/sanitize-ligand-plugin/build-docker.sh b/utils/sanitize-ligand-plugin/build-docker.sh new file mode 100755 index 00000000..935ea0c1 --- /dev/null +++ b/utils/sanitize-ligand-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$("] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.8" +typer = "^0.7.0" +rdkit = "*" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pytest = "^7.4" +pytest-sugar = "^0.9.6" +pre-commit = "^3.2.1" +black = "^23.3.0" +mypy = "^1.1.1" +ruff = "^0.0.270" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = [ + "." 
# pylint: disable=E0401,E1101,I1101
"""Filter molecules with kekulization errors.

Handle molecules with sanitization errors by assigning formal charges based
on valence.
"""
# https://depth-first.com/articles/2020/02/10/a-comprehensive-treatment-of-aromaticity-in-the-smiles-language/
import logging
from os import environ
from pathlib import Path
from typing import Optional

import rdkit
import typer
from rdkit import Chem
from rdkit.Chem import AllChem

app = typer.Typer()

# Set up logging parameters
POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "WARNING"))
logger = logging.getLogger("polus.mm.utils.sanitize_ligand")
logger.setLevel(POLUS_LOG)

# Formal charge implied by an atom's explicit valence, keyed by atomic number.
# Atoms whose atomic number is absent from this table are never modified.
_VALENCE_TO_FORMAL_CHARGE: dict[int, dict[int, int]] = {
    1: {2: 1},
    5: {4: 1},
    6: {3: -1},
    7: {2: -1, 4: 1},
    8: {1: -1, 3: 1},
    9: {0: -1},
    15: {4: 1},
    16: {1: -1, 3: 1, 5: -1},
    17: {0: -1, 4: 3},
    35: {0: -1},
    53: {0: -1},
}
_CARBON_ATOMIC_NUM = 6
_CARBOCATION_VALENCE = 3
# Nitrogen, oxygen and sulfur: neighbors treated as polarizable.
_POLARIZABLE_ATOMIC_NUMS = frozenset({7, 8, 16})


def adjust_formal_charges(molecule: Chem.Mol) -> Chem.Mol:
    """Correct formal charges based on valence.

    Sometimes input structures do not have correct formal charges
    corresponding to bond order topology. The bond orders are trusted and
    the formal charge of each atom is regenerated from its explicit valence
    using a per-element valence -> charge table. Atoms of a tabulated element
    whose valence is not listed are reset to a formal charge of 0; atoms of
    untabulated elements are left untouched.

    Special case: a carbon with an explicit valence of 3 that has a nitrogen,
    oxygen or sulfur neighbor (polarizable atoms) is given a formal charge
    of +1, which is more stable than the -1 alternative.

    Args:
        molecule (Chem.Mol): The rdkit molecule object (modified in place)

    Returns:
        Chem.Mol: The same molecule object with adjusted formal charges
    """
    # This function is reached after a strict sanitization attempt aborted,
    # so some atoms may have no cached valence; GetExplicitValence() raises
    # in that case. Recompute the cache without strict checking first.
    molecule.UpdatePropertyCache(strict=False)

    for atom in molecule.GetAtoms():
        atomnum = atom.GetAtomicNum()
        val = atom.GetExplicitValence()
        valtochg = _VALENCE_TO_FORMAL_CHARGE.get(atomnum)
        if valtochg is None:
            continue
        chg = valtochg.get(val, 0)
        # special case of polar neighbors surrounding carbon
        # https://docs.eyesopen.com/toolkits/cpp/oechemtk/valence.html#openeye-charge-model
        #
        # See Section 6: Factors That Stabilize Carbocations - Adjacent Lone Pairs
        # https://www.masterorganicchemistry.com/2011/03/11/3-factors-that-stabilize-carbocations/#six
        if (
            atomnum == _CARBON_ATOMIC_NUM
            and val == _CARBOCATION_VALENCE
            and any(
                natom.GetAtomicNum() in _POLARIZABLE_ATOMIC_NUMS
                for natom in atom.GetNeighbors()
            )
        ):
            chg = 1

        atom.SetFormalCharge(chg)
    return molecule


def generate_conformer(molecule: Chem.Mol) -> None:
    """Generate a conformer for the molecule.

    Embeds the molecule with ETKDGv2 parameters. If the first embedding
    reports failure (conformer id -1), the embedding is retried once starting
    from random coordinates. A single MMFF optimization step is then run on
    conformer 0; its only purpose is to surface a "Bad Conformer Id" error
    when no conformer could be produced.

    Args:
        molecule (Chem.Mol): The rdkit molecule object (modified in place)

    Raises:
        ValueError: Raised by rdkit when conformer 0 does not exist.
    """
    ps = AllChem.ETKDGv2()
    confid = AllChem.EmbedMolecule(molecule, ps)
    if confid == -1:
        ps.useRandomCoords = True
        AllChem.EmbedMolecule(molecule, ps)
    # only want to catch error Bad Conformation Id,
    # don't need to spend time optimizing
    AllChem.MMFFOptimizeMolecule(molecule, confId=0, maxIters=1)


def is_valid_ligand(molecule: Chem.Mol) -> bool:
    """Check if ligand is valid.

    Args:
        molecule (Chem.Mol): The rdkit small molecule object

    Returns:
        bool: True when rdkit sanitization succeeds, False when it raises
    """
    try:
        Chem.SanitizeMol(molecule)
    except Exception:  # pylint: disable=broad-exception-caught # noqa: BLE001
        # Any rdkit failure means "not valid"; unlike a bare ``except`` this
        # still lets KeyboardInterrupt / SystemExit propagate.
        return False
    return True


def attempt_fix_ligand(
    molecule: Chem.Mol,
) -> tuple[bool, Chem.Mol]:
    """Attempt to fix ligand if not valid.

    Check for sanitization errors, attempt to fix formal
    charges/valence consistency errors. DiffDock uses rdkit
    to generate a seed conformation that will sometimes crash,
    so generating conformations here to catch that error and
    prevent DiffDock from running that ligand.

    Args:
        molecule (Chem.Mol): The rdkit small molecule object

    Returns:
        bool: if ligand is valid
        Chem.Mol: molecule object
    """
    valid_lig = True
    try:
        Chem.SanitizeMol(molecule)
        generate_conformer(molecule)
    except rdkit.Chem.rdchem.KekulizeException as e:
        # Not handling kekulization error now so just mark as invalid
        # to prevent DiffDock execution
        valid_lig = False
        logger.warning("KekulizeException: %s", e)
    except rdkit.Chem.rdchem.MolSanitizeException:
        # can also be explicit valence error (i.e.) formal charge
        # not consistent with bond topology
        # choose to trust bond topology around atom and add formal
        # charge based on that
        try:
            molecule = adjust_formal_charges(molecule)
        except Exception as e:  # pylint: disable=broad-exception-caught # noqa: BLE001
            # An error raised inside this handler would bypass the sibling
            # ``except`` clauses below, so it is contained here.
            valid_lig = False
            logger.warning("Exception: %s", e)
    except ValueError as e:
        # assuming this is Bad Conformer Id error
        # from generate_conformer
        valid_lig = False
        logger.warning("ValueError: %s", e)
    except Exception as e:  # pylint: disable=broad-exception-caught # noqa: BLE001
        # catch *all* exceptions rdkit can throw
        valid_lig = False
        logger.warning("Exception: %s", e)

    return valid_lig, molecule


def _read_first_molecule(sdf_path: Path) -> Optional[Chem.Mol]:
    """Return the first record of an SDF file, or None if there is none.

    The molecule is read unsanitized and with hydrogens kept.
    """
    supplier = Chem.SDMolSupplier(
        str(sdf_path.resolve()),
        sanitize=False,
        removeHs=False,
    )
    # An empty file yields no records and an unparsable record is None;
    # both cases collapse to None here instead of raising IndexError.
    return next(iter(supplier), None)


def sanitize_ligand(
    ligand_files: list[Path],
    outdir: Path,
) -> None:
    """Sanitize ligand files.

    For every input file the first molecule is read and checked. Valid
    molecules (or molecules that could be fixed) are written to ``outdir``
    under the input file's name; invalid ones produce no output file.
    When exactly one file is given, ``outdir/valid.txt`` is also written
    with the string form of the validity flag.

    Args:
        ligand_files: Paths of the ligand SDF files
        outdir: Output directory
    """
    valid_ligand = False
    for input_small_mol_ligand in ligand_files:
        output_small_mol_ligand = outdir / input_small_mol_ligand.name
        mol = _read_first_molecule(input_small_mol_ligand)

        rdkit_mol = mol
        if mol is None:
            valid_ligand = False
            logger.warning("No readable molecule in %s", input_small_mol_ligand)
        elif is_valid_ligand(mol):
            valid_ligand = True
        else:
            valid_ligand, rdkit_mol = attempt_fix_ligand(mol)

        if valid_ligand:
            with Chem.SDWriter(str(output_small_mol_ligand)) as w:
                w.write(rdkit_mol)

    if len(ligand_files) == 1:
        # if scattering with many files
        # let the presence of the file indicate validity
        with outdir.joinpath("valid.txt").open("w", encoding="utf-8") as f:
            f.write(str(valid_ligand))
"""Test the sanitize_ligand plugin."""
import pytest
from rdkit import Chem
from src.polus.mm.utils.sanitize_ligand import attempt_fix_ligand


@pytest.mark.catch_error()
def test_kekulization_error_catch() -> None:
    """Test catching Kekulization error.

    Can't kekulize mol. Unkekulized atoms: 6 7 8 9 10.
    """
    # Parse without sanitization: with the default sanitize=True,
    # MolFromSmiles returns None for this SMILES, so the kekulization
    # handler in attempt_fix_ligand would never be exercised.
    mol = Chem.MolFromSmiles("c1ccc(cc1)-c1nnc(n1)-c1ccccc1", sanitize=False)
    assert mol is not None
    valid_ligand, _ = attempt_fix_ligand(mol)
    assert not valid_ligand


@pytest.mark.fix_ligand()
def test_fix_explicit_valence_error() -> None:
    """Test fixing explicit valence error.

    Explicit valence for atom # 1 C, 5, is greater than permitted
    """
    mol = Chem.MolFromSmiles("c1c(ccc2NC(CN=c(c21)(C)C)=O)O", sanitize=False)
    assert mol is not None
    valid_ligand, _ = attempt_fix_ligand(mol)
    assert valid_ligand