From 99088639d77435336813ba2de32b6b4dd6cbc6c1 Mon Sep 17 00:00:00 2001
From: Brandon Duane Walker <walkerbd@UPDATEME.HOSTNAME.COM>
Date: Thu, 9 May 2024 16:56:10 -0400
Subject: [PATCH] pre-process: add download_pdb and download_smiles_ligand_db workflows

---
 pre-process/download-data/download_pdb.wic    | 73 +++++++++++++++++++
 .../download_smiles_ligand_db.wic             | 52 +++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 pre-process/download-data/download_pdb.wic
 create mode 100644 pre-process/download-data/download_smiles_ligand_db.wic

diff --git a/pre-process/download-data/download_pdb.wic b/pre-process/download-data/download_pdb.wic
new file mode 100644
index 00000000..6ede9807
--- /dev/null
+++ b/pre-process/download-data/download_pdb.wic
@@ -0,0 +1,73 @@
+inputs:
+  pdb_id:  # PDB code; consumed by the config_tag_pdb step
+    type: string
+    format:
+    - edam:format_1476  # NOTE(review): format annotation on a string input — confirm it is needed
+
+steps:
+  config_tag_pdb:  # step 1: takes pdb_id; its output_config_string is anchored as pdb_id_str
+    in:
+      pdb_id: pdb_id
+      #filter: "False"  # Disabled option: "False" = do not 'clean' the pdb file.
+    out:
+    - output_config_string: !& pdb_id_str
+  pdb:  # step 2: labelled 'Download and Clean PDB File' in the wic section
+    in:
+      output_pdb_path: !ii protein_models.pdb
+      config: !* pdb_id_str  # refers to the pdb_id_str anchor set by config_tag_pdb
+      #config: !ii {pdb_code: 1aki} # from tutorial
+      #config: !ii {pdb_code: 1enh} # "Structural Studies of the Engrailed Homeodomain" https://pubs.acs.org/doi/10.1021/acs.jpcb.8b02144
+
+      # Chignolin model mini-protein
+      #config: !ii {pdb_code: 1uao} # "10 residue folded peptide designed by segment statistics" http://dx.doi.org/10.1016/j.str.2004.05.022
+
+      # Trp-cage model mini-protein (use 1l2y; 2m7d, 2m7c and 6d37 have chain breaks, and 6d37 is a hexamer)
+      #config: !ii {pdb_code: 1l2y} # "Designing a 20-residue protein." http://dx.doi.org/10.1038/nsb798
+      #config: !ii {pdb_code: 2m7d} # "Folding Dynamics and Pathways of the Trp-Cage Miniproteins" https://doi.org/10.1021/bi501021r
+      #config: !ii {pdb_code: 2m7c} # "Circular Permutation of the Trp-cage: Fold Rescue upon Addition of a Hydrophobic Staple" http://dx.doi.org/10.1039/C3RA43674H
+      #config: !ii {pdb_code: 6d37} # "Trp-cage tr16b R16Nva : Elimination of pH Dependent Interactions" https://doi.org/10.1002/bip.23260
+
+      # Other model mini-proteins
+      #config: !ii {pdb_code: 5kwp} # "Accurate de novo design of hyperstable constrained peptides." http://dx.doi.org/10.1038/nature19791
+      #config: !ii {pdb_code: 6b17} # "Design of a short thermally stable alpha-helix embedded in a macrocycle" http://dx.doi.org/10.1002/cbic.201800026
+      #config: !ii {pdb_code: 1bzv} # "The solution structure of a superpotent B-chain-shortened single-replacement insulin analogue." http://dx.doi.org/10.1110/ps.8.3.499
+
+      # Trypsin
+      #config: !ii {pdb_code: 1trn} # Dimer. NOTE: the file downloaded by this step lacks residue 151, although residue 151 is present when the entry is downloaded with a web browser (cause unknown).
+      #config: !ii {pdb_code: 1ntp}
+      #config: !ii {pdb_code: 1bty} # Fails with "IndexError: list index out of range" in str_check_add_hydrogens
+  extract_model:  # step 3: labelled 'Extract First Model' in the wic section
+    in:
+      config: !ii
+        models: [1]  # only model 1 is requested
+    # out:  (left disabled; the protein.pdb anchor is set by python_script below)
+    # - output_structure_path: !& protein.pdb
+  python_script:  # step 4: labelled 'Remove Unknown Residues' in the wic section
+    in:
+      script: !ii ../scripts/atomselect.py
+      dockerPull: !ii jakefennick/atomselect
+      # Remove any unknown residues that weren't already 'cleaned' in the `pdb` step above.
+      selection_string: !ii not resname UNK
+    out:
+    - output_pdb_path: !& protein.pdb  # anchors this step's output as protein.pdb
+
+wic:  # graphviz labels for the workflow and for each numbered step
+  graphviz:
+    label: Molecular\nModeling  # NOTE(review): generic label for a PDB-download workflow — confirm intended
+  steps:
+    (1, config_tag_pdb):  # keys are (step number, step name), in the order used under `steps:` above
+      wic:
+        graphviz:
+          label: 'Specify PDB Code'
+    (2, pdb):
+      wic:
+        graphviz:
+          label: 'Download and Clean\nPDB File'
+    (3, extract_model):
+      wic:
+        graphviz:
+          label: 'Extract First Model'
+    (4, python_script):
+      wic:
+        graphviz:
+          label: 'Remove Unknown Residues'
diff --git a/pre-process/download-data/download_smiles_ligand_db.wic b/pre-process/download-data/download_smiles_ligand_db.wic
new file mode 100644
index 00000000..16ad0497
--- /dev/null
+++ b/pre-process/download-data/download_smiles_ligand_db.wic
@@ -0,0 +1,52 @@
+inputs:
+  path:  # passed to the wget_xlsx step as `url`
+    type: string
+  query:  # spreadsheet query; syntax: `column name` 'column value'
+    type: string
+  max_row:  # forwarded to generate_conformers; the note there says to use 1 for CI
+    type: int
+  smiles_column:  # forwarded to generate_conformers, e.g. SMILES
+    type: string
+  binding_data_column:  # forwarded to generate_conformers, e.g. Standard Value
+    type: string
+  convert_Kd_dG:  # NOTE(review): typed boolean, but the example under `steps:` is the string 'True' — confirm
+    type: boolean
+  output_txt_path:  # forwarded to generate_conformers under the same name
+    type: string
+
+outputs:  # workflow-level outputs
+  output_txt_path:  # NOTE(review): generate_conformers lists only output_sdf_path under `out:` — confirm this source resolves
+    type: File
+    format: edam:format_2330  # presumably the EDAM "textual format" term — verify
+    outputSource: download_smiles_ligand_db__step__2__generate_conformers/output_txt_path
+
+steps:
+  wget_xlsx:  # step 1: labelled 'Download Excel File' in the wic section
+    in:
+      url: path  # the workflow input `path` supplies the URL
+  generate_conformers:  # step 2: labelled 'Query Spreadsheet / Generate Conformers' in the wic section
+    in:
+      #input_excel_path:  # inferred, so it is not set explicitly here
+      # query syntax: `column name` 'column value'
+      query: query  # e.g. "`Standard Type` == 'Kd' and `duplicate-type-classifier` == 'unique'"
+      max_row: max_row  # e.g. 1 or 25; use 1 for CI
+      smiles_column: smiles_column  # e.g. SMILES
+      binding_data_column: binding_data_column  # e.g. Standard Value
+      convert_Kd_dG: convert_Kd_dG  # e.g. 'True'
+      output_txt_path: output_txt_path
+      output_sdf_path: !ii ligand_conformers.sdf
+    out:
+    - output_sdf_path: !& ligand_conformers.sdf  # anchors the SDF output as ligand_conformers.sdf
+
+wic:  # graphviz labels for the workflow and for each numbered step
+  graphviz:
+    label: Download Smiles\nLigand Database
+  steps:
+    (1, wget_xlsx):  # keys are (step number, step name), in the order used under `steps:` above
+      wic:
+        graphviz:
+          label: Download Excel File
+    (2, generate_conformers):
+      wic:
+        graphviz:
+          label: Query Spreadsheet\nGenerate Conformers