From 99088639d77435336813ba2de32b6b4dd6cbc6c1 Mon Sep 17 00:00:00 2001
From: Brandon Duane Walker
Date: Thu, 9 May 2024 16:56:10 -0400
Subject: [PATCH] pre-process

---
 pre-process/download-data/download_pdb.wic    | 73 +++++++++++++++++++
 .../download_smiles_ligand_db.wic             | 52 +++++++++++++
 2 files changed, 125 insertions(+)
 create mode 100644 pre-process/download-data/download_pdb.wic
 create mode 100644 pre-process/download-data/download_smiles_ligand_db.wic

diff --git a/pre-process/download-data/download_pdb.wic b/pre-process/download-data/download_pdb.wic
new file mode 100644
index 00000000..6ede9807
--- /dev/null
+++ b/pre-process/download-data/download_pdb.wic
@@ -0,0 +1,73 @@
+inputs: # Workflow: download a PDB entry by ID, extract the first model, remove UNK residues (see labels under wic: below).
+  pdb_id: # the only workflow-level input; consumed by the config_tag_pdb step
+    type: string
+    format:
+    - edam:format_1476
+
+steps: # four steps; their graph labels are assigned by (index, name) under wic: at the bottom
+  config_tag_pdb:
+    in:
+      pdb_id: pdb_id # same name as the workflow-level input declared above
+      #filter: "False" # False = do not 'clean' the pdb file.
+    out:
+    - output_config_string: !& pdb_id_str # tagged pdb_id_str; the pdb step's config uses the same name
+  pdb:
+    in:
+      output_pdb_path: !ii protein_models.pdb
+      config: !* pdb_id_str # same name as the !& tag on config_tag_pdb's output above
+      #config: !ii {pdb_code: 1aki} # from tutorial
+      #config: !ii {pdb_code: 1enh} # "Structural Studies of the Engrailed Homeodomain" https://pubs.acs.org/doi/10.1021/acs.jpcb.8b02144
+
+      # Chignolin model mini-protein
+      #config: !ii {pdb_code: 1uao} # "10 residue folded peptide designed by segment statistics" http://dx.doi.org/10.1016/j.str.2004.05.022
+
+      # Trp-cage model mini-protein (use 1l2y; 2m7d 2m7c 6d37 have chainbreaks and 6d37 is a hexamer)
+      #config: !ii {pdb_code: 1l2y} # "Designing a 20-residue protein." http://dx.doi.org/10.1038/nsb798
+      #config: !ii {pdb_code: 2m7d} # "Folding Dynamics and Pathways of the Trp-Cage Miniproteins" https://doi.org/10.1021/bi501021r
+      #config: !ii {pdb_code: 2m7c} # "Circular Permutation of the Trp-cage: Fold Rescue upon Addition of a Hydrophobic Staple" http://dx.doi.org/10.1039/C3RA43674H
+      #config: !ii {pdb_code: 6d37} # "Trp-cage tr16b R16Nva : Elimination of pH Dependent Interactions" https://doi.org/10.1002/bip.23260
+
+      # Other model mini-proteins
+      #config: !ii {pdb_code: 5kwp} # "Accurate de novo design of hyperstable constrained peptides." http://dx.doi.org/10.1038/nature19791
+      #config: !ii {pdb_code: 6b17} # "Design of a short thermally stable alpha-helix embedded in a macrocycle" http://dx.doi.org/10.1002/cbic.201800026
+      #config: !ii {pdb_code: 1bzv} # "The solution structure of a superpotent B-chain-shortened single-replacement insulin analogue." http://dx.doi.org/10.1110/ps.8.3.499
+
+      # Trypsin
+      #config: !ii {pdb_code: 1trn} # Dimer. This does not download residue 151, but residue 151 is in the pdb file if you download it using a web browser! ???
+      #config: !ii {pdb_code: 1ntp}
+      #config: !ii {pdb_code: 1bty} # "IndexError: list index out of range" in str_check_add_hydrogens
+  extract_model:
+    in:
+      config: !ii
+        models: [1] # this step is labelled 'Extract First Model' under wic: below
+    # out:
+    # - output_structure_path: !& protein.pdb
+  python_script:
+    in:
+      script: !ii ../scripts/atomselect.py # NOTE(review): relative path; confirm it resolves from pre-process/download-data/
+      dockerPull: !ii jakefennick/atomselect
+      # Remove any unknown residues that weren't already 'cleaned' in the `pdb` step above.
+      selection_string: !ii not resname UNK
+    out:
+    - output_pdb_path: !& protein.pdb # tagged protein.pdb; no step in this file references the tag
+
+wic:
+  graphviz:
+    label: Molecular\nModeling
+  steps:
+    (1, config_tag_pdb): # keys are (index, step name); indices follow the order of entries under steps: above
+      wic:
+        graphviz:
+          label: 'Specify PDB Code'
+    (2, pdb):
+      wic:
+        graphviz:
+          label: 'Download and Clean\nPDB File'
+    (3, extract_model):
+      wic:
+        graphviz:
+          label: 'Extract First Model'
+    (4, python_script):
+      wic:
+        graphviz:
+          label: 'Remove Unknown Residues'
diff --git a/pre-process/download-data/download_smiles_ligand_db.wic b/pre-process/download-data/download_smiles_ligand_db.wic
new file mode 100644
index 00000000..16ad0497
--- /dev/null
+++ b/pre-process/download-data/download_smiles_ligand_db.wic
@@ -0,0 +1,52 @@
+inputs: # Workflow: download a spreadsheet, query it, and generate ligand conformers (see labels under wic: below).
+  path: # passed to wget_xlsx as its url
+    type: string
+  query:
+    type: string
+  max_row:
+    type: int
+  smiles_column:
+    type: string
+  binding_data_column:
+    type: string
+  convert_Kd_dG:
+    type: boolean
+  output_txt_path:
+    type: string
+
+outputs:
+  output_txt_path:
+    type: File
+    format: edam:format_2330
+    outputSource: download_smiles_ligand_db__step__2__generate_conformers/output_txt_path # NOTE(review): step 2's out: list below names only output_sdf_path; confirm output_txt_path is exposed by that step
+
+steps:
+  wget_xlsx:
+    in:
+      url: path # the workflow-level 'path' input
+  generate_conformers:
+    in:
+      #input_excel_path: # inferred
+      # query syntax: backtick-quote column names and single-quote string values, e.g. the commented example on the next line
+      query: query #"`Standard Type` == 'Kd' and `duplicate-type-classifier` == 'unique'"
+      max_row: max_row #1 #25 # Use 1 for CI
+      smiles_column: smiles_column #SMILES
+      binding_data_column: binding_data_column #Standard Value
+      convert_Kd_dG: convert_Kd_dG #'True'
+      output_txt_path: output_txt_path
+      output_sdf_path: !ii ligand_conformers.sdf
+    out:
+    - output_sdf_path: !& ligand_conformers.sdf # tagged ligand_conformers.sdf; no step in this file references the tag
+
+wic:
+  graphviz:
+    label: Download Smiles\nLigand Database
+  steps:
+    (1, wget_xlsx): # keys are (index, step name); indices follow the order of entries under steps: above
+      wic:
+        graphviz:
+          label: Download Excel File
+    (2, generate_conformers):
+      wic:
+        graphviz:
+          label: Query Spreadsheet\nGenerate Conformers