-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_mpnn_sequences.py
82 lines (62 loc) · 2.61 KB
/
process_mpnn_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/software/conda/envs/pyrosetta/
import os, sys
import numpy as np
import glob
import shutil
import argparse
import string
import random
import re
from pathlib import Path
##########################################################################################
##########################################################################################
# Command-line interface: the only option is the number of chain copies that
# each designed sequence is expanded to in the output FASTA.
parser = argparse.ArgumentParser(description='This script parses MPNN output sequences and concatenates them into an AF2 parsable fasta')
# Parsed as int so that a non-numeric value is rejected by argparse with a
# usage message instead of failing later with a traceback during parsing.
parser.add_argument("--number_of_chains", type=int, required=True, help="number of chains")
# With no arguments at all, print the help text rather than a bare
# "argument required" error.
args = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
number_of_chains = args.number_of_chains
##########################################################################################
# Parse an MPNN .fa output file and return a dict mapping each record name to its sequence
def _score_tag(header, fasta):
    """Return the score field of a FASTA header line, without a trailing comma."""
    fields = header.split()
    # Prefer the real "score=<value>" field so that a design name that happens
    # to contain "score" is not mistaken for it; otherwise fall back to the
    # first field containing "score".
    exact = [field for field in fields if field.startswith("score=")]
    loose = [field for field in fields if "score" in field]
    candidates = exact or loose
    if not candidates:
        raise ValueError(
            "no score field in header {!r} of {}".format(header, fasta)
        )
    return candidates[0].rstrip(",")


def parse_fasta(fasta, number_of_chains, linker="U" * 32):
    """Parse a FASTA file and build linker-joined repeats of each first chain.

    Every record's sequence is split on "/" and only the first chain is kept.
    That chain is repeated ``number_of_chains`` times with ``linker`` between
    consecutive copies.

    Args:
        fasta: Path of the FASTA file to read.
        number_of_chains: Number of copies of the first chain per output
            sequence; anything accepted by ``int()``.
        linker: String inserted between consecutive copies (32 "U"
            characters by default).

    Returns:
        dict mapping ``<file stem>_<record number>_<score tag>`` to the
        assembled sequence, in file order. The record number counts header
        lines starting at 1, and "=" in the score tag is replaced by "_".

    Raises:
        ValueError: If ``number_of_chains`` is not an integer, or a header
            line has no field containing "score".
        OSError: If ``fasta`` cannot be opened.
    """
    copies = int(number_of_chains)
    stem = Path(fasta).stem

    # One (identifier, sequence lines) pair per header, so that identifiers
    # and sequences can never get out of step with each other.
    records = []
    with open(fasta, "r") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                tag = _score_tag(line, fasta).replace("=", "_")
                identifier = stem + "_" + str(len(records) + 1) + "_" + tag
                records.append((identifier, []))
            elif records:
                records[-1][1].append(line)
            # Text before the first header belongs to no record and is ignored.

    dict_id_seq = {}
    for identifier, pieces in records:
        if not pieces:
            # A header without any sequence has nothing to emit.
            continue
        # Wrapped sequences are re-joined before the chains are separated.
        first_chain = "".join(pieces).split("/")[0]
        dict_id_seq[identifier] = (copies - 1) * (first_chain + linker) + first_chain
    return dict_id_seq
#############################################
#############################################
def main(seqs_dir="output/temp_0.1/seqs", output_fasta="sequences_to_fold.fasta"):
    """Gather the parsed sequences for every PDB in the current directory.

    For each ``*.pdb`` file in the working directory, the FASTA file with the
    same base name in ``seqs_dir`` is parsed with ``parse_fasta`` using the
    module-level ``number_of_chains``. All resulting records are written to
    ``output_fasta``, one header line and one sequence line per record.

    Args:
        seqs_dir: Directory holding the per-PDB ``.fa`` files.
        output_fasta: Path of the combined FASTA file to write.

    Raises:
        OSError: If a PDB has no matching ``.fa`` file or the output cannot
            be written.
    """
    # Sorted so the record order in the output is reproducible; glob returns
    # matches in arbitrary order.
    pdb_files = sorted(glob.glob("*.pdb"))
    print("About to process all these pdbs sequences: " + str(pdb_files))

    full_dict = {}
    for pdb in pdb_files:
        print("Now working on: " + str(pdb))
        # The FASTA file shares the PDB's base name.
        path_to_fasta = os.path.join(seqs_dir, os.path.splitext(pdb)[0] + ".fa")
        parsed_seqs = parse_fasta(path_to_fasta, number_of_chains)
        print(parsed_seqs)
        full_dict.update(parsed_seqs)

    # The context manager closes the file even if a write fails.
    with open(output_fasta, "w") as out:
        out.writelines(
            ">" + str(name) + "\n" + str(seq) + "\n"
            for name, seq in full_dict.items()
        )
    print("All done!")
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()