-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathrandom_peptides_by_peptide_length.py
73 lines (59 loc) · 2.5 KB
/
random_peptides_by_peptide_length.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# -*- coding: utf-8 -*-
"""
@author: HXu8
"""
import re
import random
import numpy as np
import pandas as pd
from collections import Counter
import random_pick
import read_proteome_uniprot
# Only peptides made exclusively of the 20 standard amino acids are kept.
_STANDARD_AMINO_ACIDS = frozenset('ACDEFGHIKLMNPQRSTVWY')


def _sample_unique_peptides(proteome, accessions, lengths, weights,
                            target, excluded, max_attempts):
    """Draw up to *target* distinct random peptides from *proteome*.

    Each attempt picks a length (weighted by *weights*), a random protein
    and a random start position.  A candidate is rejected when the protein
    is shorter than the length, when it contains a non-standard residue,
    when it is in *excluded*, or when it was already drawn in this call.

    Returns a list of ``[accession, peptide]`` pairs; the list is shorter
    than *target* only if *max_attempts* is exhausted first.
    """
    picked = []
    seen = set()
    attempts = 0
    while len(picked) < target and attempts < max_attempts:
        attempts += 1
        length = random.choices(lengths, weights=weights)[0]
        accession = random.choice(accessions)
        protein = proteome[accession]
        if len(protein) < length:
            continue
        start = random.randint(0, len(protein) - length)
        pep = protein[start:start + length]
        if not _STANDARD_AMINO_ACIDS.issuperset(pep):
            continue
        if pep in excluded:
            print('In positive peptides')
            continue
        if pep in seen:
            continue
        seen.add(pep)
        picked.append([accession, pep])
    return picked


def random_peptides_by_peptide_length(
        proteome=None,
        iedb_csv="eluted ligands or binding affinity datasets",
        fold=10,
        lengths=(8, 9, 10, 11, 12, 13),
        max_attempts_per_peptide=1000):
    """Randomly generate negative peptides from the proteome, per allele.

    The training CSV is read with pandas; column 0 is used as the peptide
    and column 4 as the allele.  For every allele, ``fold`` times as many
    random peptides as the allele has training rows are drawn from the
    proteome, with peptide lengths sampled in proportion to how often each
    length in *lengths* occurs among that allele's training peptides.
    Peptides present anywhere in the training set are never returned, and
    peptides are unique within one allele.

    Args:
        proteome: Mapping of accession -> protein sequence.  When ``None``
            the proteome is loaded through ``read_proteome_uniprot``.
        iedb_csv: Path of the training dataset (positives only).  The
            default is a placeholder and must be replaced by a real path.
        fold: Number of negatives to generate per training row.
        lengths: Peptide lengths that may be generated.
        max_attempts_per_peptide: Upper bound on random draws per requested
            peptide; stops the sampling loop from spinning forever when
            the proteome cannot supply enough distinct peptides.

    Returns:
        A list of ``[allele, accession, peptide]`` entries.
    """
    if proteome is None:
        # ``read_proteome_uniprot`` is bound by a plain ``import`` at the top
        # of the file, so the name may be the module rather than the loader.
        # Prefer a same-named function inside it; otherwise call the name
        # itself, exactly as the original code did.
        # NOTE(review): confirm the module really exposes that function.
        load_proteome = getattr(read_proteome_uniprot,
                                'read_proteome_uniprot',
                                read_proteome_uniprot)
        proteome = load_proteome()

    iedb_df = pd.read_csv(iedb_csv, sep=',', skiprows=0, low_memory=False,
                          dtype=object)
    peptides = iedb_df.iloc[:, 0].tolist()
    alleles = iedb_df.iloc[:, 4].tolist()

    # A set gives O(1) rejection of candidates that are known positives.
    all_positive_peptide = set(peptides)

    # Group the training peptides by allele, keeping first-seen allele order.
    peptides_by_allele = {}
    for peptide, allele in zip(peptides, alleles):
        peptides_by_allele.setdefault(allele, []).append(peptide)

    lengths = list(lengths)
    accessions = list(proteome)  # built once instead of on every draw

    all_neg = []
    for allele, training_peptides in peptides_by_allele.items():
        print(allele)
        length_counts = Counter(len(p) for p in training_peptides)
        # Counter returns 0 for lengths that never occur, so no try/except.
        weights = [length_counts[k] for k in lengths]
        if not any(weights):
            # No training peptide has a supported length, so there is no
            # length distribution to sample from for this allele.
            print('No training peptides of supported length; skipped')
            continue

        target = fold * len(training_peptides)
        negatives = _sample_unique_peptides(
            proteome, accessions, lengths, weights, target,
            all_positive_peptide, target * max_attempts_per_peptide)
        if len(negatives) < target:
            print('Only %d of %d negative peptides could be generated'
                  % (len(negatives), target))
        for accession, pep in negatives:
            all_neg.append([allele, accession, pep])
    return all_neg
# Script entry point: generate the negative peptide set only when this file
# is executed directly, not when it is imported.
if __name__ == "__main__":
    all_neg = random_peptides_by_peptide_length()