-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathannotation1.py
executable file
·213 lines (186 loc) · 6.28 KB
/
annotation1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#! /usr/bin/env python
# -*- coding:Utf-8 -*-
#Author: Pier-Luc Plante
#License: GPL
"""
This script provide automatic annotation for small genomes and contig.
An internet connection, blastx and a protein DB are required.
To use:
annotation1 ORF.gff sequence.fasta nameOfOutput.gk
You can modify some variable like the DB path at the top of the script.
We do not provide any guarantee on the quality of the results.
"""
db="/pubseq/genbank/nr-20120303/nr";
blastxOutFile="ORF.blastx";
blastxOutFileSorted="ORF.blastx.sorted"
#These step are necessary:
#1.1: extraire les ORF et les mettre dans un fichier fasta.
#1.2: blastx
#2: parse blast results
#3: load web page from ncbi or if no results seems ok, leave blank or write something like "NEED TO BE DONE MANUALLY"
#4: get title
#5: assign title and id to gene
import sys, urllib, os
#A custom class to facilitate the manipulation of the information.
class Gene(object):
def __init__(self):
ident="";
note="";
comment="";
name="";
length=0;
pourcentageID=0;
alignementLength=0;
strand="";
left=0;
right=0;
if(len(sys.argv)!=4):
print __doc__
sys.exit(1)
listOfORF={};
#step 1: extract ORF and place them in a fasta file.
numberOfORF=0;
fasta=open("ORF.fasta",'w');
detect=0;
erreur=0;
tableauGene={}
for lines in open(sys.argv[1]):
if("#" in lines.split() and "Length" in lines.split()):
detect=1;
print("start to read Genes")
elif (detect==1):
try:
name=lines.split()[0];
left=int(lines.split()[2]);
right=int(lines.split()[3]);
length=int(lines.split()[4]);
strand=lines.split()[1];
sequence=open(sys.argv[2]);
erreur=0;
header=(">"+name+"|"+str(left)+"|"+str(right)+"\n");
fasta.write(header);
tableauGene[numberOfORF]=Gene();
tableauGene[numberOfORF].name=name;
tableauGene[numberOfORF].length=length;
tableauGene[numberOfORF].strand=strand;
tableauGene[numberOfORF].left=left;
tableauGene[numberOfORF].right=right;
tableauGene[numberOfORF].ident="";
tableauGene[numberOfORF].note="";
tableauGene[numberOfORF].comment="";
tableauGene[numberOfORF].length=0;
tableauGene[numberOfORF].pourcentageID=0;
numberOfORF=numberOfORF+1;
except:
print("The following line of the .gff file has not been treated because of an unhandled character:")
print(lines);
print("The program will now contiue.")
erreur=1;
actualPos=0
inSeq=0
#print(lines.split()[0]+"\t"+lines.split()[2])
#on va maintenant chercher la sequence dans le fasta. J'ai mis -1 dans les conditions
#pour le caractere de fin de ligne.
while (1 and erreur==0):
ch=sequence.readline();
#print(ch);
if (ch==""):
#print("Erreur");
sequence.close();
break;
elif (ch[0]==">"):
#print("Case >");
continue;
elif ((len(ch)-1+actualPos)<left and inSeq==0): #le égale va ici??
actualPos=len(ch)+actualPos-1;
#print("Actual Pos Increased"+"\t"+str(actualPos))
elif ((len(ch)-1+actualPos)>=left and inSeq==0):
ch=ch[left-actualPos-1:];
actualPos=left;
#print("in the seq!");
inSeq=1;
fasta.write(ch);
actualPos=actualPos+len(ch)-2;
elif (inSeq==1):
if(len(ch)-1+actualPos<=right):
actualPos=actualPos+len(ch)-1;
fasta.write(ch);
elif(len(ch)-1+actualPos>right):
ch=ch[0:right-actualPos]+"\n";
fasta.write(ch);
sequence.close();
#print("fin")
break;
fasta.close();
#step 2: BLASTX!!!! probably the longest step...multithread!
command="time blastx -query ORF.fasta -db "+db+" -outfmt 6 -query_gencode 11 -num_threads 10 >"+blastxOutFile;
print("The folowing command is now executed by the system. Its long...");
print(command);
os.system(command);
print("Done");
del command;
#step 3: parse blast results
#sort -k1n -k3nr -k4nr blastxOutFile
print("Now sorting the blast output")
command2="sort -k1n -k3nr -k4nr "+blastxOutFile+" >"+blastxOutFileSorted;
os.system(command2);
print("Done");
del command2;
#step 4
#For each ORF, we must find a name and a corresponding sequence.
#That's the part where we use the internet connection because we search the information on the NCBI WebSite.
#For this, we use the blastx output file and we consider that it is parsed correcltly (most relevant result first)
print("Now getting information from the NCI website")
currentPos=0;
fileBlastx=open(blastxOutFile, 'r')
lines=fileBlastx.readline();
while (1):
if(lines==""):
break
currentPos=int((lines.split()[0]).split("|")[0])-1; #minus 1 to acces the good position in the table;
if(float(lines.split()[2])>50.00): #The id% must be more than 50%.
tableauGene[currentPos].pourcentageID=float(lines.split()[2]);
if (float(lines.split()[3])>(tableauGene[currentPos].length*0.5/3)): #The length ratio must be over 50%.
tableauGene[currentPos].alignementLength=float(lines.split()[3]);
try:
entryNumber=(lines.split()[1]).split('|')[3];
url="http://www.ncbi.nlm.nih.gov/protein/"+entryNumber;
webPage=urllib.urlopen(url);
except:
entryNumber="NEED TO BE DONE MANUALLY";
else:
entryNumber="NEED TO BE DONE MANUALLY";
tableauGene[currentPos].ident=entryNumber;
if(webPage.readline()!="" or entryNumber!="NEED TO BE DONE MANUALLY"):
for lines in webPage:
if ("<title>" in lines):
tableauGene[currentPos].name=(lines.split(">")[1]).split("]")[0]+"]";
break;
#enf of for...
#end of if...
while(1):
lines=fileBlastx.readline();
if (lines=="" or (int((lines.split()[0]).split("|")[0])!=currentPos+1)):
break
#end of while.
#end of while.
#It is now time to create the .gbk file!
print("done")
print("Now sending information to the output file")
gbk=open(sys.argv[3], 'w')
for entry in tableauGene:
line="FT"+" "+"gene"+" ";
if tableauGene[entry].strand=="-":
line=line+"complement""("+str(tableauGene[entry].left)+".."+str(tableauGene[entry].right)+")"+"\n";
else:
line=line+str(tableauGene[entry].left)+".."+str(tableauGene[entry].right)+"\n"
gbk.write(line);
line="FT"+" "+"/note="+tableauGene[entry].name+"\n";
gbk.write(line);
line="FT"+" "+'''/id="'''+tableauGene[entry].ident+'''"'''+"\n";
gbk.write(line);
line="FT"+" "+'''/comment="'''+str(tableauGene[entry].pourcentageID)+'''"'''+"\n";
gbk.write(line);
gbk.close();
print("Annotation is finished, I recommand you to verify all the results.")
print("Have a good day!")