-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2fce4e8
commit 2093dc4
Showing
14 changed files
with
385 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import sys | ||
|
||
print("Lecture du fichier {}".format(sys.argv[1])) | ||
f = open(sys.argv[1], "r") | ||
sam1 = f.readlines() | ||
f.close() | ||
|
||
for i in range(0, len(sam1)): | ||
if sam1[i][0] != "@": | ||
break | ||
sam1 = sam1[i:] | ||
print("Nombre d'alignements dans ce fichier : {}".format(len(sam1))) | ||
print("Lecture du fichier {}".format(sys.argv[2])) | ||
f = open(sys.argv[2], "r") | ||
sam2 = f.readlines() | ||
f.close() | ||
for i in range(0, len(sam2)): | ||
if sam2[i][0] != "@": | ||
break | ||
sam2 = sam2[i:] | ||
print("Nombre d'alignements dans ce fichier : {}".format(len(sam2))) | ||
print("Comparaison des deux SAM") | ||
sam1.sort() | ||
sam2.sort() | ||
meme_chr = 0 | ||
exact = 0 | ||
aprox = 0 | ||
j = 0 | ||
i = 0 | ||
diff = 0 | ||
output = open("liste_reads_diff", "w") | ||
while 1: | ||
if i >= len(sam1) - 1 or j >= len(sam2) - 1: | ||
break | ||
if sam1[i].split()[0] == sam2[j].split()[0]: | ||
if sam1[i].split()[2] == sam2[j].split()[2]: | ||
meme_chr += 1 | ||
else: | ||
diff += 1 | ||
output.write(sam1[i].split()[0] + "\n") | ||
if sam1[i].split()[3] == sam2[j].split()[3]: | ||
exact += 1 | ||
elif abs(int(sam1[i].split()[3]) - int(sam2[j].split()[3])) < 5000: | ||
aprox += 1 | ||
j += 1 | ||
i += 1 | ||
elif sam1[i].split()[0] > sam2[j].split()[0]: | ||
j += 1 | ||
elif sam1[i].split()[0] < sam2[j].split()[0]: | ||
i += 1 | ||
print("Nombre d'alignements differents : {}".format(diff)) | ||
print("Nombre d'alignements sur le meme chr : {}".format(meme_chr)) | ||
print("Nombre d'alignements exacts en commun : {}".format(exact)) | ||
print("Nombre d'alignements approximativement les memes : {}".format(aprox)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import sys | ||
|
||
f = open(sys.argv[1], "r") | ||
sam = f.readlines() | ||
|
||
D = {} | ||
for aln in sam: | ||
if aln[0] == "@": | ||
continue | ||
aln = aln.split() | ||
D[aln[0]] = 1 | ||
print("Nombre de reads alignes : {}".format(len(D.keys()))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import sys | ||
from Bio import SeqIO | ||
|
||
#usage : python comptage_transcrits.py aln.sam ref.fa | ||
|
||
f = open(sys.argv[1], "r") | ||
sam = f.readlines() | ||
f.close() | ||
|
||
genes = {} | ||
for ref in SeqIO.parse(sys.argv[2], "fasta"): | ||
genes[ref.description.split()[0]] = ref.description.split()[3].split(":")[1] | ||
|
||
output = open("liste_reads_genes_transcrits.txt", "w") | ||
D = {} | ||
G = {} | ||
for aln in sam: | ||
if aln[0] == "@": | ||
continue | ||
aln = aln.split() | ||
D[aln[2]] = 1 | ||
G[genes[aln[2]]] = 1 | ||
output.write(aln[0] + "\t" + genes[aln[2]] + "\t" + aln[2] + "\n") | ||
print("Nombre de transcrits touches : {}".format(len(D.keys()))) | ||
print("Nombre de genes touches : {}".format(len(G.keys()))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import sys | ||
|
||
#usage: python correspondance.py aln_megablast.sam | ||
|
||
f = open(sys.argv[1], "r") | ||
sam = f.readlines() | ||
f.close() | ||
|
||
output = open(sys.argv[1][0:-4] + "_names.sam", "w") | ||
ref = [] | ||
for aln in sam: | ||
if aln[0:3] == "@HD" or aln[0:3] == "@PG": | ||
output.write(aln) | ||
continue | ||
if aln[0:3] == "@SQ": | ||
output.write(aln) | ||
ref.append(aln.split()[1][3:]) | ||
continue | ||
tmp = aln.split() | ||
tmp[0] = ref[int(tmp[0].split("_")[1]) - 1] | ||
output.write("\t".join(tmp) + "\n") | ||
output.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#-*- coding: utf-8 -*- | ||
import sys, time | ||
from Bio import SeqIO | ||
|
||
#usage : python filtre_transcript.py ref_transcript.fa | ||
|
||
start_time = time.time() | ||
output = open("transcript.fa", "w") | ||
compt = 0 | ||
for record in SeqIO.parse(sys.argv[1], "fasta"): | ||
if len(record.seq) > 200: | ||
output.write(record.format("fasta")) | ||
else: | ||
compt += 1 | ||
print("Done. Total time in sec : " + str(time.time() - start_time)) | ||
print("On a retiré " + str(compt) + " séquences < 200 nucléotides") | ||
output.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
|
||
import sys | ||
|
||
#usage : python mean_evalue.py aln.sam | ||
|
||
f = open(sys.argv[1], "r") | ||
sam = f.readlines() | ||
f.close() | ||
|
||
evalue = 0 | ||
nb_aln = 0 | ||
max_evalue = 0.0 | ||
min_evalue = 0.0 | ||
for aln in sam: | ||
if aln[0] == "@": | ||
continue | ||
nb_aln += 1 | ||
aln = aln.split() | ||
for i in range(0, len(aln)): | ||
if aln[i].split(":")[0] == "EV": | ||
evalue += float(aln[i].split(":")[2]) | ||
if float(aln[i].split(":")[2]) > max_evalue: | ||
max_evalue = float(aln[i].split(":")[2]) | ||
if float(aln[i].split(":")[2]) < min_evalue: | ||
min_evalue = float(aln[i].split(":")[2]) | ||
print("evalue moyenne : {}".format(evalue/float(nb_aln))) | ||
print("evalue min : {}, evalue max : {}".format(min_evalue, max_evalue)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import sys | ||
|
||
c = 0 | ||
output = open(sys.argv[1] + "_id" + sys.argv[2] + ".sam", "w") | ||
f = open(sys.argv[1], "r") | ||
alns = f.readlines() | ||
for aln in alns: | ||
if aln[0] == "@": | ||
output.write(aln) | ||
else: | ||
tmp = aln.split() | ||
for i in range(0, len(tmp)): | ||
if tmp[i].split(":")[0] == "PI" and float(tmp[i].split(":")[2]) >= float(sys.argv[2]): | ||
output.write(aln) | ||
output.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import sys | ||
|
||
#usage: python parse_sam.py to_parse.sam liste_reads.txt output_parsed.sam | ||
|
||
f = open(sys.argv[1], "r") | ||
sam = f.readlines() | ||
f.close() | ||
f = open(sys.argv[2], "r") | ||
liste = f.readlines() | ||
f.close() | ||
for i in range(0, len(liste)): | ||
liste[i] = liste[i][:-1] | ||
output = open(sys.argv[3], "w") | ||
for aln in sam: | ||
if aln[0] == "@": | ||
output.write(aln) | ||
else: | ||
tmp = aln.split() | ||
if tmp[0] in liste: | ||
output.write(aln) | ||
output.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import sys, re | ||
|
||
#python parse_evalue.py aln.sam taille | ||
|
||
f = open(sys.argv[1], "r") | ||
alns = f.readlines() | ||
output = open(sys.argv[1][0:-4] + "_size" + sys.argv[2] + ".sam", "w") | ||
for aln in alns: | ||
if aln[0] == "@": | ||
output.write(aln) | ||
continue | ||
tmp = aln.split() | ||
if len(tmp[9]) > int(sys.argv[2]): | ||
output.write(aln) | ||
f.close() | ||
output.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import sys | ||
|
||
#usage ; python qualite_graph.py aln.sam liste_reads.txt | ||
|
||
f = open(sys.argv[1], "r") | ||
sam = f.readlines() | ||
f.close() | ||
|
||
relations = {} | ||
g = open(sys.argv[2], "r") | ||
liste = g.readlines() | ||
g.close() | ||
|
||
for r in liste: | ||
if r.split()[0] in relations: | ||
relations[r.split()[0]].append([r.split()[1]]) | ||
else: | ||
relations[r.split()[0]] = r.split()[1] | ||
|
||
nb_aln = 0 | ||
good = 0 | ||
for aln in sam: | ||
if aln[0] == "@": | ||
continue | ||
nb_aln += 1 | ||
if relations[aln.split()[0]] == relations[aln.split()[2]]: | ||
good += 1 | ||
print("sur {} relations : ".format(nb_aln)) | ||
print("on a {} relations qui sont bonnes".format(good)) | ||
print("donc {} qui sont mauvaises".format(nb_aln - good)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import sys | ||
|
||
# usage : python reads_comparaison.py graphmap.sam bwa.sam | ||
|
||
f = open(sys.argv[1], "r") | ||
graphmap = f.readlines() | ||
f.close() | ||
f = open(sys.argv[2], "r") | ||
bwa = f.readlines() | ||
f.close() | ||
g = [] | ||
b = [] | ||
for aln in graphmap: | ||
if aln[0] == "@": | ||
continue | ||
if aln.split()[0] not in g: | ||
g.append(aln.split()[0]) | ||
for aln in bwa: | ||
if aln[0] == "@": | ||
continue | ||
if aln.split()[0] not in b: | ||
b.append(aln.split()[0]) | ||
commun = 0 | ||
for i in g: | ||
if i in b: | ||
commun += 1 | ||
print("Nombre d'alignements total bwa : {}".format(len(b))) | ||
print("Nombre d'alignements total graphmap : {}".format(len(g))) | ||
print("Nombre d'alignements par les deux : {}".format(commun)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from Bio import SeqIO | ||
import sys | ||
|
||
#usage : python unmapped_reads.py aln.sam reads.fa | ||
|
||
f = open(sys.argv[1], "r") | ||
sam = f.readlines() | ||
output = open("unmapped_reads" + sys.argv[1][0:-4] + ".fa", "w") | ||
D = [] | ||
for aln in sam: | ||
if aln.split()[0][0] == "@": | ||
continue | ||
if aln.split()[0] not in D: | ||
D.append(aln.split()[0]) | ||
for rec in SeqIO.parse(sys.argv[2], "fasta"): | ||
if rec.name not in D: | ||
output.write(rec.format('fasta')) | ||
output.close() | ||
f.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import sys | ||
|
||
#usage : python verif_composantes.py liste_reads_genes verif_graph.txt | ||
|
||
f = open(sys.argv[1], "r") | ||
ref = f.readlines() | ||
f.close() | ||
g = open(sys.argv[2], "r") | ||
graph = g.readlines() | ||
g.close() | ||
|
||
D = {} # read : gene,transcrit | ||
for r in ref: | ||
D[r.split()[0]] = [r.split()[1], r.split()[2]] | ||
|
||
res = [] | ||
genes = [] | ||
transcrits = [] | ||
for comp in graph: | ||
comp = comp.split() | ||
tmp_g = [] | ||
tmp_t = [] | ||
for read in comp: | ||
tmp_g.append(D[read][0]) | ||
tmp_t.append(D[read][1]) | ||
res.append(len(set(tmp_g))) | ||
[genes.append(x) for x in list(set(tmp_g))] | ||
[transcrits.append(x) for x in list(set(tmp_t))] | ||
|
||
print("Nombre de transcrits : {}".format(len(set(transcrits)))) | ||
print("Nombre de genes : {}".format(len(set(genes)))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import sys | ||
from Bio import SeqIO | ||
|
||
#usage : python verif_graph.py verif_graph.txt aln.sam ref.fa chr | ||
|
||
composantes = 0 | ||
same_gene = 0 | ||
diff_gene = 0 | ||
|
||
D = {} | ||
f = open(sys.argv[2], "r") | ||
sam = f.readlines() | ||
f.close() | ||
for aln in sam: | ||
if aln[0] == "@": | ||
continue | ||
tmp = aln.split() | ||
D[tmp[0]] = tmp[2] | ||
#D : dictionnaire {read1:transcrit1, read2:transcrit2, ...} | ||
|
||
genes = [] | ||
nb_transcrits = [] | ||
|
||
reference = {} | ||
for ref in SeqIO.parse(sys.argv[3], "fasta"): | ||
tmp = ref.description | ||
if tmp.split()[2].split(":")[2] != sys.argv[4]: | ||
continue | ||
reference[tmp.split()[0]] = tmp.split()[3].split(":")[1] | ||
if reference[tmp.split()[0]] not in nb_transcrits: | ||
nb_transcrits.append(reference[tmp.split()[0]]) | ||
if tmp.split()[3].split(":")[1] not in genes: | ||
genes.append(tmp.split()[3].split(":")[1]) | ||
#reference : dictionaire {transcrit1 : gene, transcrit2:gene, ...} | ||
|
||
c = 0 | ||
transcrits = {} | ||
f = open(sys.argv[1], "r") | ||
graph = f.readlines() | ||
for connexes in graph: | ||
connexes = connexes.split() | ||
composantes += 1 | ||
flag = True | ||
ref = reference[D[connexes[0]]] | ||
for read in connexes: | ||
c += 1 | ||
if D[read] in transcrits: | ||
transcrits[D[read]] += 1 | ||
else: | ||
transcrits[D[read]] = 1 | ||
if reference[D[read]] != ref: | ||
print(len(connexes)) | ||
diff_gene += 1 | ||
flag = False | ||
break | ||
ref = reference[D[read]] | ||
if flag: | ||
same_gene += 1 | ||
f.close() | ||
print("Nombre de noeuds : {}".format(c)) | ||
print("Nombre de composantes du graphe : {}".format(composantes)) | ||
print("Nombre de composantes composees uniquement du meme gene : {}".format(same_gene)) | ||
print("Nombre de fausses composantes : {}".format(diff_gene)) | ||
print("Nombre de transcrits : {}, {}".format(len(transcrits), len(nb_transcrits))) | ||
print("Nombre de genes dans le graphe : {}".format(len(genes))) |