Skip to content

Commit

Permalink
added scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
AmbreThomas committed Aug 30, 2017
1 parent 2fce4e8 commit 2093dc4
Show file tree
Hide file tree
Showing 14 changed files with 385 additions and 0 deletions.
54 changes: 54 additions & 0 deletions compare_sam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import sys

print("Lecture du fichier {}".format(sys.argv[1]))
f = open(sys.argv[1], "r")
sam1 = f.readlines()
f.close()

for i in range(0, len(sam1)):
if sam1[i][0] != "@":
break
sam1 = sam1[i:]
print("Nombre d'alignements dans ce fichier : {}".format(len(sam1)))
print("Lecture du fichier {}".format(sys.argv[2]))
f = open(sys.argv[2], "r")
sam2 = f.readlines()
f.close()
for i in range(0, len(sam2)):
if sam2[i][0] != "@":
break
sam2 = sam2[i:]
print("Nombre d'alignements dans ce fichier : {}".format(len(sam2)))
print("Comparaison des deux SAM")
sam1.sort()
sam2.sort()
meme_chr = 0
exact = 0
aprox = 0
j = 0
i = 0
diff = 0
output = open("liste_reads_diff", "w")
while 1:
if i >= len(sam1) - 1 or j >= len(sam2) - 1:
break
if sam1[i].split()[0] == sam2[j].split()[0]:
if sam1[i].split()[2] == sam2[j].split()[2]:
meme_chr += 1
else:
diff += 1
output.write(sam1[i].split()[0] + "\n")
if sam1[i].split()[3] == sam2[j].split()[3]:
exact += 1
elif abs(int(sam1[i].split()[3]) - int(sam2[j].split()[3])) < 5000:
aprox += 1
j += 1
i += 1
elif sam1[i].split()[0] > sam2[j].split()[0]:
j += 1
elif sam1[i].split()[0] < sam2[j].split()[0]:
i += 1
print("Nombre d'alignements differents : {}".format(diff))
print("Nombre d'alignements sur le meme chr : {}".format(meme_chr))
print("Nombre d'alignements exacts en commun : {}".format(exact))
print("Nombre d'alignements approximativement les memes : {}".format(aprox))
12 changes: 12 additions & 0 deletions comptage_reads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import sys

f = open(sys.argv[1], "r")
sam = f.readlines()

D = {}
for aln in sam:
if aln[0] == "@":
continue
aln = aln.split()
D[aln[0]] = 1
print("Nombre de reads alignes : {}".format(len(D.keys())))
25 changes: 25 additions & 0 deletions comptage_transcrits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import sys
from Bio import SeqIO

#usage : python comptage_transcrits.py aln.sam ref.fa

f = open(sys.argv[1], "r")
sam = f.readlines()
f.close()

genes = {}
for ref in SeqIO.parse(sys.argv[2], "fasta"):
genes[ref.description.split()[0]] = ref.description.split()[3].split(":")[1]

output = open("liste_reads_genes_transcrits.txt", "w")
D = {}
G = {}
for aln in sam:
if aln[0] == "@":
continue
aln = aln.split()
D[aln[2]] = 1
G[genes[aln[2]]] = 1
output.write(aln[0] + "\t" + genes[aln[2]] + "\t" + aln[2] + "\n")
print("Nombre de transcrits touches : {}".format(len(D.keys())))
print("Nombre de genes touches : {}".format(len(G.keys())))
22 changes: 22 additions & 0 deletions correspondance_megablast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import sys

#usage: python correspondance.py aln_megablast.sam

f = open(sys.argv[1], "r")
sam = f.readlines()
f.close()

output = open(sys.argv[1][0:-4] + "_names.sam", "w")
ref = []
for aln in sam:
if aln[0:3] == "@HD" or aln[0:3] == "@PG":
output.write(aln)
continue
if aln[0:3] == "@SQ":
output.write(aln)
ref.append(aln.split()[1][3:])
continue
tmp = aln.split()
tmp[0] = ref[int(tmp[0].split("_")[1]) - 1]
output.write("\t".join(tmp) + "\n")
output.close()
17 changes: 17 additions & 0 deletions filtre_transcript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#-*- coding: utf-8 -*-
import sys, time
from Bio import SeqIO

#usage : python filtre_transcript.py ref_transcript.fa

start_time = time.time()
output = open("transcript.fa", "w")
compt = 0
for record in SeqIO.parse(sys.argv[1], "fasta"):
if len(record.seq) > 200:
output.write(record.format("fasta"))
else:
compt += 1
print("Done. Total time in sec : " + str(time.time() - start_time))
print("On a retiré " + str(compt) + " séquences < 200 nucléotides")
output.close()
29 changes: 29 additions & 0 deletions mean_evalue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

#usage : python mean_evalue.py aln.sam

f = open(sys.argv[1], "r")
sam = f.readlines()
f.close()

evalue = 0
nb_aln = 0
max_evalue = 0.0
min_evalue = 0.0
for aln in sam:
if aln[0] == "@":
continue
nb_aln += 1
aln = aln.split()
for i in range(0, len(aln)):
if aln[i].split(":")[0] == "EV":
evalue += float(aln[i].split(":")[2])
if float(aln[i].split(":")[2]) > max_evalue:
max_evalue = float(aln[i].split(":")[2])
if float(aln[i].split(":")[2]) < min_evalue:
min_evalue = float(aln[i].split(":")[2])
print("evalue moyenne : {}".format(evalue/float(nb_aln)))
print("evalue min : {}, evalue max : {}".format(min_evalue, max_evalue))
15 changes: 15 additions & 0 deletions parse_identity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import sys

c = 0
output = open(sys.argv[1] + "_id" + sys.argv[2] + ".sam", "w")
f = open(sys.argv[1], "r")
alns = f.readlines()
for aln in alns:
if aln[0] == "@":
output.write(aln)
else:
tmp = aln.split()
for i in range(0, len(tmp)):
if tmp[i].split(":")[0] == "PI" and float(tmp[i].split(":")[2]) >= float(sys.argv[2]):
output.write(aln)
output.close()
21 changes: 21 additions & 0 deletions parse_sam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import sys

#usage: python parse_sam.py to_parse.sam liste_reads.txt output_parsed.sam

f = open(sys.argv[1], "r")
sam = f.readlines()
f.close()
f = open(sys.argv[2], "r")
liste = f.readlines()
f.close()
for i in range(0, len(liste)):
liste[i] = liste[i][:-1]
output = open(sys.argv[3], "w")
for aln in sam:
if aln[0] == "@":
output.write(aln)
else:
tmp = aln.split()
if tmp[0] in liste:
output.write(aln)
output.close()
16 changes: 16 additions & 0 deletions parse_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import sys, re

#python parse_evalue.py aln.sam taille

f = open(sys.argv[1], "r")
alns = f.readlines()
output = open(sys.argv[1][0:-4] + "_size" + sys.argv[2] + ".sam", "w")
for aln in alns:
if aln[0] == "@":
output.write(aln)
continue
tmp = aln.split()
if len(tmp[9]) > int(sys.argv[2]):
output.write(aln)
f.close()
output.close()
30 changes: 30 additions & 0 deletions qualite_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import sys

#usage ; python qualite_graph.py aln.sam liste_reads.txt

f = open(sys.argv[1], "r")
sam = f.readlines()
f.close()

relations = {}
g = open(sys.argv[2], "r")
liste = g.readlines()
g.close()

for r in liste:
if r.split()[0] in relations:
relations[r.split()[0]].append([r.split()[1]])
else:
relations[r.split()[0]] = r.split()[1]

nb_aln = 0
good = 0
for aln in sam:
if aln[0] == "@":
continue
nb_aln += 1
if relations[aln.split()[0]] == relations[aln.split()[2]]:
good += 1
print("sur {} relations : ".format(nb_aln))
print("on a {} relations qui sont bonnes".format(good))
print("donc {} qui sont mauvaises".format(nb_aln - good))
29 changes: 29 additions & 0 deletions reads_comparaison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import sys

# usage : python reads_comparaison.py graphmap.sam bwa.sam

f = open(sys.argv[1], "r")
graphmap = f.readlines()
f.close()
f = open(sys.argv[2], "r")
bwa = f.readlines()
f.close()
g = []
b = []
for aln in graphmap:
if aln[0] == "@":
continue
if aln.split()[0] not in g:
g.append(aln.split()[0])
for aln in bwa:
if aln[0] == "@":
continue
if aln.split()[0] not in b:
b.append(aln.split()[0])
commun = 0
for i in g:
if i in b:
commun += 1
print("Nombre d'alignements total bwa : {}".format(len(b)))
print("Nombre d'alignements total graphmap : {}".format(len(g)))
print("Nombre d'alignements par les deux : {}".format(commun))
19 changes: 19 additions & 0 deletions unmapped_reads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from Bio import SeqIO
import sys

#usage : python unmapped_reads.py aln.sam reads.fa

f = open(sys.argv[1], "r")
sam = f.readlines()
output = open("unmapped_reads" + sys.argv[1][0:-4] + ".fa", "w")
D = []
for aln in sam:
if aln.split()[0][0] == "@":
continue
if aln.split()[0] not in D:
D.append(aln.split()[0])
for rec in SeqIO.parse(sys.argv[2], "fasta"):
if rec.name not in D:
output.write(rec.format('fasta'))
output.close()
f.close()
31 changes: 31 additions & 0 deletions verif_composantes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import sys

#usage : python verif_composantes.py liste_reads_genes verif_graph.txt

f = open(sys.argv[1], "r")
ref = f.readlines()
f.close()
g = open(sys.argv[2], "r")
graph = g.readlines()
g.close()

D = {} # read : gene,transcrit
for r in ref:
D[r.split()[0]] = [r.split()[1], r.split()[2]]

res = []
genes = []
transcrits = []
for comp in graph:
comp = comp.split()
tmp_g = []
tmp_t = []
for read in comp:
tmp_g.append(D[read][0])
tmp_t.append(D[read][1])
res.append(len(set(tmp_g)))
[genes.append(x) for x in list(set(tmp_g))]
[transcrits.append(x) for x in list(set(tmp_t))]

print("Nombre de transcrits : {}".format(len(set(transcrits))))
print("Nombre de genes : {}".format(len(set(genes))))
65 changes: 65 additions & 0 deletions verif_graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import sys
from Bio import SeqIO

#usage : python verif_graph.py verif_graph.txt aln.sam ref.fa chr

composantes = 0
same_gene = 0
diff_gene = 0

D = {}
f = open(sys.argv[2], "r")
sam = f.readlines()
f.close()
for aln in sam:
if aln[0] == "@":
continue
tmp = aln.split()
D[tmp[0]] = tmp[2]
#D : dictionnaire {read1:transcrit1, read2:transcrit2, ...}

genes = []
nb_transcrits = []

reference = {}
for ref in SeqIO.parse(sys.argv[3], "fasta"):
tmp = ref.description
if tmp.split()[2].split(":")[2] != sys.argv[4]:
continue
reference[tmp.split()[0]] = tmp.split()[3].split(":")[1]
if reference[tmp.split()[0]] not in nb_transcrits:
nb_transcrits.append(reference[tmp.split()[0]])
if tmp.split()[3].split(":")[1] not in genes:
genes.append(tmp.split()[3].split(":")[1])
#reference : dictionaire {transcrit1 : gene, transcrit2:gene, ...}

c = 0
transcrits = {}
f = open(sys.argv[1], "r")
graph = f.readlines()
for connexes in graph:
connexes = connexes.split()
composantes += 1
flag = True
ref = reference[D[connexes[0]]]
for read in connexes:
c += 1
if D[read] in transcrits:
transcrits[D[read]] += 1
else:
transcrits[D[read]] = 1
if reference[D[read]] != ref:
print(len(connexes))
diff_gene += 1
flag = False
break
ref = reference[D[read]]
if flag:
same_gene += 1
f.close()
print("Nombre de noeuds : {}".format(c))
print("Nombre de composantes du graphe : {}".format(composantes))
print("Nombre de composantes composees uniquement du meme gene : {}".format(same_gene))
print("Nombre de fausses composantes : {}".format(diff_gene))
print("Nombre de transcrits : {}, {}".format(len(transcrits), len(nb_transcrits)))
print("Nombre de genes dans le graphe : {}".format(len(genes)))

0 comments on commit 2093dc4

Please sign in to comment.