forked from adlape95/Spaghetti
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerfePAF.py
157 lines (122 loc) · 3.74 KB
/
merfePAF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Script for merging PAF files coming from the alignment of ONT 16S sequences
to GreenGenes Database.
Result: summary (OTU table-like) of reads assigned to each TaxID in each
sample.
@author: Adriel Latorre-Pérez
@company: Darwin Bioprospecting Excellence S.L.
@date: 15/10/2020
"""
import sys, os
from argparse import ArgumentParser # Para gestionar argumentos
def Arguments():
    """Parse the command-line arguments of the script.

    Returns
    -------
    argparse.Namespace
        Parsed arguments. ``dir`` holds the value given to ``-i/--input``.

    Raises
    ------
    SystemExit
        Raised by argparse itself: status 0 after ``-h/--help``, status 2
        when the arguments are missing or invalid.
    """
    parser = ArgumentParser(
        description=(
            "Script for merging PAF files coming from the alignment of "
            "ONT 16S sequences to GreenGenes Database. "
            "Result: summary (OTU table-like) of reads assigned to each "
            "TaxID in each sample."
        )
    )
    parser.add_argument('-i', '--input', dest='dir',
                        action='store', required=True,
                        help='Path to dir containing PAF files')

    # Process the arguments.
    # argparse signals both "--help" and usage errors by raising SystemExit.
    # Only that exception is intercepted, the hint is shown on real errors
    # only, and the exception is re-raised so the process keeps argparse's
    # exit status instead of reporting success.
    try:
        return parser.parse_args()
    except SystemExit as exc:
        if exc.code not in (0, None):
            # stderr: stdout is reserved for the summary table.
            print('Please, include the required arguments.', file=sys.stderr)
        raise
def taxReader(pafRoutes):
    """
    Given a list of PAF files, it reads the files and creates a dictionary
    with all the taxIDs detected.

    Parameters
    ----------
    pafRoutes : LIST of STRING
        Paths to the PAF files. The number of paths fixes the length of
        every list stored in the returned dictionary.

    Returns
    -------
    A dictionary:
        Key: taxID (6th tab-separated field of each PAF line)
        Value: list of 0s. Length list = number of PAF files in pafRoutes
    """
    taxDic = {}
    nFiles = len(pafRoutes)
    for route in pafRoutes:
        # The context manager closes the file even if reading fails.
        with open(route) as paf:
            for line in paf:
                fields = line.strip().split("\t")
                if len(fields) < 6:
                    # Blank or truncated line: there is no taxID column.
                    continue
                taxID = fields[5]
                if taxID not in taxDic:
                    # A fresh list per taxID so counts are never shared.
                    taxDic[taxID] = [0] * nFiles
    return taxDic
def taxSummary(pafRoutes, taxDic):
    """
    Given the paths to the PAF files and the dictionary generated by
    taxReader, taxSummary generates a summary of the number of hits for each
    taxID in each sample.

    Parameters
    ----------
    pafRoutes : LIST of STRING
        Paths to the PAF files. The position of a path in the list is the
        position of its counts in every list of taxDic.
    taxDic : DICTIONARY
        Output of taxReader (taxID -> list of counts, one slot per file).
        It is updated in place.

    Returns
    -------
    The same dictionary object, with the counts filled in.
    """
    nFiles = len(pafRoutes)
    for n, route in enumerate(pafRoutes):
        # The context manager closes the file even if reading fails.
        with open(route) as paf:
            for line in paf:
                fields = line.strip().split("\t")
                if len(fields) < 6:
                    # Blank or truncated line: there is no taxID column.
                    continue
                taxID = fields[5]
                if taxID not in taxDic:
                    # taxID was not pre-registered: start it at zero hits
                    # instead of failing with KeyError.
                    taxDic[taxID] = [0] * nFiles
                taxDic[taxID][n] += 1
    return taxDic
def PAFmerger(dir):
    """
    Function for merging and summarizing different PAF files coming from
    aligning 16S files (different barcodes) to a Database with minimap.

    Parameters
    ----------
    dir : STRING
        Path to the folder containing the PAF files. A trailing "/" is
        optional.

    Returns
    -------
    header : LIST
        "#OTU ID" followed by one sample name per PAF file (the file name
        without its ".paf" extension).
    taxDic : DICTIONARY
        taxID -> list of hit counts, in the same order as the sample names
        in header.
    """
    suffix = ".paf"
    files = []
    header = ["#OTU ID"]
    # os.listdir returns names in arbitrary order; sorting makes the column
    # order of the final table reproducible between runs and machines.
    for name in sorted(os.listdir(dir)):
        # Match the extension only, so "x.paf.bak" or "x.paf.gz" are ignored.
        if name.endswith(suffix):
            files.append(os.path.join(dir, name))
            header.append(name[:-len(suffix)])
    taxDic = taxReader(files)
    return header, taxSummary(files, taxDic)
def summaryTable(header, taxDic):
    """
    Function for converting the output of PAFmerger into an OTU table-like
    CSV file.

    Parameters
    ----------
    header : LIST
        1st output from PAFmerger (column names).
    taxDic : DICTIONARY
        2nd output from PAFmerger (taxID -> list of counts).

    Returns
    -------
    None. Prints the table to stdout.
    """
    import csv  # local import: the block needs nothing new at file level

    # csv.writer quotes any field containing a comma or a quote, so unusual
    # taxIDs or sample names cannot break the column layout. "\n" keeps the
    # same line ending that print() produced.
    writer = csv.writer(sys.stdout, lineterminator="\n")
    writer.writerow(header)
    for taxID, counts in taxDic.items():
        writer.writerow([taxID, *counts])
if __name__ == "__main__":
    # Script entry point: parse the CLI, merge the PAF files found in the
    # input folder, then print the resulting table to stdout.
    cli = Arguments()
    columns, counts = PAFmerger(cli.dir)
    summaryTable(columns, counts)