forked from wanyuac/BINF_toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrename_fasta_seqs.py
70 lines (53 loc) · 2.4 KB
/
rename_fasta_seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
"""
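Rename sequences in a FASTA file according to a tab-delimited mapping table (original ID -> new ID).
By default, sequences whose IDs are not listed in the mapping table are dropped; use --keep_all to retain them unchanged.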
Author: Yu Wan ([email protected], https://github.com/wanyuac)
Python version 2 and 3 compatible
License: GNU GPL 2.1
First edition: 11 Nov 2018, the latest revision: 14 Nov 2021.
Created and finished in Nara, Japan.
"""
from __future__ import print_function
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from argparse import ArgumentParser
def parse_arguments(argv=None):
    """
    Define and parse the command-line options of this script.

    :param argv: optional list of argument strings to parse instead of the
        process command line; when None (default), argparse reads sys.argv[1:].
    :return: an argparse.Namespace with attributes fasta, mapping, out,
        keep_all and simple.
    """
    parser = ArgumentParser(description="Read options and arguments")
    parser.add_argument(
        "--fasta", "-f", dest="fasta", type=str, required=True,
        help="A FASTA file whose sequences will be renamed.",
    )
    parser.add_argument(
        "--mapping", "-m", dest="mapping", type=str, required=True,
        help="A tab-delimited file mapping original sequence IDs to new IDs.",
    )
    parser.add_argument(
        "--out", "-o", dest="out", type=str, required=False, default="./renamed.fasta",
        help="Name and path for output FASTA file.",
    )
    parser.add_argument(
        "--keep_all", "-k", dest="keep_all", action="store_true", required=False,
        help="Set to keep all sequences when some IDs are not found in the rename table.",
    )
    parser.add_argument(
        "--simple", "-s", dest="simple", action="store_true", required=False,
        help="Drop original sequence names to make simple headers.",
    )
    # Passing None makes argparse fall back to sys.argv[1:], so existing
    # zero-argument callers behave exactly as before.
    return parser.parse_args(argv)
def main():
    """
    Rename the sequences of the input FASTA file and write them to the output file.

    Sequences whose IDs appear in the mapping table get the new ID (and, with
    --simple, an empty description so the header holds the new ID only).
    Sequences absent from the table are written unchanged when --keep_all is
    set and are dropped otherwise.
    """
    args = parse_arguments()
    mapping = import_mapping_table(args.mapping)
    drop_prev_name = args.simple

    # Context managers guarantee that both files are closed even when parsing
    # or writing raises an exception part-way through.
    with open(args.fasta, "r") as in_fasta, open(args.out, "w") as out:
        for seq in SeqIO.parse(in_fasta, "fasta"):  # read records from the input FASTA file
            # Test membership on the dict itself: constant-time per record
            # instead of scanning a list of all keys.
            if seq.id in mapping:
                if drop_prev_name:
                    seq.description = ""
                seq.id = mapping[seq.id]
            elif not args.keep_all:
                continue  # unmapped sequence and --keep_all not set: drop it

            # The formatted record already ends with a newline, so write it
            # verbatim; print() would append a second newline and leave a
            # blank line between records.
            out.write(seq.format("fasta"))
def import_mapping_table(rename):
    """
    Read the tab-delimited table for renaming sequences.

    Every non-blank line must hold exactly two tab-separated fields: the
    original sequence ID followed by the new ID. Blank lines are skipped.
    When an original ID occurs more than once, the last occurrence wins.

    :param rename: path to the tab-delimited mapping file.
    :return: a dictionary {original ID: new ID}.
    :raises ValueError: when a non-blank line does not have exactly two fields.
    """
    r = {}
    with open(rename, "r") as f:
        # Stream the file line by line rather than loading it whole.
        for line_no, raw_line in enumerate(f, 1):
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue  # tolerate empty lines, e.g. trailing blank lines
            fields = line.split("\t")
            if len(fields) != 2:
                # Same exception type as a failed tuple unpacking, but with
                # the location of the offending line.
                raise ValueError(
                    "Line {} of {} has {} tab-delimited field(s); expected 2.".format(
                        line_no, rename, len(fields)
                    )
                )
            old_id, new_id = fields
            r[old_id] = new_id
    return r
# Run the renaming workflow only when this file is executed as a script,
# not when it is imported as a module.
if "__main__" == __name__:
    main()