-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfix_gff_file.py
63 lines (48 loc) · 2.12 KB
/
fix_gff_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas
import re
# Feature types considered acceptable in the output.
# NOTE(review): this list is not referenced in the visible part of the script —
# confirm whether it is still needed.
accepted_feature_types = [
    'gene', 'CDS', 'mRNA', 'exon',
    'five_prime_UTR', 'three_prime_UTR',
    'rRNA', 'tRNA', 'ncRNA', 'tmRNA',
    'transcript',
    'mobile_genetic_element',
    'origin_of_replication',
    'promoter',
    'repeat_region',
]

# Names given to the nine tab-separated columns of the GFF file, in order.
gff_cols = [
    'seqid',
    'source',
    'type',
    'start',
    'end',
    'score',
    'strand',
    'phase',
    'attributes',
]

# Load the annotation table. The first line of the file is skipped and
# na_filter=False keeps empty fields as empty strings instead of NaN.
data = pandas.read_csv(
    'data/Schizosaccharomyces_pombe_all_chromosomes.gff3',
    sep='\t',
    names=gff_cols,
    skiprows=1,
    na_filter=False,
)
# Annotate pseudogenes
def format_regex(x):
    """Build a regex matching the attributes of every feature of one pseudogene.

    Args:
        x: Attributes string of a ``pseudogenic_transcript`` row. It must
            contain an ``ID=`` entry (the transcript) and a ``Parent=`` entry
            (the gene), each terminated by ``;`` or the end of the string.

    Returns:
        A pattern with no capturing groups that matches attribute strings
        containing ``ID=<gene>``, ``ID=<transcript>`` or ``Parent=<transcript>``
        as a complete ``;``-delimited entry, i.e. the gene itself, the
        transcript itself and any children of the transcript (exons and
        introns). The pattern is self-contained, so several of them can be
        joined with ``|``.

    Raises:
        ValueError: If ``x`` has no ``ID=`` or no ``Parent=`` entry.
    """
    id_match = re.search(r'(?:^|;)ID=([^;]+)', x)
    # Stop at the next ';' so attributes following Parent are not swallowed
    # into the gene ID.
    parent_match = re.search(r'(?:^|;)Parent=([^;]+)', x)
    if id_match is None or parent_match is None:
        raise ValueError(f'Cannot find ID and Parent in attributes: {x!r}')

    # IDs may contain regex metacharacters (e.g. '.'), which must be matched
    # literally; otherwise 'A.1' would also match 'Ax1'.
    transcript_id = re.escape(id_match.group(1))
    gene_id = re.escape(parent_match.group(1))

    # Non-capturing groups only: pandas' str.contains warns when the pattern
    # has capturing groups. The (?:^|;) / (?:;|$) anchors make sure a whole
    # attribute entry is matched, not a suffix or prefix of another one.
    return (
        f'(?:^|;)(?:ID=(?:{gene_id}|{transcript_id})|Parent={transcript_id})(?:;|$)'
    )
# One alternative per pseudogenic transcript; each alternative matches the
# gene, the transcript and the transcript's children.
pseudogene_regex = '|'.join(
    data.loc[data.type == 'pseudogenic_transcript', 'attributes'].apply(format_regex)
)
if pseudogene_regex:
    pseudo = data.attributes.str.contains(pseudogene_regex, regex=True)
else:
    # Without any pseudogenic transcript the joined pattern is '' and an empty
    # pattern matches every row, which would tag the whole file as pseudogene.
    pseudo = pandas.Series(False, index=data.index)
#TODO: change the qualifier type
data.loc[pseudo, 'attributes'] = data.loc[pseudo, 'attributes'] + ';pseudogene=unknown'

# Rename feature types according to the mapping table; types that are not
# listed in the table are kept unchanged.
feature_mappings = pandas.read_csv('mappings/mappings.tsv', sep='\t', na_filter=False)
mappings_dict = dict(zip(feature_mappings.feature, feature_mappings.replace_by))
data['type'] = data['type'].map(lambda feature_type: mappings_dict.get(feature_type, feature_type))

# Drop the lines without type
# (presumably the types that the mapping table replaces by an empty string — verify)
data = data.loc[data.type != '', :].copy()

# Systematic IDs that are accepted as locus tags.
valid_systematic_ids = set(
    pandas.read_csv('data/gene_IDs_names.tsv', sep='\t', na_filter=False, skiprows=1)['gene_systematic_id']
)
# Set locus_tag qualifier
def get_locus_tag(x, valid_ids=None):
    """Return the systematic ID (locus tag) for a feature's attributes string.

    Two candidates are tried, in order, and the first one that is a valid
    systematic ID is returned:

    1. the full value of the leading ``ID=`` entry;
    2. the part of that value up to (not including) its second ``.``, which
       strips suffixes such as ``.1`` or ``.1:exon:1``.

    Args:
        x: Attributes string; the ``ID=`` entry must be at the very start.
        valid_ids: Optional container of accepted systematic IDs. Defaults to
            the module-level ``valid_systematic_ids``.

    Returns:
        The matching systematic ID, or ``None`` (after printing a message)
        when neither candidate is valid.
    """
    if valid_ids is None:
        # Looked up at call time so the module-level set may be defined after
        # this function.
        valid_ids = valid_systematic_ids

    # Raw strings: '\.' in a normal string literal is an invalid escape sequence.
    match = re.match(r'ID=(.+?)(?:;|$)', x)
    if match and match.group(1) in valid_ids:
        return match.group(1)

    match = re.match(r'ID=(.+?\..+?)(?=\.)', x)
    if match and match.group(1) in valid_ids:
        return match.group(1)

    print(f'Cannot assign locus to {x}')
    return None
# Append the locus_tag qualifier BEFORE writing the file; updating the
# attributes after to_csv would leave the output without locus tags.
locus_tags = data['attributes'].apply(get_locus_tag)
# get_locus_tag returns None when no locus can be assigned; leave those rows
# untouched instead of appending the literal text 'None'.
has_locus = locus_tags.notna()
# GFF3 attributes are tag=value pairs, so the value is written under the
# locus_tag key rather than as a bare value.
data.loc[has_locus, 'attributes'] = (
    data.loc[has_locus, 'attributes'] + ';locus_tag=' + locus_tags[has_locus]
)

# Write the fixed table as a headerless, tab-separated file.
data.to_csv('data/fixed.gff3', sep='\t', index=False, header=False)