forked from indrops/indrops
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathannotate_transcriptome.py
30 lines (30 loc) · 2.04 KB
/
annotate_transcriptome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import re
in_genes="Homo_sapiens.GRCh38.82.with_tid.gtf"
# out_genes="Homo_sapiens.GRCh38.82.annotated.gtf"
out_genes="Homo_sapiens.GRCh38.82.annotated.gtf"
accepted_gene_biotypes_for_NA_transcripts = set(["IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene"])
with open(in_genes, 'r') as in_f, open(out_genes, 'w') as out_f:
for line in in_f:
chr_name = line.rstrip().split('\t')[0]
# Check the transcript_support level
# This should be faster than a regex
# We need to support the case where see these two types of annotations:
# transcript_support_level "1"
# transcript_support_level "1 (assigned to previous version X)"
# transcript_support_level "2" <- Clear example of a gene like this is NKX6.1
# transcript_support_level "2 (assigned to previous version X)"
line_valid_for_output = False
if 'transcript_support_level "1"' in line or 'transcript_support_level "1 ' in line or 'transcript_support_level "2"' in line or 'transcript_support_level "2 ' in line:
line_valid_for_output = True
elif 'transcript_support_level "NA' in line:
# Transcript Support Level Not Analysed. Pseudogenes, single exon transcripts, HLA, T-cell receptor and Ig transcripts are not analysed and therefore not given any of the TSL categories.
# Keep only a few ones annotated as "IG_V_gene","IG_J_gene","protein_coding","TR_J_gene","TR_D_gene","TR_V_gene","IG_C_gene","IG_D_gene","TR_C_gene"
gene_biotype = re.search(r'gene_biotype \"(.*?)\";', line)
if gene_biotype and gene_biotype.group(1) in accepted_gene_biotypes_for_NA_transcripts:
line_valid_for_output = True
if line_valid_for_output:
gene_name = re.search(r'gene_name \"(.*?)\";', line)
if gene_name:
gene_name = gene_name.group(1)
out_line = re.sub(r'(?<=transcript_id ")(.*?)(?=";)', r'\1|'+gene_name, line)
out_f.write(out_line)