This repository has been archived by the owner on Aug 5, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreformatting.py
126 lines (107 loc) · 5.33 KB
/
reformatting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# reformatting.py
# takes output of intronIC, get_gene_symbols, and ortholog_clusters and gets it
# into a well-organized SQL database
import re
import sys
import psycopg2
from psycopg2.extensions import quote_ident
from itertools import islice
# delete and re-create appropriate tables
def update_tables(genome, cur):
# replace whatever data exists for this genome assembly
cur.execute(f'DROP TABLE IF EXISTS {genome}')
cur.execute(f'CREATE TABLE {genome} (short_seq VARCHAR(33), tax_name \
VARCHAR(30), com_name VARCHAR(30), genome VARCHAR(15), score REAL, \
intron_class VARCHAR(13), intron_id TEXT PRIMARY KEY, chromosome \
VARCHAR(20), start INTEGER, stop INTEGER, length INTEGER, strand VARCHAR(1), \
intron_rank INTEGER, phase VARCHAR(1), tds VARCHAR(5), up_seq VARCHAR(50), \
branch_seq VARCHAR(50), down_seq VARCHAR(50), full_seq TEXT, gene_symbol TEXT,\
gene_id VARCHAR(20), trans_id VARCHAR(20))')
# make U12s tables if the database is empty
cur.execute('CREATE TABLE IF NOT EXISTS U12s (short_seq VARCHAR(33),\
tax_name VARCHAR(30), com_name VARCHAR(30), genome VARCHAR(15), score REAL,\
intron_class VARCHAR(13), intron_id TEXT PRIMARY KEY, chromosome \
VARCHAR(20), start INTEGER, stop INTEGER, length INTEGER, strand VARCHAR(1),\
intron_rank INTEGER, phase VARCHAR(1), tds VARCHAR(5), up_seq VARCHAR(50), \
branch_seq VARCHAR(50), down_seq VARCHAR(50), full_seq TEXT, gene_symbol TEXT,\
gene_id VARCHAR(20), trans_id VARCHAR(20))')
# remove all the U12 introns from this genome assembly before adding more
cur.execute('DELETE FROM U12s WHERE genome = %s', (genome,))
def insert_annotation(cur, genome, tax_name, com_name, symbols, batch):
# makes all of these insertions happen within a single transaction- v fast
cur.execute('BEGIN')
for line in batch:
fields = line.rstrip('\n').split('\t')
# these fields need to be re-arranged; name each element to reorder
(intron_id, short_seq, bad_genome, score, chromosome, start, stop,
strand, phase, rank, tds, up_seq, down_seq, branch_seq, trans_id,
gene_id, full_seq) = fields
# we've made bad_genome because we'll get <genome>_exon or <genome>_cds
# from fields, and we just want the genome assembly name in the website
# need to get intron_class using score and length using full_seq
intron_class = ''
if float(score) >= 0:
intron_class = "U12-Dependent"
else:
intron_class = "U2-Dependent"
length = len(full_seq)
# need to find the gene symbol using symbols and gene_id; we won't find
# a gene symbol for every gene_id
try:
gene_symbol = symbols[gene_id]
except KeyError:
gene_symbol = '.'
# insert all the information
cur.execute(f'INSERT INTO {genome} VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,\
%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (short_seq, tax_name, com_name,
genome, score, intron_class, intron_id, chromosome, start, stop, length,
strand, rank, phase, tds, up_seq, branch_seq, down_seq, full_seq, gene_symbol,
gene_id, trans_id))
if intron_class == "U12-Dependent":
cur.execute(f'INSERT INTO U12s VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,\
%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', (short_seq, tax_name, com_name,
genome, score, intron_class, intron_id, chromosome, start, stop, length,
strand, rank, phase, tds, up_seq, branch_seq, down_seq, full_seq, gene_symbol,
gene_id, trans_id))
cur.execute('COMMIT')
# get command-line arguments
if len(sys.argv) != 4:
sys.exit('Specify the genome assembly name, the taxonomic name (delimited \
by underscores), and the common name (also delimited by underscores)')
genome = sys.argv[1]
bad_tax_name = sys.argv[2]
bad_com_name = sys.argv[3]
# remove underscores from tax_name and com_name
tax_name = re.sub('_', ' ', bad_tax_name)
com_name = re.sub('_', ' ', bad_com_name)
# get connection to database
conn = psycopg2.connect(dbname = 'iaod', user = 'dcmoyer')
cur = conn.cursor()
# many of the genome assemblies have hyphens in them, so we have to escape them
escaped_genome = quote_ident(genome, cur)
# remove existing data for this genome assembly and/or create new tables for it
print(f'Updating SQL tables for {tax_name} ({genome}).')
update_tables(escaped_genome, cur)
# make dictionary of gene IDs and gene symbols; batch input file because it may
# get really long with some genomes
print(f'Reading gene symbols.')
symbols = {}
with open(f'info/{genome}_gene_symbols.tsv', 'r') as in_file:
for batch in iter(lambda: tuple(islice(in_file, 10000)), ()):
for line in batch:
id_symbol_pair = line.rstrip('\n').split('\t')
symbols[id_symbol_pair[0]] = id_symbol_pair[1]
# get all of the annotation information from intronIC
print(f'Reading intronIC output and creating annotation SQL table.')
with open(f'info/{genome}_info.tsv', 'r') as in_file:
i = 0
for batch in iter(lambda: tuple(islice(in_file, 10000)), ()):
i += 1
print(f'On batch {i}.')
insert_annotation(
cur, escaped_genome, tax_name, com_name, symbols, batch)
print('Creating full-text search index.')
index_name = quote_ident(f'{genome}_idx', cur)
cur.execute(f'CREATE INDEX {index_name} ON {escaped_genome} USING \
GIN(to_tsvector(\'english\', gene_id || trans_id || genome || gene_symbol || \
tax_name || com_name))')