This repository has been archived by the owner on Aug 5, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathortholog_sql.py
88 lines (77 loc) · 2.9 KB
/
ortholog_sql.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# ortholog_sql.py
# this needs to be a separate script so that it only gets done once instead of
# 25 times
import psycopg2
from itertools import islice
import re
def get_introns(cur, genome):
cur.execute(f'SELECT intron_id FROM "{genome}"')
introns = [re.sub('_(exon|cds)', '', intron) for
row in cur.fetchall() for intron in row]
return(introns)
def chunks(l, n):
for i in range(0, len(l), n):
yield l[i:i + n]
print(f'Preparing ortholog data.')
# get connection to database
conn = psycopg2.connect(dbname = 'iaod', user = 'dcmoyer')
cur = conn.cursor()
# replace existing data every time
cur.execute('DROP TABLE IF EXISTS orthologs')
cur.execute('CREATE TABLE orthologs (id SERIAL, cluster TEXT)')
cur.execute('DROP TABLE IF EXISTS orthologs_lookup')
cur.execute('CREATE TABLE orthologs_lookup (intron_id TEXT, clusters TEXT)')
# get list of all genomes
cur.execute("SELECT table_name FROM information_schema.tables WHERE table_schema = 'public'")
genomes = [genome for row in cur.fetchall() for genome in row]
# drop names of tables added by django
genomes = [
i for i in genomes
if not re.search('(orthologs|auth|django|u12s)', i)
]
# get list of all intron IDs and count how many introns are in each genome
genome_count_dict = dict()
introns = []
for genome in genomes:
one_genome = get_introns(cur, genome)
introns.extend(one_genome)
genome_count_dict[genome] = len(one_genome)
print(genome_count_dict)
# make dict where each key is an intron ID and each value is an empty list
cluster_dict = {}
for intron in introns:
cluster_dict[intron] = []
# read in all of the ortholog clusters to make the ortholog table
# also record which line numbers each intron appears on to make lookup table
print('Creating ortholog cluster table.')
with open('IAOD_clustered_introns.txt', 'r') as in_file:
batch_no = 0
row_no = 0
for batch in iter(lambda: tuple(islice(in_file, 10000)), ()):
batch_no += 1
print(f'On batch {batch_no}')
cur.execute('BEGIN')
for line in batch:
row_no += 1
# for all introns in this line, add this line number to their list
# of line numbers in the cluster_dict
line = line.rstrip('\n')
intron_cluster = line.split('\t')
for intron in intron_cluster:
try:
cluster_dict[intron].append(row_no)
except KeyError:
pass
cur.execute('INSERT INTO orthologs VALUES (DEFAULT, %s)', (line,))
cur.execute('COMMIT')
# use cluster_dict to create the lookup table
print('Creating ortholog lookup table.')
batch_no = 0
for chunk in chunks(introns, 10000):
batch_no += 1
print(f'On batch {batch_no}')
cur.execute('BEGIN')
for intron in chunk:
cur.execute('INSERT INTO orthologs_lookup VALUES (%s, %s)',
(intron, cluster_dict[intron]))
cur.execute('COMMIT')