-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path2_extract_reviewed.py
51 lines (40 loc) · 2.25 KB
/
2_extract_reviewed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
'''
Note: This script isn't used anymore. Use it only if merged data is required only from reviewed UniProtKB IDs.
Script to parse in-house restructured UniProt genome coordinates data to keep only the data with UniProt's reviewed status.
Simple parsing and comparison is done here, instead of using pandas, numpy, etc. Hence, it takes a bit longer time (~ 1 min) to complete.
Author: 'Mana'valan Gajapathy
'''
import csv
import os
from natsort import natsorted
input_bed_file = 'Download_data/merged_select_UniProt_hg19_restructured_type1.bed'
# output_filename = os.path.basename(input_bed_file).replace('.bed', '_reviewed_only.bed')
output_filename = input_bed_file.replace('.bed', '_reviewed_only.bed')
# reads file with uniprot IDs that are from human and has reviewed status
reviewed_ids = []
with open('settings_files/UniProt_IDs_human_reviewed.txt', 'Ur') as reviewed_uniprot_handle:
for line in reviewed_uniprot_handle:
line = line.replace('\n', '')
reviewed_ids.append(line.lower())
# reads bedfile and writes in output if their UniProt IDs have reviewed status
match = 0
in_input = 0
reviewed_only_list = []
with open(input_bed_file, 'Ur') as input_bed_handle:
input_bed_csv = csv.reader(input_bed_handle, delimiter='\t')
for row in input_bed_csv:
# uniprot_id = row[3].split(':')[0].lower() # splitting not required due to format change in Feb 2017 release
uniprot_id = row[3].lower()
if uniprot_id in reviewed_ids:
reviewed_only_list.append('\t'.join(row))
# reviewed_only_out_handle.write('\t'.join(row) + '\n')
# print count, uniprot_id
match += 1
in_input += 1
with open(output_filename, 'w') as reviewed_only_out_handle:
title_line = ['#chrom','start_pos','end_pos','uniprot_id','seq_feature','strand','thick_start','thick_end','rgb', 'blocks','block_size','block_start','annotation_id','description']
reviewed_only_out_handle.write('\t'.join(title_line) + '\n')
reviewed_only_out_handle.write('\n'.join(natsorted(reviewed_only_list))) # sorts bedfile data before writing outfile
print 'No. of annotations in input bedfile: %i\n ' \
'Total written in to output: %i\n' \
'Total excluded from output: %i' %(in_input, match, in_input - match)