-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathallele_qc.py
101 lines (81 loc) · 5.07 KB
/
allele_qc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Runs the main analysis pipeline (see readme). The output is stored in the file indicated
in --output, in addition, two extra output files are created, for the default output value:
Only the subset of alleles that needs fixing.
results/allele_results_errors.tsv
Only the subset of alleles that needs fixing, only the columns 'allele_description' and 'change_description_to'.
results/allele_results_errors_summarised.tsv
The extra columns created are described in the readme.md.
"""
from models import SyntaxRule, AllowedTypes
from refinement_functions import check_allele_description
from grammar import allowed_types_dict, composed_types_dict, aminoacid_grammar, nucleotide_grammar, disruption_grammar
import pickle
import pandas
import argparse
from common_autofix_functions import print_warnings
from genome_functions import handle_systematic_id_for_allele_qc
def empty_dict():
"""
Return the dictionary with error
"""
return {
'allele_parts': '',
'needs_fixing': False,
'change_description_to': '',
'rules_applied': '',
'pattern_error': '',
'invalid_error': '',
'sequence_error': '',
'change_type_to': ''
}
def check_fun(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides, syntax_rules_disruption, allowed_types):
systematic_id = handle_systematic_id_for_allele_qc(row['systematic_id'], row['allele_name'], genome)
if systematic_id is None:
return empty_dict() | {'needs_fixing': True, 'invalid_error': 'systematic_id not in genome'}
gene = genome[systematic_id]
if 'amino_acid' in row['allele_type'] or 'nonsense_mutation' == row['allele_type']:
if 'peptide' not in gene:
return empty_dict() | {'needs_fixing': True, 'invalid_error': 'peptide sequence missing'}
else:
return check_allele_description(row.allele_description, syntax_rules_aminoacids, row.allele_type, allowed_types, gene)
elif 'nucleotide' in row['allele_type']:
return check_allele_description(row.allele_description, syntax_rules_nucleotides, row.allele_type, allowed_types, gene)
elif 'disruption' == row['allele_type']:
# TODO: handle this better and refactor
if row['allele_description'] != '':
return check_allele_description(row.allele_description, syntax_rules_disruption, row.allele_type, allowed_types, gene)
# Special case where the description is empty
else:
out_dict = check_allele_description(row.allele_name, syntax_rules_disruption, row.allele_type, allowed_types, gene)
# The name matches the pattern
if out_dict['change_description_to'] != '':
return out_dict
else:
return empty_dict()
else:
return empty_dict()
if __name__ == '__main__':
class Formatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
pass
parser = argparse.ArgumentParser(description=__doc__, formatter_class=Formatter)
parser.add_argument('--genome', default='data/genome.pickle', help='input: genome dictionary built from contig files.')
parser.add_argument('--alleles', default='data/alleles.tsv', help='input allele dataset')
parser.add_argument('--output', default='results/allele_results.tsv', help='output file, also creates two extra files with the extension _errors.tsv and _errors_summarised.tsv')
args = parser.parse_args()
with open(args.genome, 'rb') as ins:
genome = pickle.load(ins)
allele_data = pandas.read_csv(args.alleles, delimiter='\t', na_filter=False)
syntax_rules_aminoacids = [SyntaxRule.parse_obj(r) for r in aminoacid_grammar]
syntax_rules_nucleotides = [SyntaxRule.parse_obj(r) for r in nucleotide_grammar]
syntax_rules_disruption = [SyntaxRule.parse_obj(r) for r in disruption_grammar]
allowed_types = AllowedTypes(allowed_types=allowed_types_dict, composed_types=composed_types_dict)
extra_cols = allele_data.apply(lambda row: check_fun(row, genome, syntax_rules_aminoacids, syntax_rules_nucleotides, syntax_rules_disruption, allowed_types), axis=1, result_type='expand')
output_data = pandas.concat([allele_data, extra_cols], axis=1)
column_order = ['systematic_id', 'gene_name', 'allele_id', 'allele_name', 'allele_description', 'allele_type', 'reference', 'allele_parts', 'needs_fixing', 'change_description_to', 'rules_applied', 'pattern_error', 'invalid_error', 'sequence_error', 'change_type_to']
output_data = output_data[column_order]
print_warnings(output_data[(output_data['needs_fixing'] == True) & (output_data['pattern_error'] == '') & (output_data['allele_type'].str.contains('nucleot') | output_data['allele_type'].str.contains('amino'))])
output_data.to_csv(args.output, sep='\t', index=False)
root_output_name = args.output.split('.')[0]
output_data[output_data['needs_fixing'] == True].to_csv(f'{root_output_name}_errors.tsv', sep='\t', index=False)
output_data[output_data['needs_fixing'] == True][['allele_description', 'change_description_to']].to_csv(f'{root_output_name}_errors_summarised.tsv', sep='\t', index=False)