Merge pull request #28 from molgenis/feat/ar_gene_filtering

Feat: ar gene filtering
molgenis · Oct 13, 2022 · 3a6433e · 3a6433e
2 parents f1096da + 2573fa6
commit 3a6433e
Show file tree

Hide file tree

Showing 9 changed files with 159 additions and 59 deletions.
diff --git a/utility_scripts/process_vep_tsv.py b/utility_scripts/process_vep_tsv.py
@@ -2,6 +2,7 @@
 
 import os
 import argparse
+import typing
 import warnings
 
 import numpy as np
@@ -22,7 +23,26 @@ class DataError(Exception):
 ID_SEPARATOR = '!'
 
 
-class CommandLineDigest:
+# Defining it here so the initial size can be set, removing duplicated printing code
+class ProgressPrinter:
+    def __init__(self):
+        self.before_drop = 0
+
+    def set_initial_size(self, sample_size: int):
+        self.before_drop = sample_size
+
+    def new_shape(self, sample_size: int):
+        print(f'Dropped {self.before_drop - sample_size} variants.\n')
+        self.before_drop = sample_size
+
+    def print_final_shape(self):
+        print(f'Final number of samples: {self.before_drop}')
+
+
+progress_printer = ProgressPrinter()
+
+
+class CommandLineParser:
     def __init__(self):
         parser = self._create_argument_parser()
         self.arguments = parser.parse_args()
@@ -51,6 +71,14 @@ def _create_argument_parser(self):
             required=True,
             help='Output path + filename'
         )
+        required.add_argument(
+            '-g',
+            '--genes',
+            type=str,
+            required=True,
+            help='File containing all Autosomal Recessive genes, each gene on a newline. '
+                 'Available at: https://research.nhgri.nih.gov/CGD/download/txt/CGD.txt.gz'
+        )
         optional.add_argument(
             '-a',
             '--assembly',
@@ -82,97 +110,169 @@ def validate_output_cla(output_path):
             raise NotADirectoryError('Output file has to be placed in an directory that already '
                                      'exists!')
 
+    @staticmethod
+    def validate_cgd_path(path):
+        if not path.endswith(('.txt.gz', '.tsv.gz')):
+            raise IncorrectFileError('Input CGD file is not either a gzipped txt or tsv file!')
+
+    @staticmethod
+    def validate_cgd_content(cgd_data):
+        required_columns = ['#GENE', 'INHERITANCE']
+        for column in required_columns:
+            if column not in cgd_data.columns:
+                raise DataError(f'Missing required column in CGD data: {column}')
+
     @staticmethod
     def validate_input_dataset(input_data):
-        columns_must_be_present = ['SYMBOL', 'CHROM', 'ID']
+        columns_must_be_present = ['SYMBOL', 'CHROM', 'ID', 'gnomAD_HN']
         for column in columns_must_be_present:
             if column not in input_data.columns:
-                raise DataError(f'Missing required column: {column}')
+                raise DataError(f'Missing required column in supplied dataset: {column}')
 
 
-def main():
-    print('Parsing CLI')
-    cli = CommandLineDigest()
+def load_and_correct_cgd(path, present_genes: typing.Iterable):
+    validator = Validator()
+    cgd = pd.read_csv(path, sep='\t')
+    validator.validate_cgd_content(cgd)
+    # Correct TENM1 since it can absolutely not be AR
+    cgd.drop(index=cgd[cgd['#GENE'] == 'TENM1'].index, inplace=True)
+    return_genes = []
+    genes_in_cgd = cgd[cgd['INHERITANCE'].str.contains('AR')]['#GENE'].values
+    for gene in genes_in_cgd:
+        if gene in present_genes:
+            return_genes.append(gene)
+    return return_genes
+
+
+def process_cli(validator):
+    print('Obtaining CLA.')
+    cli = CommandLineParser()
     data_path = cli.get_argument('input')
     output = cli.get_argument('output')
     grch38 = cli.get_argument('assembly')
-    print('')
+    cgd = cli.get_argument('genes')
 
-    print('Validating input arguments.')
-    validator = Validator()
+    print('Validating CLA.')
     validator.validate_input_cla(data_path)
+    validator.validate_cgd_path(cgd)
     validator.validate_output_cla(output)
+    print('Input arguments passed.\n')
+    return data_path, output, grch38, cgd
 
-    print('Reading in dataset')
-    data = pd.read_csv(data_path, sep='\t', na_values='.')
-    print('Validating dataset')
+
+def print_stats_dataset(data: pd.DataFrame, validator):
+    print('Validating dataset.')
     validator.validate_input_dataset(data)
-    print('Read in dataset.')
-    print(f'Shape: {data.shape}')
-    print(f'Head:\n{data.head()}\n')
-
-    print('Dropping entries without gene.')
-    before_drop = data.shape[0]
+    print('Dataset passed.')
+    n_samples = data.shape[0]
+    print(f'Sample size: {n_samples}')
+    print(f'Feature size: {data.shape[1]}')
+    progress_printer.set_initial_size(n_samples)
+    print('Head:')
+    with pd.option_context('display.max_rows', None, 'display.max_columns', None,
+                           'display.precision', 3):
+        print(data.head())
+
+
+def drop_genes_empty(data: pd.DataFrame):
+    print('Dropping variants without a gene.')
     data.drop(index=data[data['SYMBOL'].isnull()].index, inplace=True)
-    after_drop = data.shape[0]
-    print(f'Dropped {before_drop-after_drop} variants.\n')
+    progress_printer.new_shape(data.shape[0])
 
-    if grch38:
-        print('Converting chromosome column')
-        data['CHROM'] = data['CHROM'].str.split('chr', expand=True)[1]
-        y = np.append(np.arange(1, 23).astype(str), ['X', 'Y', 'MT'])
-        before_drop = data.shape[0]
-        data.drop(data[~data["CHROM"].isin(y)].index, inplace=True)
-        after_drop = data.shape[0]
-        print(f'Dropped {before_drop-after_drop} rows due to unknown chromosome.')
-        print('Conversion done.\n')
-
-    print('Dropping full duplicates')
-    before_drop = data.shape[0]
+
+def process_grch38(data: pd.DataFrame):
+    print('Processing GRCh38.')
+    data['CHROM'] = data['CHROM'].str.split('chr', expand=True)[1]
+    y = np.append(np.arange(1, 23).astype(str), ['X', 'Y', 'MT'])
+    data.drop(data[~data["CHROM"].isin(y)].index, inplace=True)
+    progress_printer.new_shape(data.shape[0])
+
+
+def drop_duplicate_entries(data: pd.DataFrame):
+    print('Dropping duplicated variants.')
     data.drop_duplicates(inplace=True)
-    after_drop = data.shape[0]
-    print(f'Dropped {before_drop-after_drop} duplicates')
-    print('Dropping duplicates done.\n')
-
-    print('Dropping mismatching gene entries.')
-    before_drop = data.shape[0]
+    progress_printer.new_shape(data.shape[0])
+
+
+def drop_mismatching_genes(data: pd.DataFrame):
+    print('Dropping variants with mismatching genes.')
     data.drop(
         index=data[data['ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['SYMBOL']].index,
         inplace=True
     )
-    after_drop = data.shape[0]
-    print(f'Dropped {before_drop-after_drop} variants.\n')
-
-    print('Extracting sample weight and binarized_label')
+    progress_printer.new_shape(data.shape[0])
+
+
+def drop_heterozygous_variants_in_ar_genes(data: pd.DataFrame, cgd):
+    print('Dropping heterozygous variants in AR genes.')
+    ar_genes = load_and_correct_cgd(cgd, data['SYMBOL'].unique())
+    data.drop(
+        data[
+            (data['gnomAD_HN'].notnull()) &
+            (data['gnomAD_HN'] == 0) &
+            (data['SYMBOL'].isin(ar_genes))
+        ].index, inplace=True
+    )
+    progress_printer.new_shape(data.shape[0])
+
+
+def extract_label_and_weight(data: pd.DataFrame):
+    print('Extracting binarized_label and sample_weight')
     data['binarized_label'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float)
     data['sample_weight'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float)
-    print('')
-
-    print('Correcting possible errors within binarized_label or sample_weight')
-    before_drop = data.shape[0]
-    # Drop everything that doesn't have a binarized_label, also drop unused columns
+
+
+def drop_variants_incorrect_label_or_weight(data: pd.DataFrame):
+    print('Dropping variants with an incorrect label or weight')
     data.drop(index=data[data['binarized_label'].isnull()].index, columns=['ID'], inplace=True)
     data.drop(index=data[~data['binarized_label'].isin([0.0, 1.0])].index, inplace=True)
     data.drop(index=data[~data['sample_weight'].isin(SAMPLE_WEIGHTS)].index, inplace=True)
-    after_drop = data.shape[0]
-    print(f'Dropped {before_drop-after_drop} variants due to incorrect binarized_label or '
-          f'sample_weight.\n')
+    progress_printer.new_shape(data.shape[0])
 
-    print('Please check the sample weights:')
-    print(data[['binarized_label', 'sample_weight']].value_counts())
-    print('')
 
-    print(f'Final shape of data: {data.shape}')
-    print(f'Of which benign: {data[data["binarized_label"] == 0].shape[0]}')
-    print(f'Of which pathogenic: {data[data["binarized_label"] == 1].shape[0]}')
+def print_finalized_statistics(data: pd.DataFrame):
+    progress_printer.print_final_shape()
+    print('Value counts of dataset:')
+    print(data[['sample_weight', 'binarized_label']].value_counts())
+    print(f'Number of benign variants: {data[data["binarized_label"] == 0].shape[0]}')
+    print(f'Number of pathogenic variants: {data[data["binarized_label"] == 1].shape[0]}')
     n_other = data[~data["binarized_label"].isin([0, 1])].shape[0]
     if n_other > 0:
-        warnings.warn(f'Of which other: {n_other}')
-    print('')
+        warnings.warn(f'Number of variants that do not have a binarized label of 0 or 1: {n_other}')
+
 
-    print(f'Done! Exporting to {output}')
+def export_data(data: pd.DataFrame, output):
+    print(f'Exporting to: {output}')
     data.to_csv(output, index=False, compression='gzip', na_rep='.', sep='\t')
 
 
+def main():
+    validator = Validator()
+    data_path, output, grch38, cgd = process_cli(validator)
+
+    print('Reading in dataset')
+    data = pd.read_csv(data_path, sep='\t', na_values='.')
+    print_stats_dataset(data, validator)
+
+    drop_genes_empty(data)
+
+    if grch38:
+        process_grch38(data)
+
+    drop_duplicate_entries(data)
+
+    drop_mismatching_genes(data)
+
+    drop_heterozygous_variants_in_ar_genes(data, cgd)
+
+    extract_label_and_weight(data)
+
+    drop_variants_incorrect_label_or_weight(data)
+
+    print_finalized_statistics(data)
+
+    export_data(data, output)
+
+
 if __name__ == '__main__':
     main()
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/auc.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/auc.png
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/roc.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/roc.png
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/score_differences.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/score_differences.png
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/score_distributions.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build37/score_distributions.png
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/auc.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/auc.png
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/roc.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/roc.png
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/score_differences.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/score_differences.png
diff --git a/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/score_distributions.png b/validation/4.0.0/4.0.0-v2/4.0.0-v1_4.0.0-v2_build38/score_distributions.png