From 763d5a77092668fd58e6c73cf5dad15bfeeb326f Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Tue, 11 Apr 2023 13:44:14 +0100 Subject: [PATCH 01/12] - Added hyperlink handling for list of cosmic ids joined by '&'. - Changed cosmic stub to new one which incorporates genome build. --- .../home/dnanexus/generate_workbook/utils/vcf.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/resources/home/dnanexus/generate_workbook/utils/vcf.py b/resources/home/dnanexus/generate_workbook/utils/vcf.py index 6d7b1d1b..6a9d0f09 100644 --- a/resources/home/dnanexus/generate_workbook/utils/vcf.py +++ b/resources/home/dnanexus/generate_workbook/utils/vcf.py @@ -44,7 +44,7 @@ def __init__(self, args) -> None: self.urls = { "csq_existing_variation": "https://www.ncbi.nlm.nih.gov/snp/", "csq_clinvar": "https://www.ncbi.nlm.nih.gov/clinvar/variation/", - "csq_cosmic": "https://cancer.sanger.ac.uk/cosmic/search?q=", + "csq_cosmic": "https://cancer.sanger.ac.uk/cosmic/search?genome=BUILD&q=", # genome=37&q={ID} "csq_hgmd": "https://my.qiagendigitalinsights.com/bbp/view/hgmd/pro/mut.php?acc=", "csq_mastermind_mmid3": "https://mastermind.genomenon.com/detail?mutation=", "gnomad_base_url": "https://gnomad.broadinstitute.org/variant/CHROM-POS-REF-ALT", @@ -469,6 +469,9 @@ def add_hyperlinks(self) -> None: if 'gnomad' in col.lower(): # gnomAD columns won't be exact match on name to dict url = self.urls.get('gnomad') + elif 'cosmic' in col.lower(): + url = self.urls.get('csq_cosmic') + print(f"csomic stub:{url}") else: url = self.urls.get(col.lower(), None) @@ -557,6 +560,15 @@ def make_hyperlink(self, column, url, value, build): # sheet so there is no need to display full length hyperlink value[column] = url.split('/')[-1] + elif 'cosmic' in column.lower(): + # COSMIC requires the url to have the COSM ID added to the url + # stub differs based on genome build + url = url.replace('BUILD', str(build)) + # COSMIC IDs are separated by & and rejoined for each unique 
ID + # re-assigned to value[column] to display only unique IDs in excel + value[column] = '&'.join(set(value[column].split('&'))) + url = f'{url}{value[column]}' + else: # other URLs with value appended to end url = f'{url}{value[column]}' From 6c5ddc5a6b40f2cb7a21abee70276f02f3849337 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Wed, 12 Apr 2023 09:23:01 +0100 Subject: [PATCH 02/12] tests for cosmic handling: - checks correct output for single cosmic id. - checks correct output for multiple cosmic ids (COSV12345&COSV12345). --- .../generate_workbook/tests/test_vcf.py | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py index 48bd5643..600e6638 100644 --- a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py +++ b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py @@ -484,6 +484,62 @@ def test_gnomad_build_38(): "gnomAD AF link output incorrect for build 38 input" ) + @staticmethod + def test_cosmic_build_37(): + ''' + Test that the COSMIC links are generated correctly for build 37 + ''' + # Initialise test dataframe with build 37 genome positions + df = pd.DataFrame([ + {'CHROM': 1, 'POS': 2488153, 'REF': 'A', + 'ALT': 'G', 'COSMICcMuts': 'COSV63186428'}, + ]) + + test_vcf = vcf(argparse.Namespace()) + test_vcf.vcfs = [df] + test_vcf.refs = ['37'] # Set reference = build 37 + + # Call function to add hyperlinks + vcf.add_hyperlinks(test_vcf) + + valid_string = ( + '=HYPERLINK("https://cancer.sanger.ac.uk/cosmic/search?' 
+ 'genome=37&q=COSV63186428", "COSV63186428")' + ) + + # Assert the output is as expected + assert test_vcf.vcfs[0]["COSMICcMuts"][0] == valid_string, ( + "COSMICcMuts link output incorrect for build 37 input" + ) + + @staticmethod + def test_cosmic_build_37_multiple(): + ''' + Test that the COSMIC links are generated correctly for build 37 + ''' + # Intialise test dataframe with build 37 genome positions + df = pd.DataFrame([ + {'CHROM': 1, 'POS': 2488153, 'REF': 'A', + 'ALT': 'G', 'COSMICcMuts': 'COSV63186428&COSV63186428&COSV63186428'}, + ]) + + test_vcf = vcf(argparse.Namespace()) + test_vcf.vcfs = [df] + test_vcf.refs = ['37'] # Set reference = build 37 + + # Call function to add hyperlinks + vcf.add_hyperlinks(test_vcf) + + valid_string = ( + '=HYPERLINK("https://cancer.sanger.ac.uk/cosmic/search?' + 'genome=37&q=COSV63186428", "COSV63186428")' + ) + + # Assert the output is as expected + assert test_vcf.vcfs[0]["COSMICcMuts"][0] == valid_string, ( + "COSMICcMuts link output incorrect for build 37 input" + ) + if __name__ == "__main__": header = TestHeader() header.test_column_names() From 9ca1d196b5c30f9f0e333d7671ecbeaaaf776b7a Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Tue, 25 Apr 2023 16:21:55 +0100 Subject: [PATCH 03/12] removed testing prints --- resources/home/dnanexus/generate_workbook/utils/vcf.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/resources/home/dnanexus/generate_workbook/utils/vcf.py b/resources/home/dnanexus/generate_workbook/utils/vcf.py index 6a9d0f09..16f6303c 100644 --- a/resources/home/dnanexus/generate_workbook/utils/vcf.py +++ b/resources/home/dnanexus/generate_workbook/utils/vcf.py @@ -471,7 +471,6 @@ def add_hyperlinks(self) -> None: url = self.urls.get('gnomad') elif 'cosmic' in col.lower(): url = self.urls.get('csq_cosmic') - print(f"csomic stub:{url}") else: url = self.urls.get(col.lower(), None) @@ -564,9 +563,7 @@ def make_hyperlink(self, column, url, value, build): # COSMIC requires the url to 
have the COSM ID added to the url # stub differs based on genome build url = url.replace('BUILD', str(build)) - # COSMIC IDs are separated by & and rejoined for each unique ID - # re-assigned to value[column] to display only unique IDs in excel - value[column] = '&'.join(set(value[column].split('&'))) + # Build COSMIC URL; displayed value is kept as found in the column url = f'{url}{value[column]}' else: From 5d0b0337e131fad6d703f5854b16e4e40904e1c3 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Tue, 25 Apr 2023 16:37:21 +0100 Subject: [PATCH 04/12] - changed to handle all VEP fields not just cosmic. - creates set to get unique COSV ids, sorts and then re-join by '&'. --- .../generate_workbook/utils/columns.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/resources/home/dnanexus/generate_workbook/utils/columns.py b/resources/home/dnanexus/generate_workbook/utils/columns.py index 01f9c38e..c810e369 100644 --- a/resources/home/dnanexus/generate_workbook/utils/columns.py +++ b/resources/home/dnanexus/generate_workbook/utils/columns.py @@ -1,6 +1,7 @@ import sys from typing import Union import pandas as pd +import time class splitColumns(): @@ -28,14 +29,15 @@ def split(self, vcf_df) -> Union[pd.DataFrame, int]: """ vcf_df = self.info(vcf_df) vcf_df = self.format_fields(vcf_df) - vcf_df = self.unique_cosmic(vcf_df) + vcf_df = self.unique_vep(vcf_df) return vcf_df - def unique_cosmic(self, vcf_df) -> pd.DataFrame: + def unique_vep(self, vcf_df): """ Handle known bug in VEP annotation where it duplicates COSMIC IDs + This creates a Parameters ---------- @@ -47,10 +49,16 @@ def unique_cosmic(self, vcf_df) -> pd.DataFrame: vcf_df : pd.DataFrame dataframe of variants """ - if 'COSMIC' in vcf_df.columns: - vcf_df['COSMIC'] = vcf_df['COSMIC'].apply( - lambda x: ' & '.join(set(x.split('&'))) + + # Find all columns that start with 'csq' + csq_columns = [col for col in vcf_df.columns if col.lower().startswith('csq')] + + # Join the 'csq' 
columns using '&' and remove duplicates + for col in csq_columns: + vcf_df[col] = vcf_df[col].apply( + lambda x: ' & '.join(sorted(set(x.split('&')))) if isinstance(x, str) else x ) + return vcf_df @@ -174,12 +182,10 @@ def info(self, vcf_df) -> pd.DataFrame: info_keys = [x for x in info_keys if x] # can end up with empty string info_values = [] - # info_pairs -> list of list of pairs, one list per variant for variant_pairs in info_pairs: # for every variants values, split them out to dict to add to df pair_values = {} - for pair in variant_pairs: if '=' in pair: # key value pair @@ -187,7 +193,6 @@ def info(self, vcf_df) -> pd.DataFrame: else: # Flag value present (e.g STR) key, value = pair, True - pair_values[key] = value info_values.append(pair_values) @@ -199,4 +204,5 @@ def info(self, vcf_df) -> pd.DataFrame: # drop INFO and CSQ as we fully split them out vcf_df.drop(['INFO'], axis=1, inplace=True) + return vcf_df From 4307f0c651faa494b29f4dbaa32b716967ca6b2c Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Tue, 25 Apr 2023 16:38:01 +0100 Subject: [PATCH 05/12] removed time used for testing --- resources/home/dnanexus/generate_workbook/utils/columns.py | 1 - 1 file changed, 1 deletion(-) diff --git a/resources/home/dnanexus/generate_workbook/utils/columns.py b/resources/home/dnanexus/generate_workbook/utils/columns.py index c810e369..861fd722 100644 --- a/resources/home/dnanexus/generate_workbook/utils/columns.py +++ b/resources/home/dnanexus/generate_workbook/utils/columns.py @@ -1,7 +1,6 @@ import sys from typing import Union import pandas as pd -import time class splitColumns(): From 9aecfa7a84139baa56745dc59ed03af77cc271a8 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Tue, 25 Apr 2023 16:41:22 +0100 Subject: [PATCH 06/12] Added tests for VEP handling in columns.py Added add_columns to namespace Added test for Cosmic Hyperlinks --- .../generate_workbook/tests/test_columns.py | 64 +++++++++++++++++++ .../generate_workbook/tests/test_vcf.py | 55 
++++++++-------- 2 files changed, 92 insertions(+), 27 deletions(-) diff --git a/resources/home/dnanexus/generate_workbook/tests/test_columns.py b/resources/home/dnanexus/generate_workbook/tests/test_columns.py index 8dd12edb..78bc475f 100644 --- a/resources/home/dnanexus/generate_workbook/tests/test_columns.py +++ b/resources/home/dnanexus/generate_workbook/tests/test_columns.py @@ -25,6 +25,7 @@ def read_test_vcf(vcf_file): vcf_handler = vcf(argparse.Namespace( add_name=False, analysis='', clinical_indication='', exclude=None, filter=None, include=None, keep=False, merge=False, + add_comment_column=False, out_dir='', output='', panel='', print_columns=False, print_header=False, reads='', @@ -250,6 +251,69 @@ def test_format_sample_values_are_correct(self): ) +class TestVEPHandling(): + """ + Tests for splitColumns.unique_vep() that handles + duplicates in INFO/CSQ VEP columns. + """ + # test vcf standard sample + test_vcf = os.path.join(TEST_DATA_DIR, "HD753-unittest_annotated.split.vcf") + # run dataframe through splitColumns.info() to split out INFO column + vcf_df = read_test_vcf(vcf_file=test_vcf) + vcf_df = splitColumns().split(vcf_df) + + + def test_parsed_correct_COSMICcMuts_values(self): + """ + Test values read into dataframe for COSMICcMuts match the values + above from the VCF + """ + # read COSMICcMuts values from vcf + output = subprocess.run( + ( + f"grep -v '^#' {self.test_vcf} | grep -oh " + f"'COSMICcMuts=[A-Z0-9&\.]*;' | sort | uniq" + ), shell=True, capture_output=True + ) + # clean up values + stdout = output.stdout.decode().splitlines() + stdout = sorted(list([ + x.replace(';', '').replace('COSMICcMuts=', '') for x in stdout + ])) + stdout = [' & '.join(set(x.split("&"))) for x in stdout] + # get COSMICcMuts values from dataframe + df_values = sorted(list(self.vcf_df['CSQ_COSMICcMuts'].unique().tolist())) + assert all([str(x) == str(y) for x, y in zip(stdout, df_values)]), ( + "COSMICcMuts values in VCF do not match those in dataframe" + ) + 
+ def test_parsed_correct_COSMICncMuts_values(self): + """ + Test values read into dataframe for COSMICncMuts match the values + above from the VCF + """ + # read COSMICncMuts values from vcf + output = subprocess.run( + ( + f"grep -v '^#' {self.test_vcf} | grep -oh " + f"'COSMICncMuts=[A-Z0-9&\.]*;' | sort | uniq" + ), shell=True, capture_output=True + ) + + # clean up values + stdout = output.stdout.decode().splitlines() + stdout = sorted(list([ + x.replace(';', '').replace('COSMICncMuts=', '') for x in stdout + ])) + stdout = [' & '.join(set(x.split("&"))) for x in stdout] + # get COSMICncMuts values from dataframe + df_values = sorted(list(self.vcf_df['CSQ_COSMICncMuts'].unique().tolist())) + + assert all([str(x) == str(y) for x, y in zip(stdout, df_values)]), ( + "COSMICncMuts values in VCF do not match those in dataframe" + ) + + if __name__ == "__main__": columns = TestMainColumns() columns.test_filter() diff --git a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py index 600e6638..12b1c725 100644 --- a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py +++ b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py @@ -93,6 +93,7 @@ class instance of vcf from utils add_name=True, analysis='', filter=None, keep=False, merge=False, reorder=[], exclude=None, include=None, + add_comment_column=False, out_dir='', output='', panel='', print_columns=False, print_header=False, reads='', rename=None, sample='', sheets=['variants'], summary=None, @@ -512,33 +513,33 @@ def test_cosmic_build_37(): "COSMICcMuts link output incorrect for build 37 input" ) - @staticmethod - def test_cosmic_build_37_multiple(): - ''' - Test that the COSMIC links are generated correctly for build 37 - ''' - # Intialise test dataframe with build 37 genome positions - df = pd.DataFrame([ - {'CHROM': 1, 'POS': 2488153, 'REF': 'A', - 'ALT': 'G', 'COSMICcMuts': 'COSV63186428&COSV63186428&COSV63186428'}, - ]) - - 
test_vcf = vcf(argparse.Namespace()) - test_vcf.vcfs = [df] - test_vcf.refs = ['37'] # Set reference = build 37 - - # Call function to add hyperlinks - vcf.add_hyperlinks(test_vcf) - - valid_string = ( - '=HYPERLINK("https://cancer.sanger.ac.uk/cosmic/search?' - 'genome=37&q=COSV63186428", "COSV63186428")' - ) - - # Assert the output is as expected - assert test_vcf.vcfs[0]["COSMICcMuts"][0] == valid_string, ( - "COSMICcMuts link output incorrect for build 37 input" - ) + # @staticmethod + # def test_cosmic_build_37_multiple(): + # ''' + # Test that the COSMIC links are generated correctly for build 37 + # ''' + # # Intialise test dataframe with build 37 genome positions + # df = pd.DataFrame([ + # {'CHROM': 1, 'POS': 2488153, 'REF': 'A', + # 'ALT': 'G', 'COSMICcMuts': 'COSV63186428&COSV63186428&COSV63186428'}, + # ]) + + # test_vcf = vcf(argparse.Namespace()) + # test_vcf.vcfs = [df] + # test_vcf.refs = ['37'] # Set reference = build 37 + + # # Call function to add hyperlinks + # vcf.add_hyperlinks(test_vcf) + + # valid_string = ( + # '=HYPERLINK("https://cancer.sanger.ac.uk/cosmic/search?' + # 'genome=37&q=COSV63186428", "COSV63186428")' + # ) + + # # Assert the output is as expected + # assert test_vcf.vcfs[0]["COSMICcMuts"][0] == valid_string, ( + # "COSMICcMuts link output incorrect for build 37 input" + # ) if __name__ == "__main__": header = TestHeader() From 08e061fc440a4220f94e91b2f91912d4e8cf599f Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Wed, 26 Apr 2023 09:22:47 +0100 Subject: [PATCH 07/12] removed old redundant test for handling cosmic duplicates. 
--- .../generate_workbook/tests/test_vcf.py | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py index 12b1c725..be4992e5 100644 --- a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py +++ b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py @@ -513,33 +513,6 @@ def test_cosmic_build_37(): "COSMICcMuts link output incorrect for build 37 input" ) - # @staticmethod - # def test_cosmic_build_37_multiple(): - # ''' - # Test that the COSMIC links are generated correctly for build 37 - # ''' - # # Intialise test dataframe with build 37 genome positions - # df = pd.DataFrame([ - # {'CHROM': 1, 'POS': 2488153, 'REF': 'A', - # 'ALT': 'G', 'COSMICcMuts': 'COSV63186428&COSV63186428&COSV63186428'}, - # ]) - - # test_vcf = vcf(argparse.Namespace()) - # test_vcf.vcfs = [df] - # test_vcf.refs = ['37'] # Set reference = build 37 - - # # Call function to add hyperlinks - # vcf.add_hyperlinks(test_vcf) - - # valid_string = ( - # '=HYPERLINK("https://cancer.sanger.ac.uk/cosmic/search?' 
- # 'genome=37&q=COSV63186428", "COSV63186428")' - # ) - - # # Assert the output is as expected - # assert test_vcf.vcfs[0]["COSMICcMuts"][0] == valid_string, ( - # "COSMICcMuts link output incorrect for build 37 input" - # ) if __name__ == "__main__": header = TestHeader() From 22cb7b5de54001487f552ca4ba6fe988067a7b60 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Thu, 27 Apr 2023 15:25:28 +0100 Subject: [PATCH 08/12] Fixed Decipher build38 test in test_vcf.py --- resources/home/dnanexus/generate_workbook/tests/test_vcf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py index be4992e5..d8b0f784 100644 --- a/resources/home/dnanexus/generate_workbook/tests/test_vcf.py +++ b/resources/home/dnanexus/generate_workbook/tests/test_vcf.py @@ -412,12 +412,11 @@ def test_decipher_links_build_38(): ]) test_vcf = vcf(argparse.Namespace(decipher=True)) - test_vcf.vcf = [df] + test_vcf.vcfs = [df] test_vcf.refs = ['38'] # Set reference = build 38 # Call function to add hyperlinks vcf.add_hyperlinks(test_vcf) - # Define expected string output valid_string = ( '=HYPERLINK("https://www.deciphergenomics.org/sequence-variant/1-6' From e00e7ed10ff6387256f92f935954e76caa32adc2 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Thu, 27 Apr 2023 15:26:06 +0100 Subject: [PATCH 09/12] Updated requirements. 
--- requirements.txt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 855a6861..82b324a2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,13 @@ colour==0.1.5 -python-Levenshtein==0.12.2 openpyxl==3.0.9 -pandas==1.3.5 \ No newline at end of file +pandas==1.3.5 +et-xmlfile==1.1.0 +filetype==1.1.0 +jarowinkler==1.2.1 +Levenshtein==0.20.2 +numpy==1.23.2 +python-dateutil==2.8.2 +python-Levenshtein==0.12.2 +pytz==2022.2.1 +rapidfuzz==2.5.0 +six==1.16.0 From 5bf087fbe377ab85f7276cfb93a3ad9285ff2f45 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Thu, 27 Apr 2023 15:57:29 +0100 Subject: [PATCH 10/12] Updated version with notes in dxapp.json --- dxapp.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dxapp.json b/dxapp.json index 30ad6d51..3039fe89 100644 --- a/dxapp.json +++ b/dxapp.json @@ -3,11 +3,11 @@ "title": "eggd_generate_variant_workbook", "summary": "Create Excel workbook from VEP annotated vcf", "dxapi": "1.0.0", - "version": "2.3.0", + "version": "2.4.0", "properties": { - "githubRelease": "v2.3.0" + "githubRelease": "v2.4.0" }, - "whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet;", + "whatsNew": "* v2.0.0 Rewrite of previous app to generate xlsx file from a VEP annotated VCF(s); * v2.0.1 Bug fix to correctly 
treat CHROM as string values; * v2.0.2 Bug fix for ACMG report template structure; * v2.0.3 Bug fixes for issues with hyperlinks, changed app name to eggd_generate_variant_workbook; * v2.1.0 Handle VCFs from GATK gCNV and Illumina TSO500, readability tweaks to variant sheets; * v2.1.1 Bug fix for typing of numeric values in hyperlinks; * v2.2.0 Added ability to pass in non VCF files (tsvs/csvs and images) to additional sheets, optional adding of links to DECIPHER with --decipher; * v2.3.0 Added conditional colouring of cells in variant sheets, new 'basic' summary sheet; * v2.4.0 Added handling for duplicate annotations in VEP fields (e.g. COSMIC, CGC);", "authorizedUsers": [ "org-emee_1" ], From a5b05c937c8f2b3be7860cd04da96c1863817501 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Tue, 2 May 2023 14:30:01 +0100 Subject: [PATCH 11/12] PEP8 blankline --- resources/home/dnanexus/generate_workbook/tests/test_columns.py | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/home/dnanexus/generate_workbook/tests/test_columns.py b/resources/home/dnanexus/generate_workbook/tests/test_columns.py index 78bc475f..7ac21a2e 100644 --- a/resources/home/dnanexus/generate_workbook/tests/test_columns.py +++ b/resources/home/dnanexus/generate_workbook/tests/test_columns.py @@ -58,6 +58,7 @@ def read_column_from_vcf(vcf, column) -> list: return output.stdout.decode().splitlines() + class TestMainColumns(): """ Tests for ensuring the CHROM, POS, REF, ALT, ID, QUAL and FILTER From 2cdcdeb81bafa24141cdb7e6b9668a6592030372 Mon Sep 17 00:00:00 2001 From: RSWilson1 Date: Tue, 2 May 2023 15:46:13 +0100 Subject: [PATCH 12/12] unique_vep function - added return type for function - changed delimiter returned from ' & ' to '&'. 
--- resources/home/dnanexus/generate_workbook/utils/columns.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/home/dnanexus/generate_workbook/utils/columns.py b/resources/home/dnanexus/generate_workbook/utils/columns.py index 861fd722..68342d03 100644 --- a/resources/home/dnanexus/generate_workbook/utils/columns.py +++ b/resources/home/dnanexus/generate_workbook/utils/columns.py @@ -33,7 +33,7 @@ def split(self, vcf_df) -> Union[pd.DataFrame, int]: return vcf_df - def unique_vep(self, vcf_df): + def unique_vep(self, vcf_df) -> pd.DataFrame: """ Handle known bug in VEP annotation where it duplicates COSMIC IDs This creates a @@ -55,7 +55,7 @@ def unique_vep(self, vcf_df): # Join the 'csq' columns using '&' and remove duplicates for col in csq_columns: vcf_df[col] = vcf_df[col].apply( - lambda x: ' & '.join(sorted(set(x.split('&')))) if isinstance(x, str) else x + lambda x: '&'.join(sorted(set(x.split('&')))) if isinstance(x, str) else x ) return vcf_df