diff --git a/cleaner.sh b/cleaner.sh index e737b95..5ca3f11 100644 --- a/cleaner.sh +++ b/cleaner.sh @@ -2,8 +2,8 @@ rm -r work/ rm -r .nextflow/ -rm timeline-* -rm trace-* +rm timeline* +rm trace* rm .nextflow.log* find . -maxdepth 4 -type d -name .idea -exec rm -r {} + diff --git a/containers_build/boostdm/features/exon.py b/containers_build/boostdm/features/exon.py index 2b0beb6..cae2874 100644 --- a/containers_build/boostdm/features/exon.py +++ b/containers_build/boostdm/features/exon.py @@ -6,36 +6,17 @@ from boostdm.vepreader import Tabix -def nmd_rule(exon, total_exons): - - """ - mutation is in an exon - if first or last exon then return 1 - otherwise return 0 - """ - - if exon == 0: - nmd = 0 - elif (exon == 1) or (exon == total_exons): - nmd = 1 - else: - nmd = 0 - return nmd - - -def get_exon(chr_, pos, alt,gene, reader): +def get_nmd(chr_, pos, alt, gene, reader): for data in reader.get(chr_, pos, pos): alt_vep = (data["ALT"] == alt) mane_vep = (data["MANE_SELECT"] != '-') # impose mane transcript correct_gene = (data["SYMBOL"] == gene) # skip cases with antisense overlapping gene if alt_vep and mane_vep and correct_gene: - exons = data["EXON"] - if '/' in exons: - exon, total_exons = tuple(exons.split('/')) - else: - exon, total_exons = 0, 0 - return nmd_rule(exon, total_exons) + if data["NMD_SKIPPING"] == '-': + return 0 + elif data["NMD_SKIPPING"] == 'NMD_escaping_variant': + return 1 return 0 @@ -43,14 +24,12 @@ def add_feature(df): df = df.copy() with Tabix(TABIX_FILE) as reader: - get_from_reader = partial(get_exon, reader=reader) - df['nmd'] = df.apply(lambda row: get_from_reader(str(row['chr']), - int(row['pos']), - row['alt'], - row['gene']), axis=1) + get_from_reader = partial(get_nmd, reader=reader) + df['nmd'] = df.apply(lambda row: get_from_reader(row['chr'], row['pos'], row['alt'], row['gene']), axis=1) return df + def test(): """Test function""" df = pd.DataFrame({ diff --git a/containers_build/boostdm/vepreader.py b/containers_build/boostdm/vepreader.py index 8a53c91..7253821 100644 --- a/containers_build/boostdm/vepreader.py +++ b/containers_build/boostdm/vepreader.py @@ -8,7 +8,7 @@ HEADER = [ 'CHR', 'POS', 'REF', 'ALT', 'GENE','ENST','TYPE','CNSQ','cDNA_POS', 'CDS_POS', 'PROT_POS','AA','CODONS','EXISTING_VARIATION','IMPACT','DISTANCE','STRAND','FLAGS','SYMBOL', - 'SYMBOL_SOURCE','HGNC_ID','CANONICAL','MANE_SELECT','MANE_PLUS_CLINICAL','ENSP','EXON','INTRON' + 'SYMBOL_SOURCE','HGNC_ID','CANONICAL','MANE_SELECT','MANE_PLUS_CLINICAL','ENSP','EXON','INTRON','NMD_SKIPPING' ] diff --git a/nextflow.config b/nextflow.config index 4b01936..cc9cc71 100644 --- a/nextflow.config +++ b/nextflow.config @@ -7,10 +7,13 @@ params { env { GENOME_BUILD = "hg38" INTOGEN_DATASETS = "/workspace/datasets/intogen/runs/v2024/20240409_ALL/" - BOOSTDM_DATASETS = "/workspace/projects/intogen_plus/intogen-plus-v2024/datasets/boostdm/" + + BOOSTDM_DATASETS = "/workspace/projects/intogen_plus/containers/datasets_patch_24_11/boostdm/" + // BOOSTDM_DATASETS = "/workspace/projects/intogen_plus/intogen-plus-v2024/datasets/boostdm/" + VEP_SATURATION = env.INTOGEN_DATASETS + "/steps/boostDM/saturation/" - PIPELINE = "/workspace/datasets/boostdm_runs/boostdm-pipeline-2024/" - OUTPUT = "/workspace/datasets/boostdm_runs/boostdm-cancer-output-2024-noIARC/" + PIPELINE = "/workspace/datasets/boostdm_runs/boostdm-pipeline-patch-2024/" + OUTPUT = "/workspace/datasets/boostdm_runs/boostdm-patch-2024-output/" MAVE_DATA = "/workspace/projects/boostdm_analyses/mave_data/" } diff --git a/scan_errors.sh b/scan_errors.sh index 2c5a94a..63ceb71 100644 --- a/scan_errors.sh +++ b/scan_errors.sh @@ -1,3 +1,7 @@ # uses conda environment boostdm-new-pipeline +# usage example: + +# bash scan_errors.sh trace.txt + cat $1 | grep FAILED | cut -f3 | python _scan_errors.py