diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a89ed1b6a..37e1eb774 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -147,7 +147,7 @@ jobs: with: directory: .test snakefile: workflow/Snakefile - args: "--configfile .test/config-simple/config.yaml --report report.zip" + args: "--configfile .test/config-simple/config.yaml --cores 1 --report report.zip" show-disk-usage-on-error: true - name: Test workflow (local FASTQs, target regions) diff --git a/.test/config-chm-eval/samples.tsv b/.test/config-chm-eval/samples.tsv index 639b083d6..899f3ed7b 100644 --- a/.test/config-chm-eval/samples.tsv +++ b/.test/config-chm-eval/samples.tsv @@ -1,2 +1,2 @@ -sample_name group alias platform -chm chm ILLUMINA +sample_name group alias platform datatype calling +chm chm ILLUMINA dna variants diff --git a/.test/config-giab/samples.tsv b/.test/config-giab/samples.tsv index c063795be..bc9421200 100644 --- a/.test/config-giab/samples.tsv +++ b/.test/config-giab/samples.tsv @@ -1,2 +1,2 @@ -sample_name alias group platform purity -NA12878 NA12878 NA12878 ILLUMINA \ No newline at end of file +sample_name alias group platform purity datatype calling +NA12878 NA12878 NA12878 ILLUMINA dna variants \ No newline at end of file diff --git a/.test/config-no-candidate-filtering/samples.tsv b/.test/config-no-candidate-filtering/samples.tsv index 9d39093cf..9d0291513 100644 --- a/.test/config-no-candidate-filtering/samples.tsv +++ b/.test/config-no-candidate-filtering/samples.tsv @@ -1,3 +1,3 @@ -sample_name group alias platform -a a ILLUMINA -b b ILLUMINA +sample_name group alias platform datatype calling +a a ILLUMINA dna variants +b b ILLUMINA dna variants diff --git a/.test/config-simple/samples.tsv b/.test/config-simple/samples.tsv index e89ac165a..bce79d10b 100644 --- a/.test/config-simple/samples.tsv +++ b/.test/config-simple/samples.tsv @@ -1,5 +1,5 @@ -sample_name group alias platform -a one x ILLUMINA -b one y ILLUMINA -b two x ILLUMINA -a two y ILLUMINA +sample_name group alias platform datatype calling +a one x ILLUMINA dna variants +b one y ILLUMINA dna variants +b two x ILLUMINA dna variants +a two y ILLUMINA dna variants \ No newline at end of file diff --git a/.test/config-sra/samples.tsv b/.test/config-sra/samples.tsv index 924cbca05..a43bb68ff 100644 --- a/.test/config-sra/samples.tsv +++ b/.test/config-sra/samples.tsv @@ -1,5 +1,5 @@ -sample_name group alias platform -PD12A medium_L base ILLUMINA -PD13B medium_L changed ILLUMINA -PD09A soil changed ILLUMINA -PD12A soil base ILLUMINA +sample_name group alias platform datatype calling +PD12A medium_L base ILLUMINA dna variants +PD13B medium_L changed ILLUMINA dna variants +PD09A soil changed ILLUMINA dna variants +PD12A soil base ILLUMINA dna variants \ No newline at end of file diff --git a/.test/config-target-regions/samples.tsv b/.test/config-target-regions/samples.tsv index 9d39093cf..9d0291513 100644 --- a/.test/config-target-regions/samples.tsv +++ b/.test/config-target-regions/samples.tsv @@ -1,3 +1,3 @@ -sample_name group alias platform -a a ILLUMINA -b b ILLUMINA +sample_name group alias platform datatype calling +a a ILLUMINA dna variants +b b ILLUMINA dna variants diff --git a/.test/config_primers/samples.tsv b/.test/config_primers/samples.tsv index 9d39093cf..9d0291513 100644 --- a/.test/config_primers/samples.tsv +++ b/.test/config_primers/samples.tsv @@ -1,3 +1,3 @@ -sample_name group alias platform -a a ILLUMINA -b b ILLUMINA +sample_name group alias platform datatype calling +a a 
ILLUMINA dna variants
+b b ILLUMINA dna variants
diff --git a/config/README.md b/config/README.md
index 363b65a5f..a5fa7bf96 100644
--- a/config/README.md
+++ b/config/README.md
@@ -4,11 +4,13 @@ To configure this workflow, modify ``config/config.yaml`` according to your need
 
 # Sample sheet
 
-Add samples to `config/samples.tsv`. For each sample, the columns `sample_name`, `alias`, `platform`, and `group` have to be defined.
+Add samples to `config/samples.tsv`. For each sample, the columns `sample_name`, `alias`, `platform`, `datatype`, `calling`, and `group` have to be defined.
 
 * Samples within the same `group` can be referenced in a joint [Calling scenario](#calling-scenario) via their `alias`es.
 * `alias`es represent the name of the sample within its group. They are meant to be some abstract description of the sample type to be used in the [Calling scenario](#calling-scenario), and should thus be used consistently across groups. A classic example would be a combination of the `tumor` and `normal` aliases.
 * The `platform` column needs to contain the used sequencing platform (one of 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT', 'ONT', 'PACBIO').
 * The same `sample_name` entry can be used multiple times within a `samples.tsv` sample sheet, with only the value in the `group` column differing between repeated rows. This way, you can use the same sample for variant calling in different groups, for example if you use a panel of normal samples when you don't have matched normal samples for tumor variant calling.
+* The `datatype` column specifies what kind of data each sample corresponds to. This can either be `rna` or `dna`.
+* The `calling` column sets the kind of analysis to be performed. This can be either `fusions`, `variants`, or both (comma-separated). Fusion calling is still under development and should be considered experimental.
 
 If mutational burdens shall be estimated for a sample, the ``events`` from the calling scenario (see below) that shall be used have to be specified in an additional column ``mutational_burden_events``. Multiple events have to be separated by commas within that column.
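To illustrate the sheet layout described above, a minimal `config/samples.tsv` using the two new columns could look like this (sample and group names are invented; the `rna`/`fusions` row only makes sense once fusion calling is enabled):

sample_name	alias	group	platform	datatype	calling
sampleA_dna	tumor	patient1	ILLUMINA	dna	variants
sampleB_dna	normal	patient1	ILLUMINA	dna	variants
sampleA_rna	tumor_rna	patient1	ILLUMINA	rna	fusions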
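The `types` entry added to events in the next hunk controls which callsets an event is evaluated for; per `get_calling_events` in `common.smk` (further below), events without a `types` entry default to `["variants"]`. A hypothetical pair of events could thus look as follows (the `varlociraptor` and `filter` fields are the usual event settings and shown only for context; the filter name is assumed to be defined elsewhere in the config):

events:
  somatic:                       # hypothetical event id
    types: ["variants"]          # evaluated for variant calling only
    varlociraptor: ["somatic"]
    filter: moderate-impact      # assumes a filter with this name is defined
  present_fusions:
    types: ["fusions"]           # evaluated for arriba fusion candidates only
    varlociraptor: ["present"]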
diff --git a/config/config.yaml b/config/config.yaml
index bdb7555c4..becf3e18d 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -22,7 +22,7 @@ ref:
   # Ensembl species name
   species: homo_sapiens
   # Ensembl release
-  release: 110
+  release: 111
   # Genome build
   build: GRCh38
   # Optionally, instead of downloading the whole reference from Ensembl via the
@@ -121,8 +121,9 @@ calling:
       # Add any number of events here to filter for.
       # The id of each event can be chosen freely, but needs to contain
       # only alphanumerics and underscores
-      # ("somatic" below is just an example and can be modified as needed).
+      # ("some_id" below is just an example and can be modified as needed).
       some_id:
+        types: ["variants", "fusions"]
         # labels for the callset, displayed in the report.
         # Will fall back to id if no labels specified
         labels:
           some-label: label text
diff --git a/config/samples.tsv b/config/samples.tsv
index 7161b2267..dcd1d552c 100644
--- a/config/samples.tsv
+++ b/config/samples.tsv
@@ -1 +1 @@
-sample_name alias group platform purity panel umi_read umi_read_structure
+sample_name alias group platform purity panel umi_read umi_read_structure datatype calling
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 8c13d25e6..201d51e8a 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -35,6 +35,7 @@ include: "rules/table.smk"
 include: "rules/regions.smk"
 include: "rules/plugins.smk"
 include: "rules/datavzrd.smk"
+include: "rules/fusion_calling.smk"
 include: "rules/testcase.smk"
diff --git a/workflow/envs/arriba.yaml b/workflow/envs/arriba.yaml
new file mode 100644
index 000000000..861318b7d
--- /dev/null
+++ b/workflow/envs/arriba.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - arriba =2.4
\ No newline at end of file
diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml
index 35bf76437..0d0e8da51 100644
--- a/workflow/envs/bcftools.yaml
+++ b/workflow/envs/bcftools.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - bcftools =1.14
+  - bcftools =1.16
diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml
index 9ae27f207..dffa29f8c 100644
--- a/workflow/envs/pandas.yaml
+++ b/workflow/envs/pandas.yaml
@@ -2,5 +2,5 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - pandas =1.4
+  - pandas =2.1
   - python =3.10
\ No newline at end of file
diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst
index fd5703ae2..e39ac4276 100644
--- a/workflow/report/workflow.rst
+++ b/workflow/report/workflow.rst
@@ -1,7 +1,7 @@
 This workflow generates annotated variant calls that can be viewed in interactive reports, showing all evidence levels provided by Varlociraptor_.
 Adapters were removed with Cutadapt_. Reads were mapped with `BWA MEM`_, PCR and optical duplicates were removed with Picard_. Candidate variant discovery was performed with Freebayes_ and Delly_. Statistical assessment of variants was conducted with Varlociraptor_.
-Variant calling results, sorted by type, event, and impact can be found under `Variant calls`_.
+Fusion and variant calling results, sorted by type, event, and impact can be found under Fusion calls and Variant calls, respectively.
 The corresponding Varlociraptor_ scenarios, containing the detailed definition of events, can be found under `Variant calling scenarios`_.
 
 ..
_Varlociraptor: https://varlociraptor.github.io diff --git a/workflow/resources/datavzrd/fusion-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/fusion-calls-template.datavzrd.yaml new file mode 100644 index 000000000..757e5c6fe --- /dev/null +++ b/workflow/resources/datavzrd/fusion-calls-template.datavzrd.yaml @@ -0,0 +1,57 @@ +name: ?f"Fusion calls {wildcards.event}" + +default-view: ?f"{params.groups[0]}-fusions" + +__definitions__: + - import os + - | + def read_file(path): + return open(path, 'r').read() + +datasets: + ?for group, path in zip(params.groups, params.fusion_calls): + ?f"{group}-fusions": + path: ?path + separator: "\t" + +views: + ?for group in params.groups: + ?f"{group}-fusions": + desc: ?f"Fusion calls.\n{config['calling']['fdr-control']['events'][wildcards.event]['desc']}" + dataset: ?f"{group}-fusions" + render-table: + columns: + "regex('.+: allele frequency')": + plot: + ticks: + scale: "linear" + domain: [0.0, 1.0] + aux-domain-columns: + - "regex('.+: allele frequency')" + "regex('.+: read depth')": + plot: + ticks: + scale: "linear" + aux-domain-columns: + - "regex('.+: read depth')" + "regex('prob: .+')": + plot: + heatmap: + scale: linear + domain: [0.0, 1.0] + range: + - white + - "#1f77b4" + ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]: + '?f"{alias}: short observations"': + optional: true + custom-plot: + data: ?read_file(params.data_short_observations) + spec: ?read_file(params.spec_short_observations) + display-mode: detail + '?f"{alias}: observations"': + optional: true + custom-plot: + data: ?read_file(params.data_observations) + spec-path: ?params.spec_observations + display-mode: detail \ No newline at end of file diff --git a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml index b3bc91e6a..a666a4339 100644 --- a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml +++ b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml @@ -58,6 +58,19 @@ __definitions__: return `https://${{build}}${{url_suffix}}${{hgvsg}}` }} """ + - | + protein_id = f""" + function(row) {{ + let protein_id = row.hgvsp.split(':')[0] + return protein_id + }} + """ + - | + empty_content = f""" + function(row) {{ + return "" + }} + """ datasets: ?if input.variant_oncoprints: @@ -290,6 +303,28 @@ views: display-mode: detail protein alteration (short): display-mode: detail + canonical: + optional: true + display-mode: detail + plot: + heatmap: + scale: "ordinal" + domain: ["", "True"] + range: + - white + - black + custom-content: ?empty_content + mane_plus_clinical: + optional: true + display-mode: detail + plot: + heatmap: + scale: "ordinal" + domain: ["", "True"] + range: + - white + - black + custom-content: ?empty_content ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]: '?f"{alias}: short observations"': optional: true @@ -310,6 +345,10 @@ views: query_genomenexus: value: ?genomenexus_link display-mode: hidden + ensembl_protein_id: + value: ?protein_id + display-mode: detail + @@ -404,6 +443,28 @@ views: display-mode: detail alternative allele: display-mode: detail + canonical: + optional: true + display-mode: detail + plot: + heatmap: + scale: "ordinal" + domain: ["", "True"] + range: + - white + - black + custom-content: ?empty_content + mane_plus_clinical: + optional: true + display-mode: detail + plot: + heatmap: + scale: "ordinal" + domain: ["", "True"] + range: + - white + - black + custom-content: 
?empty_content ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]: '?f"{alias}: short observations"': optional: true diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 055e3a594..d94db1dd8 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -19,12 +19,12 @@ rule annotate_candidate_variants: "benchmarks/vep/{group}.{caller}.{scatteritem}.annotate_candidates.tsv" threads: get_vep_threads() wrapper: - "v2.5.0/bio/vep/annotate" + "v3.3.5/bio/vep/annotate" rule annotate_variants: input: - calls="results/calls/{group}.{scatteritem}.bcf", + calls="results/calls/{group}.{calling_type}.{scatteritem}.bcf", cache="resources/vep/cache", plugins="resources/vep/plugins", revel=lambda wc: get_plugin_aux("REVEL"), @@ -32,8 +32,8 @@ rule annotate_variants: fasta=genome, fai=genome_fai, output: - calls="results/calls/{group}.{scatteritem}.annotated.bcf", - stats="results/calls/{group}.{scatteritem}.stats.html", + calls="results/calls/{group}.{calling_type}.{scatteritem}.annotated.bcf", + stats="results/calls/{group}.{calling_type}.{scatteritem}.stats.html", params: # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs. @@ -42,10 +42,10 @@ rule annotate_variants: config["annotations"]["vep"]["final_calls"]["params"] ), log: - "logs/vep/{group}.{scatteritem}.annotate.log", + "logs/vep/{group}.{calling_type}.{scatteritem}.annotate.log", threads: get_vep_threads() wrapper: - "v2.5.0/bio/vep/annotate" + "v3.3.5/bio/vep/annotate" # TODO What about multiple ID Fields? @@ -89,9 +89,9 @@ rule gather_annotated_calls: calls=get_gather_annotated_calls_input(), idx=get_gather_annotated_calls_input(ext="bcf.csi"), output: - "results/final-calls/{group}.annotated.bcf", + "results/final-calls/{group}.{calling_type}.annotated.bcf", log: - "logs/gather-annotated-calls/{group}.log", + "logs/gather-annotated-calls/{group}.{calling_type}.log", params: extra="-a", wrapper: diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index b66702f80..7d1cba8c5 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -27,6 +27,7 @@ rule varlociraptor_alignment_properties: ref=genome, ref_idx=genome_fai, bam="results/recal/{sample}.bam", + bai="results/recal/{sample}.bai", output: "results/alignment-properties/{group}/{sample}.json", log: @@ -48,7 +49,9 @@ rule varlociraptor_preprocess: output: "results/observations/{group}/{sample}.{caller}.{scatteritem}.bcf", params: - extra=config["params"]["varlociraptor"]["preprocess"], + extra=lambda wc: get_varlociraptor_params( + wc, config["params"]["varlociraptor"]["preprocess"] + ), log: "logs/varlociraptor/preprocess/{group}/{sample}.{caller}.{scatteritem}.log", benchmark: @@ -66,12 +69,16 @@ rule varlociraptor_call: obs=get_group_observations, scenario="results/scenarios/{group}.yaml", output: - temp("results/calls/{group}.{caller}.{scatteritem}.bcf"), + temp("results/calls/{group}.{caller}.{scatteritem}.unsorted.bcf"), log: "logs/varlociraptor/call/{group}.{caller}.{scatteritem}.log", params: obs=get_varlociraptor_obs_args, - extra=config["params"]["varlociraptor"]["call"], + # Add -o as workaround to separate info field entries from subcommand + extra=lambda wc: get_varlociraptor_params( + wc, config["params"]["varlociraptor"]["call"] + ) + + " -o /dev/stdout", postprocess=">" if not config["calling"].get("infer_genotypes") else "| 
varlociraptor genotype >", @@ -86,18 +93,20 @@ rule varlociraptor_call: rule sort_calls: input: - "results/calls/{group}.{caller}.{scatteritem}.bcf", + "results/calls/{group}.{caller}.{scatteritem}.unsorted.bcf", output: temp("results/calls/{group}.{caller}.{scatteritem}.bcf"), + params: + # Set to True, in case you want uncompressed BCF output + uncompressed_bcf=False, + # Extra arguments + extras="", log: "logs/bcf-sort/{group}.{caller}.{scatteritem}.log", - conda: - "../envs/bcftools.yaml" resources: mem_mb=8000, - shell: - "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` " - "-Ob {input} > {output} 2> {log}" + wrapper: + "v2.6.0/bio/bcftools/sort" rule bcftools_concat: @@ -105,9 +114,9 @@ rule bcftools_concat: calls=get_scattered_calls(), indexes=get_scattered_calls(ext="bcf.csi"), output: - "results/calls/{group}.{scatteritem}.bcf", + "results/calls/{group}.{calling_type}.{scatteritem}.bcf", log: - "logs/concat-calls/{group}.{scatteritem}.log", + "logs/concat-calls/{group}.{calling_type}.{scatteritem}.log", params: extra="-a", # TODO Check this wrapper: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index f722e6d89..dbb110e86 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -21,11 +21,11 @@ if not "mutational_burden_events" in samples.columns: samples["mutational_burden_events"] = pd.NA # construct genome name -datatype = "dna" +datatype_genome = "dna" species = config["ref"]["species"] build = config["ref"]["build"] release = config["ref"]["release"] -genome_name = f"genome.{datatype}.{species}.{build}.{release}" +genome_name = f"genome.{datatype_genome}.{species}.{build}.{release}" genome_prefix = f"resources/{genome_name}" genome = f"{genome_prefix}.fasta" genome_fai = f"{genome}.fai" @@ -58,7 +58,18 @@ if "umi_read" not in samples.columns: validate(samples, schema="../schemas/samples.schema.yaml") +# Does this correctly return groups where fusion and variants are set? 
+def get_calling_groups(calling_type): + return samples.loc[ + samples["calling"].str.contains(calling_type), + "group", + ].unique() + + groups = samples["group"].unique() +calling_types = samples["calling"].str.split(",").explode().unique().tolist() +variants_groups = get_calling_groups("variants") +fusions_groups = get_calling_groups("fusions") if "groups" in config: group_annotation = ( @@ -99,6 +110,15 @@ primer_panels = ( ) +def get_calling_events(calling_type): + events = [ + event + for event, entries in config["calling"]["fdr-control"]["events"].items() + if calling_type in entries.get("types", ["variants"]) + ] + return events + + def get_heterogeneous_labels(): nunique = group_annotation.nunique() cols_to_drop = nunique[nunique == 1].index @@ -112,43 +132,56 @@ def get_final_output(wildcards): ) final_output.extend( - expand("results/datavzrd-report/{group}.coverage", group=groups) + expand( + "results/datavzrd-report/{group}.coverage", + group=groups, + ) ) - if config["report"]["activate"]: - final_output.extend( - expand( - "results/datavzrd-report/{batch}.{event}.fdr-controlled", - event=config["calling"]["fdr-control"]["events"], - batch=get_report_batches(), + for calling_type in calling_types: + if config["report"]["activate"]: + final_output.extend( + expand( + "results/datavzrd-report/{batch}.{event}.{calling_type}.fdr-controlled", + batch=get_report_batches(), + event=get_calling_events(calling_type), + calling_type=calling_type, + ) ) - ) - else: - final_output.extend( - expand( - "results/final-calls/{group}.{event}.fdr-controlled.bcf", - group=groups, - event=config["calling"]["fdr-control"]["events"], + else: + final_output.extend( + expand( + "results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.bcf", + group=variants_groups + if calling_type == "variants" + else fusions_groups, + event=get_calling_events(calling_type), + calling_type=calling_type, + ) ) - ) - if config["tables"]["activate"]: - final_output.extend( - expand( - "results/tables/{group}.{event}.fdr-controlled.tsv", - group=groups, - event=config["calling"]["fdr-control"]["events"], - ) - ) - if config["tables"].get("generate_excel", False): + if config["tables"]["activate"]: final_output.extend( expand( - "results/tables/{group}.{event}.fdr-controlled.xlsx", - group=groups, - event=config["calling"]["fdr-control"]["events"], + "results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv", + group=variants_groups + if calling_type == "variants" + else fusions_groups, + event=get_calling_events(calling_type), + calling_type=calling_type, ) ) - + if config["tables"].get("generate_excel", False): + final_output.extend( + expand( + "results/tables/{group}.{event}.{calling_type}.fdr-controlled.xlsx", + group=variants_groups + if calling_type == "variants" + else fusions_groups, + event=get_calling_events(calling_type), + calling_type=calling_type, + ) + ) final_output.extend(get_mutational_burden_targets()) return final_output @@ -157,9 +190,9 @@ def get_final_output(wildcards): def get_gather_calls_input(ext="bcf"): def inner(wildcards): if wildcards.by == "odds": - pattern = "results/calls/{{{{group}}}}.{{{{event}}}}.{{scatteritem}}.filtered_odds.{ext}" + pattern = "results/calls/{{{{group}}}}.{{{{event}}}}.{{{{calling_type}}}}.{{scatteritem}}.filtered_odds.{ext}" elif wildcards.by == "ann": - pattern = "results/calls/{{{{group}}}}.{{{{event}}}}.{{scatteritem}}.filtered_ann.{ext}" + pattern = 
"results/calls/{{{{group}}}}.{{{{event}}}}.{{{{calling_type}}}}.{{scatteritem}}.filtered_ann.{ext}" else: raise ValueError( "Unexpected wildcard value for 'by': {}".format(wildcards.by) @@ -171,25 +204,33 @@ def get_gather_calls_input(ext="bcf"): def get_control_fdr_input(wildcards): query = get_fdr_control_params(wildcards) - if not is_activated("benchmarking") and query["filter"]: + if ( + not is_activated("benchmarking") + and query["filter"] + and wildcards.calling_type == "variants" + ): by = "ann" if query["local"] else "odds" - return "results/calls/{{group}}.{{event}}.filtered_{by}.bcf".format(by=by) + return "results/calls/{{group}}.{{event}}.{{calling_type}}.filtered_{by}.bcf".format( + by=by + ) else: - return "results/final-calls/{group}.annotated.bcf" + return "results/final-calls/{group}.{calling_type}.annotated.bcf" def get_recalibrate_quality_input(wildcards, bai=False): ext = "bai" if bai else "bam" + datatype = get_sample_datatype(wildcards.sample) + if datatype == "rna": + return "results/split/{{sample}}.{ext}".format(ext=ext) + # Post-processing of DNA samples if is_activated("calc_consensus_reads"): - return "results/consensus/{}.{}".format(wildcards.sample, ext) + return "results/consensus/{{sample}}.{ext}".format(ext=ext) elif is_activated("primers/trimming"): - return "results/trimmed/{sample}.trimmed.{ext}".format( - sample=wildcards.sample, ext=ext - ) + return "results/trimmed/{{sample}}.trimmed.{ext}".format(ext=ext) elif is_activated("remove_duplicates"): - return "results/dedup/{}.{}".format(wildcards.sample, ext) + return "results/dedup/{{sample}}.{ext}".format(ext=ext) else: - return "results/mapped/{}.{}".format(wildcards.sample, ext) + return "results/mapped/bwa/{{sample}}.{ext}".format(ext=ext) def get_cutadapt_input(wildcards): @@ -303,6 +344,18 @@ def get_map_reads_input(wildcards): return "results/merged/{sample}_single.fastq.gz" +def get_star_reads_input(wildcards, r2=False): + match (bool(is_paired_end(wildcards.sample)), r2): + case (True, False): + return "results/merged/{sample}_R1.fastq.gz" + case (True, True): + return "results/merged/{sample}_R2.fastq.gz" + case (False, False): + return "results/merged/{sample}_single.fastq.gz" + case (False, True): + return [] + + def get_group_aliases(group): return samples.loc[samples["group"] == group]["alias"] @@ -324,20 +377,36 @@ def get_group_sample_aliases(wildcards, controls=True): ]["alias"] +def get_sample_datatype(sample): + return samples.loc[[sample], "datatype"].iloc[0] + + +def get_markduplicates_input(wildcards): + aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa" + if sample_has_umis(wildcards.sample): + return "results/mapped/{aligner}/{{sample}}.annotated.bam".format( + aligner=aligner + ) + else: + return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner) + + def get_consensus_input(wildcards): if is_activated("primers/trimming"): - return "results/trimmed/{}.trimmed.bam".format(wildcards.sample) + return "results/trimmed/{sample}.trimmed.bam" elif is_activated("remove_duplicates"): - return "results/dedup/{}.bam".format(wildcards.sample) + return "results/dedup/{sample}.bam" else: - return "results/mapped/{}.bam".format(wildcards.sample) + aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa" + return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner) def get_trimming_input(wildcards): if is_activated("remove_duplicates"): - return "results/dedup/{}.bam".format(wildcards.sample) + return 
"results/dedup/{sample}.bam" else: - return "results/mapped/{}.bam".format(wildcards.sample) + aligner = "star" if get_sample_datatype(wildcards.sample) == "rna" else "bwa" + return "results/mapped/{aligner}/{{sample}}.bam".format(aligner=aligner) def get_primer_bed(wc): @@ -443,42 +512,13 @@ def get_group_bams(wildcards, bai=False): ) -def get_group_bams_report(group): - return [ - (sample, "results/recal/{}.bam".format(sample)) - for sample in get_group_samples(group) - ] - - -def _get_batch_info(wildcards): - for group in get_report_batch(wildcards): - for sample, bam in get_group_bams_report(group): - yield sample, bam, group - - -def get_batch_bams(wildcards): - return (bam for _, bam, _ in _get_batch_info(wildcards)) - - -def get_report_bam_params(wildcards, input): - return [ - "{group}:{sample}={bam}".format(group=group, sample=sample, bam=bam) - for (sample, _, group), bam in zip(_get_batch_info(wildcards), input.bams) - ] - - -def get_batch_bcfs(wildcards): - for group in get_report_batch(wildcards): - yield "results/final-calls/{group}.{event}.fdr-controlled.bcf".format( - group=group, event=wildcards.event - ) - - -def get_report_bcf_params(wildcards, input): - return [ - "{group}={bcf}".format(group=group, bcf=bcf) - for group, bcf in zip(get_report_batch(wildcards), input.bcfs) - ] +def get_arriba_group_candidates(wildcards, csi=False): + ext = ".csi" if csi else "" + return expand( + "results/candidate-calls/{sample}.arriba.bcf{ext}", + sample=get_group_samples(wildcards.group), + ext=ext, + ) def get_processed_consensus_input(wildcards): @@ -521,6 +561,14 @@ def is_activated(xpath): return bool(c.get("activate", False)) +def get_star_read_group(wildcards): + """Denote sample name and platform in read group.""" + platform = extract_unique_sample_column_value(wildcards.sample, "platform") + return r"--outSAMattrRGline ID:{sample} SM:{sample} PL:{platform}".format( + sample=wildcards.sample, platform=platform + ) + + def get_read_group(wildcards): """Denote sample name and platform in read group.""" platform = extract_unique_sample_column_value(wildcards.sample, "platform") @@ -532,7 +580,7 @@ def get_read_group(wildcards): def get_mutational_burden_targets(): mutational_burden_targets = [] if is_activated("mutational_burden"): - for group in groups: + for group in variants_groups: mutational_burden_targets.extend( expand( "results/plots/mutational-burden/{group}.{alias}.{mode}.mutational-burden.svg", @@ -546,6 +594,7 @@ def get_mutational_burden_targets(): def get_scattered_calls(ext="bcf"): def inner(wildcards): + caller = "arriba" if wildcards.calling_type == "fusions" else variant_caller return expand( "results/calls/{{group}}.{caller}.{{scatteritem}}.{ext}", caller=caller, @@ -565,17 +614,24 @@ def get_selected_annotations(): def get_annotated_bcf(wildcards): - selection = get_selected_annotations() - return "results/calls/{group}.{scatteritem}{selection}.bcf".format( - group=wildcards.group, selection=selection, scatteritem=wildcards.scatteritem + selection = ( + get_selected_annotations() if wildcards.calling_type == "variants" else "" + ) + return "results/calls/{group}.{calling_type}.{scatteritem}{selection}.bcf".format( + group=wildcards.group, + calling_type=wildcards.calling_type, + selection=selection, + scatteritem=wildcards.scatteritem, ) def get_gather_annotated_calls_input(ext="bcf"): def inner(wildcards): - selection = get_selected_annotations() + selection = ( + get_selected_annotations() if wildcards.calling_type == "variants" else "" + ) return 
gather.calling( - "results/calls/{{{{group}}}}.{{scatteritem}}{selection}.{ext}".format( + "results/calls/{{{{group}}}}.{{{{calling_type}}}}.{{scatteritem}}{selection}.{ext}".format( ext=ext, selection=selection ) ) @@ -591,12 +647,12 @@ def get_candidate_calls(): return "results/candidate-calls/{group}.{caller}.{scatteritem}.bcf" -def get_report_batch(wildcards): - if wildcards.batch == "all": - _groups = groups +def _get_report_batch(calling_type, batch): + if batch == "all": + _groups = variants_groups if calling_type == "variants" else fusions_groups else: _groups = samples.loc[ - samples[config["report"]["stratify"]["by-column"]] == wildcards.batch, + samples[config["report"]["stratify"]["by-column"]] == batch, "group", ].unique() if not any(_groups): @@ -604,6 +660,13 @@ def get_report_batch(wildcards): return _groups +def get_report_batch(calling_type): + def inner(wildcards): + return _get_report_batch(calling_type, wildcards.batch) + + return inner + + def get_report_batches(): if is_activated("report/stratify"): yield "all" @@ -614,10 +677,15 @@ def get_report_batches(): def get_merge_calls_input(ext="bcf"): def inner(wildcards): + vartype = ( + ["SNV", "INS", "DEL", "MNV", "BND", "INV", "DUP", "REP"] + if wildcards.calling_type == "variants" + else ["BND"] + ) return expand( - "results/calls/{{group}}.{vartype}.{{event}}.fdr-controlled.{ext}", + "results/calls/{{group}}.{vartype}.{{event}}.{{calling_type}}.fdr-controlled.{ext}", ext=ext, - vartype=["SNV", "INS", "DEL", "MNV", "BND", "INV", "DUP", "REP"], + vartype=vartype, ) return inner @@ -682,19 +750,19 @@ def get_filter_targets(wildcards, input): def get_filter_expression(filter_name): - filter = config["calling"]["filter"][filter_name] - if isinstance(filter, str): - return filter + filter_entry = config["calling"]["filter"][filter_name] + if isinstance(filter_entry, str): + return filter_entry else: - return config["calling"]["filter"][filter_name]["expression"] + return filter_entry["expression"] def get_filter_aux_entries(filter_name): - filter = config["calling"]["filter"][filter_name] - if isinstance(filter, str): + filter_entry = config["calling"]["filter"][filter_name] + if isinstance(filter_entry, str): return {} else: - aux = config["calling"]["filter"][filter_name].get("aux-files", {}) + aux = filter_entry.get("aux-files", {}) return aux # [f"--aux {name} {path}" for name, path in aux.items()] @@ -723,8 +791,8 @@ def get_annotation_filter_aux(wildcards): def get_annotation_filter_aux_files(wildcards): return [ path - for filter in get_annotation_filter_names(wildcards) - for name, path in get_filter_aux_entries(filter).items() + for filter_name in get_annotation_filter_names(wildcards) + for name, path in get_filter_aux_entries(filter_name).items() ] @@ -761,21 +829,34 @@ def get_varlociraptor_obs_args(wildcards, input): ] +def get_varlociraptor_params(wildcards, params): + if wildcards.caller == "arriba": + params += " --propagate-info-fields GENE_NAME GENE_ID EXON" + return params + + wildcard_constraints: group="|".join(groups), sample="|".join(samples["sample_name"]), - caller="|".join(["freebayes", "delly"]), + caller="|".join(["freebayes", "delly", "arriba"]), filter="|".join(config["calling"]["filter"]), event="|".join(config["calling"]["fdr-control"]["events"].keys()), regions_type="|".join(["expanded", "covered"]), + calling_type="|".join(["fusions", "variants"]), -caller = list( +variant_caller = list( filter( None, [ - "freebayes" if is_activated("calling/freebayes") else None, - "delly" if 
is_activated("calling/delly") else None, + "freebayes" + if is_activated("calling/freebayes") + and samples["calling"].str.contains("variants").any() + else None, + "delly" + if is_activated("calling/delly") + and samples["calling"].str.contains("variants").any() + else None, ], ) ) @@ -859,34 +940,43 @@ def get_vembrane_config(wildcards, input): parts.append(parts_field) header.append(header_name) - annotation_fields = [ - "SYMBOL", - "Gene", - "Feature", - "IMPACT", - "HGVSp", - {"name": "protein position", "expr": "ANN['Protein_position'].raw"}, - {"name": "protein alteration (short)", "expr": "ANN['Amino_acids']"}, - "HGVSg", - "Consequence", - "CANONICAL", - "MANE_PLUS_CLINICAL", - {"name": "clinical significance", "expr": "ANN['CLIN_SIG']"}, - {"name": "gnomad genome af", "expr": "ANN['gnomADg_AF']"}, - ] - - annotation_fields.extend( - [ - field - for field in config_output.get("annotation_fields", []) - if field not in annotation_fields + if wildcards.calling_type == "fusions": + info_fields = [ + {"name": "mateid", "expr": "INFO['MATEID'][0]"}, + {"name": "feature_name", "expr": "INFO['GENE_NAME']"}, + {"name": "feature_id", "expr": "INFO['GENE_ID']"}, + "EXON", ] - ) + append_items(info_fields, "INFO['{}']".format, lambda x: x.lower()) + else: + annotation_fields = [ + "SYMBOL", + "Gene", + "Feature", + "IMPACT", + "HGVSp", + {"name": "protein position", "expr": "ANN['Protein_position'].raw"}, + {"name": "protein alteration (short)", "expr": "ANN['Amino_acids']"}, + "HGVSg", + "Consequence", + "CANONICAL", + "MANE_PLUS_CLINICAL", + {"name": "clinical significance", "expr": "ANN['CLIN_SIG']"}, + {"name": "gnomad genome af", "expr": "ANN['gnomADg_AF']"}, + ] + + annotation_fields.extend( + [ + field + for field in config_output.get("annotation_fields", []) + if field not in annotation_fields + ] + ) - if "REVEL" in config["annotations"]["vep"]["final_calls"]["plugins"]: - annotation_fields.append("REVEL") + if "REVEL" in config["annotations"]["vep"]["final_calls"]["plugins"]: + annotation_fields.append("REVEL") - append_items(annotation_fields, "ANN['{}']".format, lambda x: x.lower()) + append_items(annotation_fields, "ANN['{}']".format, lambda x: x.lower()) samples = get_group_sample_aliases(wildcards) @@ -977,6 +1067,10 @@ def get_primer_extra(wc, input): def get_datavzrd_data(impact="coding"): + calling_type = "variants" + if impact == "fusions": + impact = "fusions.joined" + calling_type = "fusions" pattern = "results/tables/{group}.{event}.{impact}.fdr-controlled.tsv" def inner(wildcards): @@ -984,14 +1078,14 @@ def get_datavzrd_data(impact="coding"): pattern, impact=impact, event=wildcards.event, - group=get_report_batch(wildcards), + group=get_report_batch(calling_type), ) return inner def get_oncoprint_input(wildcards): - groups = get_report_batch(wildcards) + groups = get_report_batch("variants") return expand( "results/tables/{group}.{event}.coding.fdr-controlled.tsv", group=groups, @@ -1052,14 +1146,20 @@ def get_fastqc_results(wildcards): ) # samtools idxstats - yield from expand("results/qc/{sample}.bam.idxstats", sample=group_samples) + yield from expand( + "results/qc/{sample}.bam.idxstats", + sample=group_samples, + ) # samtools stats - yield from expand("results/qc/{sample}.bam.stats", sample=group_samples) + yield from expand( + "results/qc/{sample}.bam.stats", + sample=group_samples, + ) def get_variant_oncoprints(wildcards): - if len(get_report_batch(wildcards)) > 1: + if len(_get_report_batch("variants", wildcards.batch)) > 1: return 
"results/tables/oncoprints/{wildcards.batch}.{wildcards.event}/variant-oncoprints" else: return [] @@ -1067,7 +1167,7 @@ def get_variant_oncoprints(wildcards): def get_oncoprint(oncoprint_type): def inner(wildcards): - if len(get_report_batch(wildcards)) > 1: + if len(_get_report_batch("variants", wildcards.batch)) > 1: oncoprint_path = ( f"results/tables/oncoprints/{wildcards.batch}.{wildcards.event}" ) diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index dba2be1fc..7f2b61d3b 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -1,6 +1,6 @@ rule split_call_tables: input: - "results/tables/{group}.{event}.fdr-controlled.tsv", + "results/tables/{group}.{event}.variants.fdr-controlled.tsv", output: coding="results/tables/{group}.{event}.coding.fdr-controlled.tsv", noncoding="results/tables/{group}.{event}.noncoding.fdr-controlled.tsv", @@ -16,6 +16,19 @@ rule split_call_tables: "../scripts/split-call-tables.py" +rule process_fusion_call_tables: + input: + "results/tables/{group}.{event}.fusions.fdr-controlled.tsv", + output: + fusions="results/tables/{group}.{event}.fusions.joined.fdr-controlled.tsv", + log: + "logs/join_partner/{group}.{event}.log", + conda: + "../envs/pandas.yaml" + script: + "../scripts/join_fusion_partner.py" + + rule prepare_oncoprint: input: calls=get_oncoprint_input, @@ -31,7 +44,7 @@ rule prepare_oncoprint: log: "logs/prepare_oncoprint/{batch}.{event}.log", params: - groups=get_report_batch, + groups=get_report_batch("variants"), labels=get_heterogeneous_labels(), conda: "../envs/oncoprint.yaml" @@ -39,18 +52,18 @@ rule prepare_oncoprint: "../scripts/oncoprint.py" -rule render_datavzrd_config: +rule render_datavzrd_variant_config: input: template=workflow.source_path( "../resources/datavzrd/variant-calls-template.datavzrd.yaml" ), variant_oncoprints=get_oncoprint("variant"), output: - "resources/datavzrd/{batch}.{event}.datavzrd.yaml", + "resources/datavzrd/{batch}.{event}.variants.datavzrd.yaml", params: gene_oncoprint=get_oncoprint("gene"), variant_oncoprints=get_variant_oncoprint_tables, - groups=get_report_batch, + groups=get_report_batch("variants"), coding_calls=get_datavzrd_data(impact="coding"), noncoding_calls=get_datavzrd_data(impact="noncoding"), spec_observations=workflow.source_path( @@ -71,7 +84,36 @@ rule render_datavzrd_config: labels=get_heterogeneous_labels(), oncoprint_sorted_datasets="results/tables/oncoprints/{batch}.{event}/label_sortings/", log: - "logs/datavzrd_render/{batch}.{event}.log", + "logs/datavzrd_render/{batch}.{event}.variants.log", + template_engine: + "yte" + + +rule render_datavzrd_fusions_config: + input: + template=workflow.source_path( + "../resources/datavzrd/fusion-calls-template.datavzrd.yaml" + ), + output: + "resources/datavzrd/{batch}.{event}.fusions.datavzrd.yaml", + params: + groups=get_report_batch("fusions"), + fusion_calls=get_datavzrd_data(impact="fusions"), + spec_observations=workflow.source_path( + "../resources/datavzrd/spec_observations.json" + ), + spec_short_observations=workflow.source_path( + "../resources/datavzrd/spec_short_observations.json" + ), + data_observations=workflow.source_path( + "../resources/datavzrd/data_observations.js" + ), + data_short_observations=workflow.source_path( + "../resources/datavzrd/data_short_observations.js" + ), + samples=samples, + log: + "logs/datavzrd_render/{batch}.{event}.fusions.log", template_engine: "yte" @@ -86,12 +128,14 @@ rule datavzrd_variants_calls: data_observations=workflow.source_path( 
"../resources/datavzrd/data_observations.js" ), - config="resources/datavzrd/{batch}.{event}.datavzrd.yaml", + config="resources/datavzrd/{batch}.{event}.variants.datavzrd.yaml", gene_oncoprint=get_oncoprint("gene"), variant_oncoprints=get_oncoprint("variant"), output: report( - directory("results/datavzrd-report/{batch}.{event}.fdr-controlled"), + directory( + "results/datavzrd-report/{batch}.{event}.variants.fdr-controlled" + ), htmlindex="index.html", caption="../report/calls.rst", category="Variant calls", @@ -104,6 +148,33 @@ rule datavzrd_variants_calls: "v3.0.2/utils/datavzrd" +rule datavzrd_fusion_calls: + input: + fusion_calls=get_datavzrd_data(impact="fusions"), + spec_observations=workflow.source_path( + "../resources/datavzrd/spec_observations.json" + ), + data_observations=workflow.source_path( + "../resources/datavzrd/data_observations.js" + ), + config="resources/datavzrd/{batch}.{event}.fusions.datavzrd.yaml", + output: + report( + directory( + "results/datavzrd-report/{batch}.{event}.fusions.fdr-controlled" + ), + htmlindex="index.html", + caption="../report/calls.rst", + category="Fusion calls", + labels=get_datavzrd_report_labels, + subcategory=get_datavzrd_report_subcategory, + ), + log: + "logs/datavzrd_report/{batch}.{event}.log", + wrapper: + "v3.0.2/utils/datavzrd" + + rule bedtools_merge: input: left="results/regions/{group}/{sample}.regions.bed.gz", diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index 582a31300..a993dfb3d 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -21,9 +21,9 @@ rule filter_by_annotation: bcf=get_annotated_bcf, aux=get_annotation_filter_aux_files, output: - "results/calls/{group}.{event}.{scatteritem}.filtered_ann.bcf", + "results/calls/{group}.{event}.{calling_type}.{scatteritem}.filtered_ann.bcf", log: - "logs/filter-calls/annotation/{group}.{event}.{scatteritem}.log", + "logs/filter-calls/annotation/{group}.{event}.{calling_type}.{scatteritem}.log", params: filter=get_annotation_filter_expression, aux=get_annotation_filter_aux, @@ -35,15 +35,15 @@ rule filter_by_annotation: rule filter_odds: input: - "results/calls/{group}.{event}.{scatteritem}.filtered_ann.bcf", + "results/calls/{group}.{event}.{calling_type}.{scatteritem}.filtered_ann.bcf", output: - "results/calls/{group}.{event}.{scatteritem}.filtered_odds.bcf", + "results/calls/{group}.{event}.{calling_type}.{scatteritem}.filtered_odds.bcf", params: events=lambda wc: config["calling"]["fdr-control"]["events"][wc.event][ "varlociraptor" ], log: - "logs/filter-calls/posterior_odds/{group}.{event}.{scatteritem}.log", + "logs/filter-calls/posterior_odds/{group}.{event}.{calling_type}.{scatteritem}.log", conda: "../envs/varlociraptor.yaml" shell: @@ -55,9 +55,9 @@ rule gather_calls: calls=get_gather_calls_input(), idx=get_gather_calls_input(ext="bcf.csi"), output: - "results/calls/{group}.{event}.filtered_{by}.bcf", + "results/calls/{group}.{event}.{calling_type}.filtered_{by}.bcf", log: - "logs/gather-calls/{group}.{event}.filtered_{by}.log", + "logs/gather-calls/{group}.{event}.{calling_type}.filtered_{by}.log", params: extra="-a", wrapper: @@ -68,9 +68,9 @@ rule control_fdr: input: get_control_fdr_input, output: - "results/calls/{group}.{vartype}.{event}.fdr-controlled.bcf", + "results/calls/{group}.{vartype}.{event}.{calling_type}.fdr-controlled.bcf", log: - "logs/control-fdr/{group}.{vartype}.{event}.log", + "logs/control-fdr/{group}.{vartype}.{event}.{calling_type}.log", params: query=get_fdr_control_params, conda: @@ -85,9 
+85,9 @@ rule merge_calls:
         calls=get_merge_calls_input("bcf"),
         idx=get_merge_calls_input("bcf.csi"),
     output:
-        "results/final-calls/{group}.{event}.fdr-controlled.bcf",
+        "results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.bcf",
     log:
-        "logs/merge-calls/{group}.{event}.log",
+        "logs/merge-calls/{group}.{event}.{calling_type}.log",
     params:
         extra="-a",
     wrapper:
@@ -96,11 +96,11 @@


 rule convert_phred_scores:
     input:
-        "results/final-calls/{group}.{event}.fdr-controlled.bcf",
+        "results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.bcf",
     output:
-        "results/final-calls/{group}.{event}.fdr-controlled.normal-probs.bcf",
+        "results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.normal-probs.bcf",
     log:
-        "logs/convert-phred-scores/{group}.{event}.log",
+        "logs/convert-phred-scores/{group}.{event}.{calling_type}.log",
     conda:
         "../envs/varlociraptor.yaml"
     shell:
diff --git a/workflow/rules/fusion_calling.smk b/workflow/rules/fusion_calling.smk
new file mode 100644
index 000000000..3d63adb8f
--- /dev/null
+++ b/workflow/rules/fusion_calling.smk
@@ -0,0 +1,113 @@
+module fusion_calling:
+    meta_wrapper:
+        "v3.3.3/meta/bio/star_arriba"
+    config:
+        config
+
+
+use rule * from fusion_calling
+
+
+use rule star_index from fusion_calling with:
+    input:
+        fasta=rules.get_genome.output,
+        gtf=rules.get_annotation.output,
+
+
+use rule star_align from fusion_calling with:
+    input:
+        fq1=get_star_reads_input,
+        fq2=lambda wc: get_star_reads_input(wc, True),
+        idx=rules.star_index.output,
+        annotation=rules.get_annotation.output,
+    output:
+        aln="results/mapped/star/{sample}.bam",
+        reads_per_gene="results/mapped/star/{sample}.ReadsPerGene.tsv",
+    params:
+        # specific parameters to work well with arriba
+        extra=lambda wc, input: f"--quantMode GeneCounts --sjdbGTFfile {input.annotation} {get_star_read_group(wc)}"
+        " --outSAMtype BAM SortedByCoordinate --chimSegmentMin 10 --chimOutType WithinBAM SoftClip"
+        " --chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 --chimScoreJunctionNonGTAG 0"
+        " --chimScoreSeparation 1 --alignSJstitchMismatchNmax 5 -1 5 5 --chimSegmentReadGapMax 3",
+
+
+use rule arriba from fusion_calling with:
+    input:
+        bam="results/mapped/star/{sample}.bam",
+        genome=rules.get_genome.output,
+        annotation=rules.get_annotation.output,
+    output:
+        fusions="results/arriba/{sample}.fusions.tsv",
+        discarded="results/arriba/{sample}.fusions.discarded.tsv",
+
+
+rule annotate_exons:
+    input:
+        fusions="results/arriba/{sample}.fusions.tsv",
+        annotation=rules.get_annotation.output,
+    output:
+        "results/arriba/{sample}.fusions.annotated.tsv",
+    conda:
+        "../envs/arriba.yaml"
+    log:
+        "logs/annotate_fusions/{sample}.log",
+    shell:
+        """
+        annotate_exon_numbers.sh {input.fusions} {input.annotation} {output} 2> {log}
+        """
+
+
+# Use the script provided by arriba once a new version is released
+rule convert_fusions:
+    input:
+        fasta=rules.get_genome.output,
+        fai=genome_fai,
+        fusions="results/arriba/{sample}.fusions.annotated.tsv",
+    output:
+        temp("results/candidate-calls/{sample}.arriba.vcf"),
+    params:
+        script_path=workflow.source_path("../scripts/convert_fusions_to_vcf.sh"),
+    conda:
+        "../envs/arriba.yaml"
+    log:
+        "logs/convert_fusions/{sample}.log",
+    shell:
+        """
+        bash {params.script_path} {input.fasta} {input.fusions} {output} 2> {log}
+        """
+
+
+rule sort_arriba_calls:
+    input:
+        "results/candidate-calls/{sample}.arriba.vcf",
+    output:
+        temp("results/candidate-calls/{sample}.arriba.bcf"),
+    params:
+        # Set to True, in case you want uncompressed BCF output
+        uncompressed_bcf=False,
+        # Extra arguments
+        extras="",
+    log:
+        "logs/bcf-sort/{sample}.log",
+    resources:
+        mem_mb=8000,
+    wrapper:
+        "v1.21.0/bio/bcftools/sort"
+
+
+rule bcftools_concat_candidates:
+    input:
+        calls=get_arriba_group_candidates,
+        idx=lambda wc: get_arriba_group_candidates(wc, csi=True),
+    output:
+        "results/candidate-calls/{group}.arriba.bcf",
+    log:
+        "logs/concat_candidates/{group}.log",
+    params:
+        uncompressed_bcf=False,
+        extra="-d exact -a",
+    threads: 4
+    resources:
+        mem_mb=10,
+    wrapper:
+        "v1.21.0/bio/bcftools/concat"
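To make the breakend encoding concrete: convert_fusions_to_vcf.sh (added further below) emits each fusion as a pair of VCF BND records that reference each other via MATEID and carry the propagated GENE_NAME/GENE_ID/EXON fields, which varlociraptor then preprocesses as candidates. A sketch of such a pair with invented coordinates, qualities, and exon numbers, using the well-known TMPRSS2/ERG fusion as example genes:

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
21	41494380	1a	C	C]22:23289621]	5	PASS	SVTYPE=BND;MATEID=1b;GENE_NAME=TMPRSS2;GENE_ID=ENSG00000184012;EXON=2
22	23289621	1b	A	A]21:41494380]	5	PASS	SVTYPE=BND;MATEID=1a;GENE_NAME=ERG;GENE_ID=ENSG00000157554;EXON=4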
diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk
index a6c63c423..b94dc443b 100644
--- a/workflow/rules/mapping.smk
+++ b/workflow/rules/mapping.smk
@@ -3,7 +3,7 @@ rule map_reads:
         reads=get_map_reads_input,
         idx=rules.bwa_index.output,
     output:
-        temp("results/mapped/{sample}.bam"),
+        temp("results/mapped/bwa/{sample}.bam"),
     log:
         "logs/bwa_mem/{sample}.log",
     params:
@@ -30,25 +30,23 @@ rule merge_untrimmed_fastqs:


 rule annotate_umis:
     input:
-        bam="results/mapped/{sample}.bam",
+        bam="results/mapped/{aligner}/{sample}.bam",
         umi=get_umi_fastq,
     output:
-        temp("results/mapped/{sample}.annotated.bam"),
+        temp("results/mapped/{aligner}/{sample}.annotated.bam"),
     params:
         extra=get_umi_read_structure,
     resources:
         mem_mb=lambda wc, input: 2.5 * input.size_mb,
     log:
-        "logs/fgbio/annotate_bam/{sample}.log",
+        "logs/fgbio/annotate_bam/{aligner}/{sample}.log",
     wrapper:
         "v2.3.2/bio/fgbio/annotatebamwithumis"


 rule mark_duplicates:
     input:
-        bams=lambda wc: "results/mapped/{sample}.annotated.bam"
-        if sample_has_umis(wc.sample)
-        else "results/mapped/{sample}.bam",
+        bams=get_markduplicates_input,
     output:
         bam=temp("results/dedup/{sample}.bam"),
         metrics="results/qc/dedup/{sample}.metrics.txt",
@@ -125,6 +123,26 @@ rule sort_consensus_reads:
         "v2.3.2/bio/samtools/sort"


+# TODO Does not use consensus reads
+rule splitncigarreads:
+    input:
+        bam=lambda wc: "results/dedup/{sample}.bam"
+        if is_activated("remove_duplicates")
+        else "results/mapped/star/{sample}.bam",
+        ref=genome,
+    output:
+        "results/split/{sample}.bam",
+    log:
+        "logs/gatk/splitNCIGARreads/{sample}.log",
+    params:
+        extra="",  # optional
+        java_opts="",  # optional
+    resources:
+        mem_mb=1024,
+    wrapper:
+        "v3.1.0/bio/gatk/splitncigarreads"
+
+
 rule recalibrate_base_qualities:
     input:
         bam=get_recalibrate_quality_input,
diff --git a/workflow/rules/mutational_burden.smk b/workflow/rules/mutational_burden.smk
index d5f32f337..2571921a5 100644
--- a/workflow/rules/mutational_burden.smk
+++ b/workflow/rules/mutational_burden.smk
@@ -17,7 +17,7 @@ if config["mutational_burden"]["activate"]:

 rule estimate_mutational_burden:
     input:
-        calls="results/final-calls/{group}.annotated.bcf",
+        calls="results/final-calls/{group}.variants.annotated.bcf",
         coverage_breadth="results/regions/{group}.covered_regions.filtered.coding.coverage_breadth.txt",
     output:
         report(
diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
index 5ee17d623..4917d0645 100644
--- a/workflow/rules/ref.smk
+++ b/workflow/rules/ref.smk
@@ -118,8 +118,6 @@ rule bwa_index:
         idx=multiext(genome, ".amb", ".ann", ".bwt", ".pac", ".sa"),
     log:
         "logs/bwa_index.log",
-    resources:
-        mem_mb=369000,
     cache: True
     wrapper:
         "v2.3.2/bio/bwa/index"
@@ -136,7 +134,7 @@ rule get_vep_cache:
         "logs/vep/cache.log",
     cache: "omit-software"
     wrapper:
-        "v2.3.2/bio/vep/cache"
+        "v3.3.5/bio/vep/cache"


 rule get_vep_plugins:
@@ -147,4 +145,4 @@
     log:
         "logs/vep/plugins.log",
     wrapper:
-        "v2.3.2/bio/vep/plugins"
+        "v3.3.5/bio/vep/plugins"
"v3.3.5/bio/vep/plugins" diff --git a/workflow/rules/table.smk b/workflow/rules/table.smk index 834f323c5..9b8028bbe 100644 --- a/workflow/rules/table.smk +++ b/workflow/rules/table.smk @@ -1,15 +1,15 @@ rule vembrane_table: input: - bcf="results/final-calls/{group}.{event}.fdr-controlled.normal-probs.bcf", + bcf="results/final-calls/{group}.{event}.{calling_type}.fdr-controlled.normal-probs.bcf", scenario="results/scenarios/{group}.yaml", output: - bcf="results/tables/{group}.{event}.fdr-controlled.tsv", + bcf="results/tables/{group}.{event}.{calling_type}.fdr-controlled.tsv", conda: "../envs/vembrane.yaml" params: config=lambda wc, input: get_vembrane_config(wc, input), log: - "logs/vembrane-table/{group}.{event}.log", + "logs/vembrane-table/{group}.{event}.{calling_type}.log", shell: 'vembrane table --header "{params.config[header]}" "{params.config[expr]}" ' "{input.bcf} > {output.bcf} 2> {log}" diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml index eb1ddfd51..bf69e7e39 100644 --- a/workflow/schemas/samples.schema.yaml +++ b/workflow/schemas/samples.schema.yaml @@ -32,6 +32,19 @@ properties: minimum: 0.0 maximum: 1.0 description: Purity to use for tumor/normal groups. + datatype: + type: string + enum: + - dna + - rna + calling: + type: string + enum: + - variants + - fusions + - fusions,variants + - variants,fusions + description: datatype of sample primer_panel: type: string description: ID of primer panel @@ -48,6 +61,8 @@ required: - alias - group - platform + - datatype + - calling dependentRequired: umi_read: ["umi_read_structure"] \ No newline at end of file diff --git a/workflow/scripts/convert_fusions_to_vcf.sh b/workflow/scripts/convert_fusions_to_vcf.sh new file mode 100755 index 000000000..f7618e80f --- /dev/null +++ b/workflow/scripts/convert_fusions_to_vcf.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# this script is part of arriba (https://github.com/suhrig/arriba) and published under the MIT Expat license. +# source: https://github.com/suhrig/arriba/blob/910478/scripts/convert_fusions_to_vcf.sh + + +# parse command-line arguments +if [ $# -ne 3 ]; then + echo Usage: $(basename "$0") assembly.fa input_fusions.tsv output_fusions.vcf + echo + echo "Description: This script converts fusion predictions from Arriba's custom tab-separated format to the standards-compliant VCF 4.3 format." + echo "Note: If a FastA index (.fai) does not exist for the given assembly file, it will be created on-the-fly." + exit 1 +fi 1>&2 +ASSEMBLY="$1" +INPUT=$(cat "$2") +OUTPUT="$3" + +# tell bash to abort on error +set -e -u -o pipefail + +# make sure required software is installed +if ! [[ $(samtools --version-only 2> /dev/null) =~ ^1\. ]]; then + echo "samtools >= 1.0 must be installed" 1>&2 + exit 1 +fi + +# create FastA index, if necessary +if [ ! 
-e "$ASSEMBLY.fai" ]; then + echo "Indexing FastA file" 1>&2 + samtools faidx "$ASSEMBLY" +fi + +if [ $(head -n1 <<<"$INPUT" | cut -f31) = "exon_number1" ]; then HAS_EXONS=true; else HAS_EXONS=false; fi + +# print VCF header +echo '##fileformat=VCFv4.3' > "$OUTPUT" +echo "$INPUT" | cut -f5-6 | tr '\t' '\n' | sed -e 's/:[0-9]*$//' | sort -u | +awk -F '\t' ' + FILENAME == "/dev/stdin" { contigs[$0] } + FILENAME != "/dev/stdin" && $1 in contigs { print "##contig=" } +' /dev/stdin "$ASSEMBLY.fai" >> "$OUTPUT" +echo '##FILTER= +##INFO= +##INFO= +##INFO= +##INFO=' >> "$OUTPUT" +if [ $HAS_EXONS = true ]; then + echo '##INFO=' >> "$OUTPUT" +fi +echo '#CHROM POS ID REF ALT QUAL FILTER INFO' >> "$OUTPUT" + +# read TSV file and convert it to VCF line-by-line +FUSION=0 +tail -n +2 <<<"$INPUT" | while read LINE; do + FUSION=$((FUSION+1)) + SITE1=$(cut -f7 <<<"$LINE") + SITE2=$(cut -f8 <<<"$LINE") + GENE_NAME1=$([ "$SITE1" = "intergenic" ] || cut -f1 <<<"$LINE") + GENE_NAME2=$([ "$SITE2" = "intergenic" ] || cut -f2 <<<"$LINE") + GENE_ID1=$([ "$SITE1" = "intergenic" ] || cut -f21 <<<"$LINE") + GENE_ID2=$([ "$SITE2" = "intergenic" ] || cut -f22 <<<"$LINE") + BREAKPOINT1=$(cut -f5 <<<"$LINE"); CHROMOSOME1="${BREAKPOINT1%:*}"; POSITION1="${BREAKPOINT1##*:}" + BREAKPOINT2=$(cut -f6 <<<"$LINE"); CHROMOSOME2="${BREAKPOINT2%:*}"; POSITION2="${BREAKPOINT2##*:}" + QUAL=$(cut -f15 <<<"$LINE" | sed -e 's/low/0.5/' -e 's/medium/2/' -e 's/high/5/') + REF1=$(samtools faidx "$ASSEMBLY" "$BREAKPOINT1-$POSITION1" | tail -n1 | tr '[:lower:]' '[:upper:]') + REF2=$(samtools faidx "$ASSEMBLY" "$BREAKPOINT2-$POSITION2" | tail -n1 | tr '[:lower:]' '[:upper:]') + NON_TEMPLATE_BASES=$(cut -f28 <<<"$LINE" | tr '[:lower:]' '[:upper:]' | sed -n -e 's/.*|\([^|]*\)|.*/\1/p') # get bases between two pipes in transcript sequence + STRAND1=$(cut -f3 <<<"$LINE" | cut -f2 -d/) + if [ "$STRAND1" = "-" ]; then NON_TEMPLATE_BASES=$(tr ATCG TAGC <<<"$NON_TEMPLATE_BASES"); fi # complement bases on reverse strand + DIRECTION1=$(cut -f25 <<<"$LINE") + DIRECTION2=$(cut -f26 <<<"$LINE") + ALT1="$REF1$NON_TEMPLATE_BASES" + ALT2="$NON_TEMPLATE_BASES$REF2" + if [ "$DIRECTION1" = "upstream" ]; then ALT1=$(rev <<<"$ALT1"); fi + if [ "$DIRECTION2" = "downstream" ]; then ALT2=$(rev <<<"$ALT2"); fi + if [ "$DIRECTION1" = "downstream" ]; then ALT2_BREAKPOINT="]$BREAKPOINT1]"; else ALT2_BREAKPOINT="[$BREAKPOINT1["; fi + if [ "$DIRECTION2" = "downstream" ]; then ALT1_BREAKPOINT="]$BREAKPOINT2]"; else ALT1_BREAKPOINT="[$BREAKPOINT2["; fi + if [ "$DIRECTION1" = "downstream" ]; then ALT1="$ALT1$ALT1_BREAKPOINT"; else ALT1="$ALT1_BREAKPOINT$ALT1"; fi + if [ "$DIRECTION2" = "downstream" ]; then ALT2="$ALT2$ALT2_BREAKPOINT"; else ALT2="$ALT2_BREAKPOINT$ALT2"; fi + FILTER="PASS" + INFO1="SVTYPE=BND;MATEID=${FUSION}b;GENE_NAME=$GENE_NAME1;GENE_ID=$GENE_ID1" + INFO2="SVTYPE=BND;MATEID=${FUSION}a;GENE_NAME=$GENE_NAME2;GENE_ID=$GENE_ID2" + if [ $HAS_EXONS = true ]; then + EXON_INFO1=";EXON=$(cut -f31 <<<"$LINE")" + EXON_INFO2=";EXON=$(cut -f32 <<<"$LINE")" + else + EXON_INFO1="" + EXON_INFO2="" + fi + echo "$CHROMOSOME1 $POSITION1 ${FUSION}a $REF1 $ALT1 $QUAL $FILTER $INFO1$EXON_INFO1" >> "$OUTPUT" + echo "$CHROMOSOME2 $POSITION2 ${FUSION}b $REF2 $ALT2 $QUAL $FILTER $INFO2$EXON_INFO2" >> "$OUTPUT" +done + diff --git a/workflow/scripts/join_fusion_partner.py b/workflow/scripts/join_fusion_partner.py new file mode 100644 index 000000000..9035d2a26 --- /dev/null +++ b/workflow/scripts/join_fusion_partner.py @@ -0,0 +1,90 @@ +import sys + +sys.stderr = 
diff --git a/workflow/scripts/join_fusion_partner.py b/workflow/scripts/join_fusion_partner.py
new file mode 100644
index 000000000..9035d2a26
--- /dev/null
+++ b/workflow/scripts/join_fusion_partner.py
@@ -0,0 +1,90 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import json
+import re
+
+import numpy as np
+import pandas as pd
+
+
+def write(df, path):
+    if not df.empty:
+        remaining_columns = df.dropna(how="all", axis="columns").columns.tolist()
+        df = df[remaining_columns]
+    df.to_csv(path, index=False, sep="\t")
+
+
+def get_prefix_columns(df, prefix):
+    return df.columns[df.columns.str.startswith(prefix)]
+
+
+def get_samples(df):
+    return list(
+        df.columns[df.columns.str.endswith(": allele frequency")].str.replace(
+            ": allele frequency", ""
+        )
+    )
+
+
+def drop_shared_columns(df):
+    prob_columns = get_prefix_columns(df, "prob: ")
+    df = df.drop(prob_columns, axis="columns")
+    samples = get_samples(df)
+    sample_columns = [
+        column for sample in samples for column in get_prefix_columns(df, sample)
+    ]
+    return df.drop(sample_columns, axis="columns")
+
+
+def join_short_obs(df, samples):
+    for sample in samples:
+        sobs_loc = df.columns.get_loc(f"{sample}: observations")
+        df.insert(
+            sobs_loc,
+            f"{sample}: short observations",
+            df[f"{sample}: short ref observations"]
+            + ","
+            + df[f"{sample}: short alt observations"],
+        )
+    df = df.drop(
+        df.columns[
+            df.columns.str.endswith(": short ref observations")
+            | df.columns.str.endswith(": short alt observations")
+        ],
+        axis=1,
+    )
+    return df
+
+
+calls = pd.read_csv(
+    snakemake.input[0],
+    sep="\t",
+    dtype={"exon": "Int64"},
+)
+calls[["feature_name", "feature_id"]] = calls[["feature_id", "feature_name"]].fillna(
+    "('intronic',)"
+)
+calls = calls.drop(
+    ["reference allele", "alternative allele", "end position", "event"], axis="columns"
+)
+# Obtain first entry of columns annotated as tuple by arriba
+for col in ["feature_id", "feature_name"]:
+    calls[col] = calls[col].apply(eval).apply(lambda x: x[0])
+
+first_partners = calls["id"].str.contains("a$")
+first_calls = calls[first_partners]
+first_calls = drop_shared_columns(first_calls)
+second_calls = calls[~first_partners]
+samples = get_samples(second_calls)
+second_calls = join_short_obs(second_calls, samples)
+paired_fusions = first_calls.merge(
+    second_calls,
+    left_on="mateid",
+    right_on="id",
+    suffixes=("_partner1", "_partner2"),
+)
+paired_fusions = paired_fusions.filter(regex="^(?!mateid|id)")
+
+write(paired_fusions, snakemake.output.fusions)
diff --git a/workflow/scripts/split-call-tables.py b/workflow/scripts/split-call-tables.py
index cc27cf59d..bd615f249 100644
--- a/workflow/scripts/split-call-tables.py
+++ b/workflow/scripts/split-call-tables.py
@@ -13,8 +13,7 @@


 def write(df, path):
-    df = df.drop(["mane_plus_clinical"], axis="columns", errors="ignore")
-    df = df.drop(["canonical"], axis="columns", errors="ignore")
+    df.loc[df["mane_plus_clinical"].notna(), "mane_plus_clinical"] = True
     if not df.empty:
         remaining_columns = df.dropna(how="all", axis="columns").columns.tolist()
         if path == snakemake.output.coding: