feat: fusion calling (#222)

* Initial commit * concat arriba calls * generalize workflow * update arriba * Update tests * typo * intermediate changes * remove repetetive code * fix typo * indexing * snakefmt * refactoring * fmt * fmt * fixed incompatibilities * skip vep * improve report * Fix final output * fix output * handle mutational burden * remove unused rules, separate groups * fix minor issues * cleanup * Add missing script * update readme * update freebayes * reset download revel * renaming * fix datatype handling * fmt * fmt * Update wrapper * breaking up workflow (not yet working) * fmt * fmt * fmt * invoked datatype and candidate-calling * fix formatting * unified workflow * formatting * update samplesheet * Add read group for star * fix param * support fusions and variants in rna * feat: render canonical transcript source * clean report * cleanup template * github action workaround * update readme * cleanup * fmt * introduce pattern delegatoin * fmt * add comment to script * typo * Update convert_fusions_to_vcf.sh
snakemake-workflows · Feb 27, 2024 · d0b45ba · d0b45ba
1 parent ee973c6
commit d0b45ba
Show file tree

Hide file tree

Showing 32 changed files with 861 additions and 227 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -147,7 +147,7 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--configfile .test/config-simple/config.yaml --report report.zip"
+        args: "--configfile .test/config-simple/config.yaml --cores 1 --report report.zip"
         show-disk-usage-on-error: true
 
     - name: Test workflow (local FASTQs, target regions)

diff --git a/.test/config-chm-eval/samples.tsv b/.test/config-chm-eval/samples.tsv
@@ -1,2 +1,2 @@
-sample_name	group	alias	platform
-chm		chm	ILLUMINA
+sample_name	group	alias	platform	datatype	calling
+chm		chm	ILLUMINA	dna	variants
diff --git a/.test/config-giab/samples.tsv b/.test/config-giab/samples.tsv
@@ -1,2 +1,2 @@
-sample_name	alias	group	platform	purity
-NA12878	NA12878	NA12878	ILLUMINA	
+sample_name	alias	group	platform	purity	datatype	calling
+NA12878	NA12878	NA12878	ILLUMINA		dna	variants
diff --git a/.test/config-no-candidate-filtering/samples.tsv b/.test/config-no-candidate-filtering/samples.tsv
@@ -1,3 +1,3 @@
-sample_name	group	alias	platform
-a		a	ILLUMINA
-b		b	ILLUMINA
+sample_name	group	alias	platform	datatype	calling
+a		a	ILLUMINA	dna	variants
+b		b	ILLUMINA	dna	variants
diff --git a/.test/config-simple/samples.tsv b/.test/config-simple/samples.tsv
@@ -1,5 +1,5 @@
-sample_name	group	alias	platform
-a	one	x	ILLUMINA
-b	one	y	ILLUMINA
-b	two	x	ILLUMINA
-a	two	y	ILLUMINA
+sample_name	group	alias	platform	datatype	calling
+a	one	x	ILLUMINA	dna	variants
+b	one	y	ILLUMINA	dna	variants
+b	two	x	ILLUMINA	dna	variants
+a	two	y	ILLUMINA	dna	variants
diff --git a/.test/config-sra/samples.tsv b/.test/config-sra/samples.tsv
@@ -1,5 +1,5 @@
-sample_name	group	alias	platform
-PD12A	medium_L	base	ILLUMINA
-PD13B	medium_L	changed	ILLUMINA
-PD09A	soil	changed	ILLUMINA
-PD12A	soil	base	ILLUMINA
+sample_name	group	alias	platform	datatype	calling
+PD12A	medium_L	base	ILLUMINA	dna	variants
+PD13B	medium_L	changed	ILLUMINA	dna	variants
+PD09A	soil	changed	ILLUMINA	dna	variants
+PD12A	soil	base	ILLUMINA	dna	variants
diff --git a/.test/config-target-regions/samples.tsv b/.test/config-target-regions/samples.tsv
@@ -1,3 +1,3 @@
-sample_name	group	alias	platform
-a		a	ILLUMINA
-b		b	ILLUMINA
+sample_name	group	alias	platform	datatype	calling
+a		a	ILLUMINA	dna	variants
+b		b	ILLUMINA	dna	variants
diff --git a/.test/config_primers/samples.tsv b/.test/config_primers/samples.tsv
@@ -1,3 +1,3 @@
-sample_name	group	alias	platform
-a		a	ILLUMINA
-b		b	ILLUMINA
+sample_name	group	alias	platform	datatype	calling
+a		a	ILLUMINA	dna	variants
+b		b	ILLUMINA	dna	variants
diff --git a/config/README.md b/config/README.md
@@ -4,11 +4,13 @@ To configure this workflow, modify ``config/config.yaml`` according to your need
 
 # Sample sheet
 
-Add samples to `config/samples.tsv`. For each sample, the columns `sample_name`, `alias`, `platform`, and `group` have to be defined. 
+Add samples to `config/samples.tsv`. For each sample, the columns `sample_name`, `alias`, `platform`, `datatype`, `calling` and `group` have to be defined. 
 * Samples within the same `group` can be referenced in a joint [Calling scenario](#calling-scenario) via their `alias`es.
 * `alias`es represent the name of the sample within its group. They are meant to be some abstract description of the sample type to be used in the [Calling scenario](#calling-scenario), and should thus be used consistently across groups. A classic example would be a combination of the `tumor` and `normal` aliases.
 * The `platform` column needs to contain the used sequencing plaform (one of 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT', 'ONT', 'PACBIO’).
 * The same `sample_name` entry can be used multiple times within a `samples.tsv` sample sheet, with only the value in the `group` column differing between repeated rows. This way, you can use the same sample for variant calling in different groups, for example if you use a panel of normal samples when you don't have matched normal samples for tumor variant calling.
+* The `datatype` column specifies what kind of data each sample corresponds to. This can either be `rna` or `dna`.
+* The `calling` column sets the kind of analysis to be performed. This can be either `fusions`, `variants` or both (comma separated). Fusion calling is still under developement and should be considered as experimental. 
 
 If mutational burdens shall be estimated for a sample, the to be used ``events`` from the calling scenario (see below) have to be specified in an additional column ``mutational_burden_events``. Multiple events have to be separated by commas within that column.
 

diff --git a/config/config.yaml b/config/config.yaml
@@ -22,7 +22,7 @@ ref:
   # Ensembl species name
   species: homo_sapiens
   # Ensembl release
-  release: 110
+  release: 111
   # Genome build
   build: GRCh38
   # Optionally, instead of downloading the whole reference from Ensembl via the
@@ -121,8 +121,9 @@ calling:
       # Add any number of events here to filter for.
       # The id of each event can be chosen freely, but needs to contain
       # only alphanumerics and underscores
-      # ("somatic" below is just an example and can be modified as needed).
+      # ("some_id" below is just an example and can be modified as needed).
       some_id:
+        types: ["variants", "fusions"]
         # labels for the callset, displayed in the report. Will fall back to id if no labels specified
         labels:
           some-label: label text

diff --git a/config/samples.tsv b/config/samples.tsv
@@ -1 +1 @@
-sample_name	alias	group	platform	purity	panel	umi_read	umi_read_structure
+sample_name	alias	group	platform	purity	panel	umi_read	umi_read_structure	datatype	calling
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -35,6 +35,7 @@ include: "rules/table.smk"
 include: "rules/regions.smk"
 include: "rules/plugins.smk"
 include: "rules/datavzrd.smk"
+include: "rules/fusion_calling.smk"
 include: "rules/testcase.smk"
 
 

diff --git a/workflow/envs/arriba.yaml b/workflow/envs/arriba.yaml
@@ -0,0 +1,5 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - arriba =2.4
diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml
@@ -2,4 +2,4 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - bcftools =1.14
+  - bcftools =1.16
diff --git a/workflow/envs/pandas.yaml b/workflow/envs/pandas.yaml
@@ -2,5 +2,5 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - pandas =1.4
+  - pandas =2.1
   - python =3.10
diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst
@@ -1,7 +1,7 @@
 This workflow generates annotated variant calls that can be viewed in interactive reports, showing all evidence levels provided by Varlociraptor_.
 Adapters were removed with Cutadapt_. Reads were mapped with `BWA MEM`_, PCR and optical duplicates were removed with Picard_.
 Candidate variant discovery was performed with Freebayes_ and Delly_. Statisticall assessment of variants was conducted with Varlociraptor_.
-Variant calling results, sorted by type, event, and impact can be found under `Variant calls`_.
+Fusion resp. variant calling results, sorted by type, event, and impact can be found under Fusion/Variant calls.
 The corresponding Varlociraptor_ scenarios, containing the detailed definition of events can be found unter `Variant calling scenarios`_.
 
 .. _Varlociraptor: https://varlociraptor.github.io

diff --git a/workflow/resources/datavzrd/fusion-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/fusion-calls-template.datavzrd.yaml
@@ -0,0 +1,57 @@
+name: ?f"Fusion calls {wildcards.event}"
+
+default-view: ?f"{params.groups[0]}-fusions"
+
+__definitions__:
+  - import os
+  - |
+    def read_file(path):
+        return open(path, 'r').read()
+
+datasets:
+  ?for group, path in zip(params.groups, params.fusion_calls):
+    ?f"{group}-fusions":
+      path: ?path
+      separator: "\t"
+
+views:
+  ?for group in params.groups:
+    ?f"{group}-fusions":
+          desc: ?f"Fusion calls.\n{config['calling']['fdr-control']['events'][wildcards.event]['desc']}"
+          dataset: ?f"{group}-fusions"
+          render-table:
+            columns:
+              "regex('.+: allele frequency')":
+                plot: 
+                  ticks:
+                    scale: "linear"
+                    domain: [0.0, 1.0]
+                    aux-domain-columns:
+                      - "regex('.+: allele frequency')"
+              "regex('.+: read depth')":
+                plot: 
+                  ticks:
+                    scale: "linear"
+                    aux-domain-columns:
+                      - "regex('.+: read depth')"
+              "regex('prob: .+')":
+                plot:
+                  heatmap:
+                    scale: linear
+                    domain: [0.0, 1.0]
+                    range:
+                      - white
+                      - "#1f77b4"
+              ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]:
+                '?f"{alias}: short observations"':
+                  optional: true
+                  custom-plot:
+                    data: ?read_file(params.data_short_observations)
+                    spec: ?read_file(params.spec_short_observations)
+                  display-mode: detail
+                '?f"{alias}: observations"':
+                  optional: true
+                  custom-plot:
+                    data: ?read_file(params.data_observations)
+                    spec-path: ?params.spec_observations
+                  display-mode: detail
diff --git a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml
@@ -58,6 +58,19 @@ __definitions__:
         return `https://${{build}}${{url_suffix}}${{hgvsg}}`
       }}
     """
+  - |
+    protein_id = f"""
+      function(row) {{
+        let protein_id = row.hgvsp.split(':')[0]
+        return protein_id
+      }}
+    """
+  - |
+    empty_content = f"""
+      function(row) {{
+        return ""
+      }}
+    """
 
 datasets:
   ?if input.variant_oncoprints:
@@ -290,6 +303,28 @@ views:
             display-mode: detail
           protein alteration (short):
             display-mode: detail
+          canonical:
+            optional: true
+            display-mode: detail
+            plot:
+              heatmap:
+                scale: "ordinal"
+                domain: ["", "True"]
+                range:
+                  - white
+                  - black
+                custom-content: ?empty_content
+          mane_plus_clinical:
+            optional: true
+            display-mode: detail
+            plot:
+              heatmap:
+                scale: "ordinal"
+                domain: ["", "True"]
+                range:
+                  - white
+                  - black
+                custom-content: ?empty_content
           ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]:
             '?f"{alias}: short observations"':
               optional: true
@@ -310,6 +345,10 @@ views:
           query_genomenexus:
             value: ?genomenexus_link
             display-mode: hidden
+          ensembl_protein_id:
+            value: ?protein_id
+            display-mode: detail
+
 
 
 
@@ -404,6 +443,28 @@ views:
             display-mode: detail
           alternative allele:
             display-mode: detail
+          canonical:
+            optional: true
+            display-mode: detail
+            plot:
+              heatmap:
+                scale: "ordinal"
+                domain: ["", "True"]
+                range:
+                  - white
+                  - black
+                custom-content: ?empty_content
+          mane_plus_clinical:
+            optional: true
+            display-mode: detail
+            plot:
+              heatmap:
+                scale: "ordinal"
+                domain: ["", "True"]
+                range:
+                  - white
+                  - black
+                custom-content: ?empty_content
           ?for alias in params.samples.loc[params.samples["group"] == group, "alias"]:
             '?f"{alias}: short observations"':
               optional: true

diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk
@@ -19,21 +19,21 @@ rule annotate_candidate_variants:
         "benchmarks/vep/{group}.{caller}.{scatteritem}.annotate_candidates.tsv"
     threads: get_vep_threads()
     wrapper:
-        "v2.5.0/bio/vep/annotate"
+        "v3.3.5/bio/vep/annotate"
 
 
 rule annotate_variants:
     input:
-        calls="results/calls/{group}.{scatteritem}.bcf",
+        calls="results/calls/{group}.{calling_type}.{scatteritem}.bcf",
         cache="resources/vep/cache",
         plugins="resources/vep/plugins",
         revel=lambda wc: get_plugin_aux("REVEL"),
         revel_tbi=lambda wc: get_plugin_aux("REVEL", True),
         fasta=genome,
         fai=genome_fai,
     output:
-        calls="results/calls/{group}.{scatteritem}.annotated.bcf",
-        stats="results/calls/{group}.{scatteritem}.stats.html",
+        calls="results/calls/{group}.{calling_type}.{scatteritem}.annotated.bcf",
+        stats="results/calls/{group}.{calling_type}.{scatteritem}.stats.html",
     params:
         # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
         # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
@@ -42,10 +42,10 @@ rule annotate_variants:
             config["annotations"]["vep"]["final_calls"]["params"]
         ),
     log:
-        "logs/vep/{group}.{scatteritem}.annotate.log",
+        "logs/vep/{group}.{calling_type}.{scatteritem}.annotate.log",
     threads: get_vep_threads()
     wrapper:
-        "v2.5.0/bio/vep/annotate"
+        "v3.3.5/bio/vep/annotate"
 
 
 # TODO What about multiple ID Fields?
@@ -89,9 +89,9 @@ rule gather_annotated_calls:
         calls=get_gather_annotated_calls_input(),
         idx=get_gather_annotated_calls_input(ext="bcf.csi"),
     output:
-        "results/final-calls/{group}.annotated.bcf",
+        "results/final-calls/{group}.{calling_type}.annotated.bcf",
     log:
-        "logs/gather-annotated-calls/{group}.log",
+        "logs/gather-annotated-calls/{group}.{calling_type}.log",
     params:
         extra="-a",
     wrapper:
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		sample_name alias group platform purity panel umi_read umi_read_structure
		sample_name alias group platform purity panel umi_read umi_read_structure datatype calling