From cb85e79a57dc7e5bee32197522be25b700bdcc38 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 15:05:14 -0400 Subject: [PATCH 01/40] Update conf.py --- docs/conf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 784ea0ab..fea7b6be 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,7 +46,7 @@ # General information about the project. project = u'SvAnna' -copyright = u'2021' +copyright = u'2021, Daniel Danis, Peter N Robinson' author = u'Daniel Danis, Peter Robinson' # The version info for the project you're documenting, acts as replacement for @@ -142,7 +142,7 @@ # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'SvAnna.tex', u'svann Documentation', - u'Peter Robinson', 'manual'), + u'Daniel Danis, Peter N Robinson', 'manual'), ] From c2d053956b5a43133b10bfd5abb67b1a655c119e Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 15:09:32 -0400 Subject: [PATCH 02/40] Update index.rst and Makefile --- docs/Makefile | 5 ++--- docs/index.rst | 13 ++++++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index a0b36c9b..a324abe2 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -3,11 +3,10 @@ # You can set these variables from the command line. SPHINXOPTS = -SPHINXBUILD = python -msphinx -SPHINXPROJ = svann +SPHINXBUILD = sphinx-build +SPHINXPROJ = SvAnna SOURCEDIR = . BUILDDIR = _build -html_static_path = ['..'] # Put it first so that "make" without argument is like "make help". help: diff --git a/docs/index.rst b/docs/index.rst index 40f60cb6..d3e200f7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,9 +1,7 @@ -SvAnna: Annotation of Structural Variants in VCF files -===================================================== +SvAnna: +======= - -SvAnna -~~~~~ +Efficient and accurate pathogenicity prediction for coding and regulatory structural variants in long-read genome sequencing This application annotates structural variants in VCF files, focussing specifically on long-read WGS analysis of germline variants. @@ -15,8 +13,9 @@ of germline variants. :caption: Contents: setup - enhancers running - BND + outputformats structuralvariation + enhancers + BND From abd9f678b33d08da066f2708b7a50cf9d61d38d1 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 18:15:08 -0400 Subject: [PATCH 03/40] Next development iteration 1.0.0-RC2-SNAPSHOT --- pom.xml | 2 +- svanna-benchmark/pom.xml | 2 +- svanna-cli/pom.xml | 2 +- svanna-core/pom.xml | 2 +- svanna-db/pom.xml | 2 +- svanna-ingest/pom.xml | 2 +- svanna-io/pom.xml | 2 +- svanna-spring-boot-autoconfigure/pom.xml | 2 +- svanna-spring-boot-starter/pom.xml | 2 +- svanna-test/pom.xml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pom.xml b/pom.xml index 0ec8fe4f..69fd4390 100644 --- a/pom.xml +++ b/pom.xml @@ -18,7 +18,7 @@ svanna-benchmark org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT SvAnna diff --git a/svanna-benchmark/pom.xml b/svanna-benchmark/pom.xml index ae506972..51419de3 100644 --- a/svanna-benchmark/pom.xml +++ b/svanna-benchmark/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 diff --git a/svanna-cli/pom.xml b/svanna-cli/pom.xml index 60366036..eabf316d 100644 --- a/svanna-cli/pom.xml +++ b/svanna-cli/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 svanna-cli diff --git a/svanna-core/pom.xml b/svanna-core/pom.xml index 346f1cec..008fa04d 100644 --- a/svanna-core/pom.xml +++ b/svanna-core/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 diff --git a/svanna-db/pom.xml b/svanna-db/pom.xml index 68aad568..5be9aaaf 100644 --- a/svanna-db/pom.xml +++ b/svanna-db/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 diff --git a/svanna-ingest/pom.xml b/svanna-ingest/pom.xml index c68d5d1c..b56d28e3 100644 --- a/svanna-ingest/pom.xml +++ b/svanna-ingest/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 svanna-ingest diff --git a/svanna-io/pom.xml b/svanna-io/pom.xml index c2f58dc0..30f15de0 100644 --- a/svanna-io/pom.xml +++ b/svanna-io/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 diff --git a/svanna-spring-boot-autoconfigure/pom.xml b/svanna-spring-boot-autoconfigure/pom.xml index 514b5d1b..8d9a0394 100644 --- a/svanna-spring-boot-autoconfigure/pom.xml +++ b/svanna-spring-boot-autoconfigure/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 diff --git a/svanna-spring-boot-starter/pom.xml b/svanna-spring-boot-starter/pom.xml index 4e40ffa3..f2164368 100644 --- a/svanna-spring-boot-starter/pom.xml +++ b/svanna-spring-boot-starter/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 diff --git a/svanna-test/pom.xml b/svanna-test/pom.xml index 00c63b13..bfc02896 100644 --- a/svanna-test/pom.xml +++ b/svanna-test/pom.xml @@ -5,7 +5,7 @@ SvAnna org.jax.svanna - 1.0.0-RC1 + 1.0.0-RC2-SNAPSHOT 4.0.0 From acc50cd942bb21570f608f9969948c5c33368fb1 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 18:31:17 -0400 Subject: [PATCH 04/40] Update index pages --- README.md | 5 ++++- docs/index.rst | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 631a99f4..4d000529 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,7 @@ -# SvAnna +# SvAnna - Structural Variant Annotation and Analysis + +![Java CI with Maven](https://github.com/TheJacksonLaboratory/SvAnna/workflows/Java%20CI%20with%20Maven/badge.svg) +[![Documentation Status](https://readthedocs.org/projects/squirls/badge/?version=latest)](https://svanna.readthedocs.io/en/latest/?badge=latest) Efficient and accurate pathogenicity prediction for coding and regulatory structural variants in long-read genome sequencing diff --git a/docs/index.rst b/docs/index.rst index d3e200f7..4fe976fc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,6 +12,7 @@ of germline variants. :maxdepth: 2 :caption: Contents: + quickstart setup running outputformats From c0e933934e46ff2f170e792fde2beedd7f0f4883 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 18:32:08 -0400 Subject: [PATCH 05/40] Add link to RTD into README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 631a99f4..98b9864e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Efficient and accurate pathogenicity prediction for coding and regulatory struct Most users should download the latest SvAnna distribution ZIP file from the [Releases page](https://github.com/TheJacksonLaboratory/SvAnna/releases). -Please consult the Read the docs site for detailed documentation - TODO - setup RTD. +Please consult the Read the docs site for [detailed documentation](https://svanna.readthedocs.io/en/latest). ## Attic From 07cd51035b734db97ec26e54dadd7d09aca66814 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 18:34:32 -0400 Subject: [PATCH 06/40] Draft Quickstart and Setup --- docs/outputformats.rst | 7 ++ docs/quickstart.rst | 73 +++++++++++++++++ docs/setup.rst | 178 +++++++++++++++++++++++++++++------------ 3 files changed, 208 insertions(+), 50 deletions(-) create mode 100644 docs/outputformats.rst create mode 100644 docs/quickstart.rst diff --git a/docs/outputformats.rst b/docs/outputformats.rst new file mode 100644 index 00000000..8fce4e1c --- /dev/null +++ b/docs/outputformats.rst @@ -0,0 +1,7 @@ +.. _rstoutputformats: + +============== +Output formats +============== + +.. TODO - Write sections diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 00000000..24c0371f --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,73 @@ +.. _rstquickstart: + +========== +Quickstart +========== + +This document is intended for the impatient users who want to quickly setup and prioritize variants with SvAnna. + +Prerequisites +^^^^^^^^^^^^^ + +SvAnna is written in Java 11 and needs Java 11+ to be present in the runtime environment. Please verify that you are +running Java 11+ by running:: + + java -version + + +SvAnna setup +^^^^^^^^^^^^ + +Run the following commands to setup SvAnna. + +Download and extract SvAnna distribution ZIP archive from `here `_. Then, run:: + + java -jar svanna-cli-1.0.0-RC1/svanna-cli-1.0.0-RC1.jar --help + +.. note:: + If things went well, the command above should print the following help message:: + + Structural variant annotation + Usage: svanna-cli.jar [-hV] [COMMAND] + -h, --help Show this help message and exit. + -V, --version Print version information and exit. + Commands: + generate-config, G Generate a configuration YAML file + prioritize, P Prioritize the variants + See the full documentation at `https://github.com/TheJacksonLaboratory/SvAnna` + +Download SvAnna database files:: + + svanna_data=TODO + wget $svanna_data && unzip TODO + wget https://squirls.s3.amazonaws.com/jannovar_v0.35.zip && unzip jannovar_v0.35.zip + + +Generate the configuration file:: + + java -jar `pwd`/svanna/svanna-cli-1.0.0-RC1.jar generate-config svanna-config.yml + +Now use your favorite text editor to fill in the details for: + +* ``dataDirectory:`` - the absolute path to the folder where SvAnna database files were extracted +* ``jannovarCachePath`` - the absolute path to selected Jannovar ``*.ser`` file, e.g. ``/path/to/hg38_refseq.ser`` + +.. tip:: + YAML syntax requires a whitespace to be present between the *key*: *value* pairs. + +Prioritize structural variants in VCF file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Having completed the steps above, you're good to prioritize variants in VCF file. Let's annotate a toy VCF file containing +eight variants reported in the SvAnna manuscript. First, let's download the VCF file:: + + # download the VCF file + wget https://github.com/TheJacksonLaboratory/SvAnna/blob/master/svanna-cli/src/examples/example.vcf + +Now, let's prioritize the variants:: + + java -jar `pwd`/svanna/svanna-cli-1.0.0-RC1.jar prioritize svanna-config.yml --output-format html,csv,vcf --term HP:123456 --vcf example.vcf + +SvAnna will annotate the VCF file and store the results next to the VCF file. + + diff --git a/docs/setup.rst b/docs/setup.rst index a9ab0b53..f4538053 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -1,84 +1,162 @@ .. _rstsetup: +================= Setting up SvAnna -================ +================= -SvAnna is a desktop Java application that requires several external files to run. +SvAnna is a desktop Java application that requires several external files to run. This document explains how to download +the external files and how to prepare SvAnna for running in the local system. +.. note:: + SvAnna is written with Java version 11 and will run and compile under Java 11+. Prerequisites -~~~~~~~~~~~~~ +^^^^^^^^^^^^^ SvAnna was written with Java version 11. `Git `_ and `Java Development Kit `_ version 11 or better -are required to build SvAnna from source. Most users should download the prebuilt SvAnna files from the -`Releases page `_. +are required to build SvAnna from source. Installation -~~~~~~~~~~~~ +^^^^^^^^^^^^ -Go the GitHub page of `SvAnna `_, and clone or download the project. -Build the executable from source with maven, and then test the build. :: - - $ git clone https://github.com/TheJacksonLaboratory/svann - $ cd svann - $ ./mvnw package - $ java -jar svanna-cli/target/svanna-cli.jar - $ Usage:
[options] [command] [command options] - Options: - -h, --help - display this help message - (...) +To install SvAnna, you need to get SvAnna distribution ZIP archive that contains the executable JAR file, and SvAnna +database files. Prebuilt SvAnna executable -^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Alternatively, go to the `Releases section `_ on the -SvAnna GitHub page and download the latest precompiled version of SvAnna. +To download the prebuilt SvAnna JAR file, go to the +`Releases section `_ +on the SvAnna GitHub page and download the latest precompiled version of SvAnna. +SvAnna database files +~~~~~~~~~~~~~~~~~~~~~~~~~ -The download command -~~~~~~~~~~~~~~~~~~~~ +SvAnna database files are available for download from: + +.. TODO - complete + +**hg38/GRCh38** + Download `2107_hg38 `_ (~TODO GB for download, ~TODO GB unpacked) + +After the download, unzip the archive(s) content into a folder and note the folder path. + +.. tip:: + Keeping the files on a high-performance hard drive will improve the runtime performance. + +Jannovar transcript databases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Functional annotation of variants, which is required for certain SvAnna tasks, is performed using `Jannovar`_ library. +To run the annotation, Jannovar transcript database files need to be provided. The Jannovar ``v0.35`` database files were +tested to work with SvAnna. + +For your convenience, the files containing *UCSC*, *RefSeq*, or *ENSEMBL* transcripts +for *hg19* or *hg38* genome assemblies are available for download (~330 MB for download, ~330 MB unpacked). + +`Download Jannovar files from here `_. + + +Build SvAnna from source +~~~~~~~~~~~~~~~~~~~~~~~~ + +As an alternative to using prebuilt SvAnna JAR file, the SvAnna JAR file can also be built from Java sources. + +Run the following commands to download SvAnna source code from GitHub repository and to build SvAnna JAR file:: -.. _rstdownload: + $ git https://github.com/TheJacksonLaboratory/SvAnna + $ cd SvAnna + $ ./mvnw package -SvAnna requires four additional files to run. TODO update this list! +.. note:: + To build SvAnna from sources, JDK 11 or better must be available in the environment -1. ``hp.obo``. The main Human Phenotype Ontology file -2. ``phenotype.hpoa`` The main annotation file with all HPO disease models -3. ``Homo_sapiens_gene_info.gz`` A file from NCBI Entrez Gene with information about human genes -4. ``mim2gene_medgen`` A file from the NCBI medgen project with OMIM-derived links between genes and diseases +After the successful build, the JAR file is located at ``svanna-cli/target/svanna-cli-1.0.0-RC1.jar``. -SvAnna offers a convenience function to download all four files -to a local directory. By default, SvAnna will download all four files into a newly created subdirectory -called ``data`` in the current working directory. You can change this default with the ``-d`` or ``--data`` options -(If you change this, then you will need to pass the location of your directory to all other SvAnna commands -using the ``-d`` flag). Download the files automatically as follows. :: +To verify that the building process went well, run:: + + $ java -jar svanna-cli/target/svanna-cli-1.0.0-RC1.jar --help + +You should see a message describing how to run the individual commands. + +.. note:: + From now on, we will use ``svanna-cli.jar`` instead of spelling out the full path to the JAR file within your environment. + +.. _generate-config-ref: + +``generate-config`` - Generate and fill the configuration file +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +SvAnna needs to know about the locations of the external files. The locations are provided in a YAML configuration file. +The command ``generate-config`` generates an empty configuration file:: + + $ java -jar svanna-cli.jar generate-config svanna-config.yml + + +The command above generates an empty configuration file ``svanna-config.yml`` in the working directory. + +The configuration file has the following content:: + + # Required properties template, the file follows YAML syntax. + svanna: + # path to folder with SvAnna database and the other required files + dataDirectory: + # path to Jannovar transcript database + jannovarCachePath: + + #dataParameters: + # TAD must be present in at least 90% of the tissues to be included in the analysis + #tadStabilityThreshold: 90 + #enhancers: + # Include VISTA developmental enhancers in the analysis + #useVista: true + #prioritizationParameters: + # Evaluate effect of all variants (including single-gene events) in context of the TAD domain + #forceTadEvaluation: false + # term similarity measure, choose from {RESNIK_SYMMETRIC, RESNIK_ASYMMETRIC} + #termSimilarityMeasure: RESNIK_SYMMETRIC + # The mode for getting information content of the most informative common ancestors for terms t1, and t2. + # Choose from {IN_MEMORY, DATABASE}. + # IN_MEMORY is faster but uses more memory + # DATABASE is slower but also more memory efficient + #icMicaMode: DATABASE + # An event involving max N genes to be considered by the prototype prioritizer + #maxGenes: 100 + # Number of bases prepended to a transcript and evaluated as a promoter region + #promoterLength: 2000 + # Set to 0. to score promoter variants as strictly as coding variants, or to 1. to skip + #promoterFitnessGain: .6 + +Mandatory parameters +~~~~~~~~~~~~~~~~~~~~ - $ java -jar svanna-cli.jar download +Open the file in your favorite text editor and provide the following three bits of information: -SvAnna will not download the files if they are already present unless the ``-f`` or ``--force-overwrite`` argument is passed. For -instance, the following command would download the four files to a directory called datafiles and would -overwrite any previously downloaded files. :: +1. ``dataDirectory`` - location the the folder with SvAnna data. The directory is expected to have a structure like:: - $ java -jar svanna-cli.jar download -d datafiles --force-overwrite + svanna_folder + |- gencode.v35.chr_patch_hapl_scaff.basic.annotation.gtf.gz + |- Homo_sapiens.gene_info.gz + |- hp.obo + |- mim2gene_medgen + |- phenotype.hpoa + \- svanna_db.mv.db + where ``svanna_folder`` corresponds to content of the ZIP files downloaded in the previous section -If desired, you can download these files on your own but you need to place them all in the -same directory to run SvAnna. +2. ``jannovarCachePath`` - path to Jannovar transcript database to be used for analysis. -DGV: Database of Genomic Variants -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. tip:: + The YAML syntax requires to include a white space between key, value pairs (e.g. ``dataDirectory: /project/joe/svanna_resources``. -SvAnna is being extended to filter variants for overlap with population variants. For now, SvAnna uses a downloaded -file from the DGV (http://dgv.tcag.ca/dgv/app/downloads). Choose the file for the correct genome assembly (probably hg38), and -download it. We then need to index this file with `tabix `_. -Assuming we have downloaded the file ``GRCh38_hg38_variants_2020-02-25.txt`` from DGV, proceed with the following steps :: - $ bgzip GRCh38_hg38_variants_2020-02-25.txt - $ tabix --skip-lines 1 -s 2 -b 3 -e 4 GRCh38_hg38_variants_2020-02-25.txt.gz +Optional parameters +~~~~~~~~~~~~~~~~~~~ -After running the commands, the DGV file is compressed and indexed for quick random access. +The optional parameters do not necessarily need to be touched unless you know what you're doing. +.. TODO - describe the optional parameters. +.. _Jannovar: https://pubmed.ncbi.nlm.nih.gov/24677618 From 420b45709ef62a36ee8d739113af1a8be716e0c4 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 20:42:50 -0400 Subject: [PATCH 07/40] Delete obsolete reference to Resnik MICA file --- .../java/org/jax/svanna/autoconfigure/SvannaDataResolver.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/svanna-spring-boot-autoconfigure/src/main/java/org/jax/svanna/autoconfigure/SvannaDataResolver.java b/svanna-spring-boot-autoconfigure/src/main/java/org/jax/svanna/autoconfigure/SvannaDataResolver.java index 47769a31..2668bfee 100644 --- a/svanna-spring-boot-autoconfigure/src/main/java/org/jax/svanna/autoconfigure/SvannaDataResolver.java +++ b/svanna-spring-boot-autoconfigure/src/main/java/org/jax/svanna/autoconfigure/SvannaDataResolver.java @@ -46,7 +46,4 @@ public Path geneInfoPath() { return svannaDataDirectory.resolve("Homo_sapiens.gene_info.gz"); } - public Path precomputedResnikSimilaritiesPath() { - return svannaDataDirectory.resolve("resnik_similarity.csv.gz"); - } } From 173b17f6c7deddc97b0d8aa7421ac7602620643d Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 21:00:27 -0400 Subject: [PATCH 08/40] Remove the unused Gencode GTF file from ingest & datadir --- .../src/main/java/org/jax/svanna/ingest/cmd/BuildDb.java | 2 +- .../org/jax/svanna/ingest/cmd/IngestDbProperties.java | 8 -------- svanna-ingest/src/main/resources/application-template.yml | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/BuildDb.java b/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/BuildDb.java index c469d28e..c9ba9b55 100644 --- a/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/BuildDb.java +++ b/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/BuildDb.java @@ -307,7 +307,7 @@ private static URLConnection openHttpConnectionHandlingRedirects(URL source) thr private static void downloadPhenotypeFiles(Path buildDir, IngestDbProperties properties) throws IOException { IngestDbProperties.PhenotypeProperties phenotype = properties.phenotype(); List fieldsToDownload = List.of(phenotype.hpoOboUrl(), phenotype.hpoAnnotationsUrl(), - phenotype.mim2geneMedgenUrl(), phenotype.geneInfoUrl(), phenotype.gencodeUrl()); + phenotype.mim2geneMedgenUrl(), phenotype.geneInfoUrl()); for (String field : fieldsToDownload) { URL url = new URL(field); downloadUrl(url, buildDir); diff --git a/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/IngestDbProperties.java b/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/IngestDbProperties.java index 2a87d272..16df3d03 100644 --- a/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/IngestDbProperties.java +++ b/svanna-ingest/src/main/java/org/jax/svanna/ingest/cmd/IngestDbProperties.java @@ -162,7 +162,6 @@ public static class PhenotypeProperties { private String hpoAnnotationsUrl; private String mim2geneMedgenUrl; private String geneInfoUrl; - private String gencodeUrl; public String hpoOboUrl() { return hpoOboUrl; @@ -196,13 +195,6 @@ public void setGeneInfoUrl(String geneInfoUrl) { this.geneInfoUrl = geneInfoUrl; } - public String gencodeUrl() { - return gencodeUrl; - } - - public void setGencodeUrl(String gencodeUrl) { - this.gencodeUrl = gencodeUrl; - } } @ConfigurationProperties(prefix = "svanna.tad.phenotype") diff --git a/svanna-ingest/src/main/resources/application-template.yml b/svanna-ingest/src/main/resources/application-template.yml index 973e301e..1f32fd08 100644 --- a/svanna-ingest/src/main/resources/application-template.yml +++ b/svanna-ingest/src/main/resources/application-template.yml @@ -31,7 +31,6 @@ svanna.ingest: hpoAnnotationsUrl: http://purl.obolibrary.org/obo/hp/hpoa/phenotype.hpoa mim2geneMedgenUrl: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/mim2gene_medgen geneInfoUrl: ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz - gencodeUrl: ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_35/gencode.v35.chr_patch_hapl_scaff.basic.annotation.gtf.gz ### 3D genome browser # TADs in hg38: http://3dgenome.fsm.northwestern.edu/downloads/hg38.TADs.zip From 5d42170ce0e60aebb90071ea578c17efe469dd8b Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 21:11:09 -0400 Subject: [PATCH 09/40] Update data directory description --- docs/setup.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/setup.rst b/docs/setup.rst index f4538053..16c29d34 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -137,12 +137,11 @@ Open the file in your favorite text editor and provide the following three bits 1. ``dataDirectory`` - location the the folder with SvAnna data. The directory is expected to have a structure like:: svanna_folder - |- gencode.v35.chr_patch_hapl_scaff.basic.annotation.gtf.gz - |- Homo_sapiens.gene_info.gz - |- hp.obo - |- mim2gene_medgen - |- phenotype.hpoa - \- svanna_db.mv.db + |- svanna_db.mv.db + |- hp.obo + |- phenotype.hpoa + |- mim2gene_medgen + \- Homo_sapiens.gene_info.gz where ``svanna_folder`` corresponds to content of the ZIP files downloaded in the previous section From 67301ecd8e0980926d3ae84d597b72ce05bbde4d Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 21:52:31 -0400 Subject: [PATCH 10/40] Add text to quickstart.rst and running.rst --- docs/quickstart.rst | 6 +- docs/running.rst | 63 ++++++++++++++++--- .../jax/svanna/cli/cmd/PrioritizeCommand.java | 10 +-- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 24c0371f..6e7a1619 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -66,8 +66,8 @@ eight variants reported in the SvAnna manuscript. First, let's download the VCF Now, let's prioritize the variants:: - java -jar `pwd`/svanna/svanna-cli-1.0.0-RC1.jar prioritize svanna-config.yml --output-format html,csv,vcf --term HP:123456 --vcf example.vcf - -SvAnna will annotate the VCF file and store the results next to the VCF file. + java -jar `pwd`/svanna/svanna-cli-1.0.0-RC1.jar prioritize svanna-config.yml --output-format html,csv,vcf --vcf example.vcf --term HP:0011890 --term HP:0000978 --term HP:0012147 +SvAnna will prioritize the variants and store the results in HTML file next to the VCF file. +Read the :ref:`rstrunning` section to learn about additional options SvAnna offers. diff --git a/docs/running.rst b/docs/running.rst index fdfff254..8b1f6a0e 100644 --- a/docs/running.rst +++ b/docs/running.rst @@ -1,17 +1,62 @@ .. _rstrunning: -Running svanna -############## +========== +Run SvAnna +========== +SvAnna is a command-line Java tool that runs with Java version 11 or higher. -Creating a VCF file -^^^^^^^^^^^^^^^^^^^ +Before using SvAnna, you must setup SvAnna as describe in the :ref:`rstsetup` section. -We have tested svanna with VCF files -generated by the following programs. +SvAnna provides a command for performing phenotype-driven prioritization of structural variants (SVs) stored in +VCF format. +In the examples below, we assume that ``svanna-config.yml`` points to a configuration file with correct locations of +SvAnna resources. -1. `pbsv `_, an SV caller provided by Pacific Biosciences. Currently, -pbsv calls insertions, deletions, inversions, duplications, and translocations. +``prioritize`` - Prioritization of structural variants +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -2. `sniffles `_, todo... \ No newline at end of file +The aim of this command is to perform phenotype-driven prioritization of SVs in VCF file. The prioritized variants are +stored in one or more :ref:`rstoutputformats`. + +To prioritize variants in the `example.vcf`_ file (an example VCF file with 8 variants stored in SvAnna repository), run:: + + java -jar svanna-cli.jar prioritize --config svanna-config.yaml --vcf example.vcf --term HP:0011890 --term HP:0000978 --term HP:0012147 + +After the annotation, the results are stored at ``output.html``. + +Mandatory arguments +~~~~~~~~~~~~~~~~~~~ + +All CLI arguments for the ``prioritize`` command are supplied as *options* (no positional parameters). + +There is one *mandatory* option: + +* ``-c | --config`` - path to Squirls configuration file + +Then, input data is specified either as a path to VCF file along with one or more HPO terms, or as a *Phenopacket*: + +* ``--vcf`` - path to input VCF file +* ``-t | --term`` - HPO term describing clinical condition of the proband, may be specified multiple times (e.g. ``--term HP:1234567 --term HP:9876543`` +* ``-p | --phenopacket`` - path to Phenopacket in JSON format + +If both ``--vcf`` and ``--phenopacket`` options are specified, ``--vcf`` has a precedence and the phenopacket will *not* +be processed. + +Optional arguments +~~~~~~~~~~~~~~~~~~ + +SvAnna allows to fine-tune the prioritization using the following *optional* options: + +* ``--n-threads`` - prioritize variants using *n* threads to speed up the prioritization. More threads require more RAM (default ``2``) +* ``--min-read-support`` - minimum number of reads that must support presence of the *alt* allele in order for variant to be included in the analysis (default `3``) +* ``--overlap-threshold`` - threshold value to determine if the SV matches a variant from the population variant databases. The threshold is provided as a percentage (default ``80``) +* ``--no-breakends`` - do not report breakend variants in HTML report +* ``--frequency-threshold`` - threshold for labeling SVs in population variant databases *pv* as common. If query SV *v* overlaps with *pv* that has frequency above the threshold, then *v* is considered to be *common*. +* ``--output-format`` - comma separated list of output formats to use for writing the results. See :ref:`rstoutputformats` section for available output formats (default ``html``) +* ``--prefix`` - prefix for output files (default: based on the input VCF name) +* ``--report-top-variants`` - report top *n* variants in the HTML report (default: ``100``) + + +.. _example.vcf: https://github.com/TheJacksonLaboratory/Squirls/blob/development/squirls-cli/src/examples/example.vcf \ No newline at end of file diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java index 646fe5ff..ffba3a5b 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java @@ -106,9 +106,9 @@ public class PrioritizeCommand extends SvAnnaCommand { description = "Reassign priority of heterozygous variants if at least one affected gene is not associated with AD disease (default: ${DEFAULT-VALUE})") public boolean modeOfInheritance = false; - @CommandLine.Option(names = {"--similarity-threshold"}, + @CommandLine.Option(names = {"--overlap-threshold"}, description = "Percentage threshold for determining variant's region is similar enough to database entry (default: ${DEFAULT-VALUE})") - public float similarityThreshold = 80.F; + public float overlapThreshold = 80.F; /* * ------------ OUTPUT OPTIONS ------------ @@ -233,10 +233,10 @@ private void runAnalysis(Collection patientTerms, Path vcfFile) throws I LogUtils.logInfo(LOGGER, "Read {} variants", NF.format(variants.size())); // Filter - LogUtils.logInfo(LOGGER, "Filtering out the variants with reciprocal overlap >{}% occurring in more than {}% probands", similarityThreshold, frequencyThreshold); + LogUtils.logInfo(LOGGER, "Filtering out the variants with reciprocal overlap >{}% occurring in more than {}% probands", overlapThreshold, frequencyThreshold); LogUtils.logInfo(LOGGER, "Filtering out the variants where ALT allele is supported by less than {} reads", minAltReadSupport); AnnotationDataService annotationDataService = context.getBean(AnnotationDataService.class); - PopulationFrequencyAndCoverageFilter filter = new PopulationFrequencyAndCoverageFilter(annotationDataService, similarityThreshold, frequencyThreshold, minAltReadSupport, maxLength); + PopulationFrequencyAndCoverageFilter filter = new PopulationFrequencyAndCoverageFilter(annotationDataService, overlapThreshold, frequencyThreshold, minAltReadSupport, maxLength); List filteredVariants = filter.filter(variants); // Prioritize @@ -284,7 +284,7 @@ private AnalysisParameters getAnalysisParameters(SvannaProperties properties) { analysisParameters.setJannovarCachePath(properties.jannovarCachePath()); analysisParameters.setPhenopacketPath(phenopacketPath == null ? null : phenopacketPath.toAbsolutePath().toString()); analysisParameters.setVcfPath(vcfFile.toAbsolutePath().toString()); - analysisParameters.setSimilarityThreshold(similarityThreshold); + analysisParameters.setSimilarityThreshold(overlapThreshold); analysisParameters.setFrequencyThreshold(frequencyThreshold); analysisParameters.addAllPopulationVariantOrigins(PopulationVariantOrigin.benign()); analysisParameters.setMinAltReadSupport(minAltReadSupport); From 7acf5b5f6ad58e54642f7a3e90b291368ee69ea0 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 21:54:19 -0400 Subject: [PATCH 11/40] Add link to database files into setup --- docs/setup.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/setup.rst b/docs/setup.rst index 16c29d34..55522618 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -36,10 +36,8 @@ SvAnna database files SvAnna database files are available for download from: -.. TODO - complete - **hg38/GRCh38** - Download `2107_hg38 `_ (~TODO GB for download, ~TODO GB unpacked) + Download `svanna.zip `_ (~600 MB for download, 2.6 GB unpacked) After the download, unzip the archive(s) content into a folder and note the folder path. From 12d9429334230a89ce100e7e7a578b37fc7b3e97 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 6 Jul 2021 21:59:35 -0400 Subject: [PATCH 12/40] Update quickstart --- docs/quickstart.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 6e7a1619..7c5a7c4b 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -22,7 +22,7 @@ Run the following commands to setup SvAnna. Download and extract SvAnna distribution ZIP archive from `here `_. Then, run:: - java -jar svanna-cli-1.0.0-RC1/svanna-cli-1.0.0-RC1.jar --help + java -jar svanna-cli-1.0.0-RC1.jar --help .. note:: If things went well, the command above should print the following help message:: @@ -38,8 +38,8 @@ Download and extract SvAnna distribution ZIP archive from `here Date: Sun, 11 Jul 2021 12:52:18 -0400 Subject: [PATCH 13/40] Prevent null pointer when GeneService is queried for a genes overlapping with a query on an unknown contig --- .../svanna/core/reference/GeneService.java | 12 +++- .../core/reference/GeneServiceTest.java | 55 +++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java diff --git a/svanna-core/src/main/java/org/jax/svanna/core/reference/GeneService.java b/svanna-core/src/main/java/org/jax/svanna/core/reference/GeneService.java index e4053022..bbd01723 100644 --- a/svanna-core/src/main/java/org/jax/svanna/core/reference/GeneService.java +++ b/svanna-core/src/main/java/org/jax/svanna/core/reference/GeneService.java @@ -1,23 +1,31 @@ package org.jax.svanna.core.reference; import de.charite.compbio.jannovar.impl.intervals.IntervalArray; +import org.jax.svanna.core.exception.LogUtils; import org.monarchinitiative.svart.CoordinateSystem; import org.monarchinitiative.svart.GenomicRegion; import org.monarchinitiative.svart.Strand; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.List; import java.util.Map; public interface GeneService { + Logger LOGGER = LoggerFactory.getLogger(GeneService.class); + Map> getChromosomeMap(); Gene bySymbol(String symbol); default List overlappingGenes(GenomicRegion query) { IntervalArray array = getChromosomeMap().get(query.contigId()); - IntervalArray.QueryResult result = - query.length() == 0 + if (array == null) { + LogUtils.logDebug(LOGGER, "Unknown contig ID {} for query {}:{}:{}", query.contigId(), query.contigName(), query.start(), query.end()); + return List.of(); + } + IntervalArray.QueryResult result = query.length() == 0 ? array.findOverlappingWithPoint(query.startOnStrandWithCoordinateSystem(Strand.POSITIVE, CoordinateSystem.zeroBased())) : array.findOverlappingWithInterval(query.startOnStrandWithCoordinateSystem(Strand.POSITIVE, CoordinateSystem.zeroBased()), query.endOnStrandWithCoordinateSystem(Strand.POSITIVE, CoordinateSystem.zeroBased())); return result.getEntries(); diff --git a/svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java b/svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java new file mode 100644 index 00000000..59b08315 --- /dev/null +++ b/svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java @@ -0,0 +1,55 @@ +package org.jax.svanna.core.reference; + +import org.jax.svanna.core.TestContig; +import org.jax.svanna.core.TestDataConfig; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.CsvSource; +import org.monarchinitiative.svart.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; + +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.hamcrest.CoreMatchers.hasItems; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.*; + +@SpringBootTest(classes = TestDataConfig.class) +public class GeneServiceTest { + + @Autowired + public GeneService geneService; + @Autowired + private GenomicAssembly genomicAssembly; + + @ParameterizedTest + @CsvSource({ + "9, 133356485, 133356544, ''", // right between the SURF1 and SURF2 genes + "9, 133356484, 133356544, SURF1", + "9, 133356485, 133356545, SURF2", + "9, 133356484, 133356545, 'SURF1,SURF2'", + "9, 133356480, 133356480, SURF1", // interval with length 0 + "15, 48550000, 48550100, FBN1", + }) + public void overlappingGenes(String contigName, int start, int end, String names) { + Contig contig = genomicAssembly.contigByName(contigName); + GenomicRegion region = GenomicRegion.of(contig, Strand.POSITIVE, CoordinateSystem.zeroBased(), start, end); + List genes = geneService.overlappingGenes(region); + + Set actual = genes.stream().map(Gene::geneSymbol).collect(Collectors.toUnmodifiableSet()); + String[] expected = names.split(","); + if (expected.length == 1 && expected[0].equals("")) + assertThat(actual, hasSize(0)); + else + assertThat(actual, hasItems(expected)); + } + + @Test + public void overlappingGenes_unknownContig() { + GenomicRegion region = GenomicRegion.of(TestContig.of(200, 100), Strand.POSITIVE, CoordinateSystem.zeroBased(), 50, 60); + assertThat(geneService.overlappingGenes(region), is(empty())); + } +} From a6c3465ba2cce7b5f4c2d9bd3b356f6340a41f8e Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 11 Jul 2021 13:18:59 -0400 Subject: [PATCH 14/40] Add more GeneService tests --- .../core/reference/GeneServiceTest.java | 17 ++++++++++ .../transcripts/JannovarGeneServiceTest.java | 33 ++++++++++--------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java b/svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java index 59b08315..d7a07ed2 100644 --- a/svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java +++ b/svanna-core/src/test/java/org/jax/svanna/core/reference/GeneServiceTest.java @@ -25,6 +25,23 @@ public class GeneServiceTest { @Autowired private GenomicAssembly genomicAssembly; + @Test + public void getChromosomeMap() { + assertThat(geneService.getChromosomeMap().keySet(), hasItems(1, 7, 9, 13, 15, 20, 23, 24)); + } + + @ParameterizedTest + @CsvSource({ + "SURF1, 4681", + "HNF4A, 77045", + }) + public void bySymbol(String symbol, int length) { + Gene gene = geneService.bySymbol(symbol); + + assertThat(gene.geneSymbol(), equalTo(symbol)); + assertThat(gene.length(), equalTo(length)); + } + @ParameterizedTest @CsvSource({ "9, 133356485, 133356544, ''", // right between the SURF1 and SURF2 genes diff --git a/svanna-core/src/test/java/org/jax/svanna/core/reference/transcripts/JannovarGeneServiceTest.java b/svanna-core/src/test/java/org/jax/svanna/core/reference/transcripts/JannovarGeneServiceTest.java index 0c86e436..a2a3e4a6 100644 --- a/svanna-core/src/test/java/org/jax/svanna/core/reference/transcripts/JannovarGeneServiceTest.java +++ b/svanna-core/src/test/java/org/jax/svanna/core/reference/transcripts/JannovarGeneServiceTest.java @@ -1,14 +1,16 @@ package org.jax.svanna.core.reference.transcripts; import de.charite.compbio.jannovar.data.JannovarData; -import de.charite.compbio.jannovar.data.JannovarDataSerializer; +import org.jax.svanna.core.TestDataConfig; import org.jax.svanna.core.reference.Gene; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import org.monarchinitiative.svart.*; +import org.monarchinitiative.svart.CoordinateSystem; +import org.monarchinitiative.svart.GenomicAssembly; +import org.monarchinitiative.svart.GenomicRegion; +import org.monarchinitiative.svart.Strand; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.boot.test.context.SpringBootTest; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.List; import java.util.stream.Collectors; @@ -16,22 +18,23 @@ import static org.hamcrest.Matchers.hasItems; import static org.hamcrest.Matchers.hasSize; +@SpringBootTest(classes = TestDataConfig.class) public class JannovarGeneServiceTest { - private static final GenomicAssembly ASSEMBLY = GenomicAssemblies.GRCh38p13(); - private static final Path JANNOVAR_CACHE = Paths.get("/home/ielis/soft/jannovar/v0.35/hg38_refseq.ser"); + @Autowired + public GenomicAssembly genomicAssembly; + @Autowired + public JannovarData jannovarData; @Test - @Disabled - public void initialize() throws Exception { - JannovarData jannovarData = new JannovarDataSerializer(JANNOVAR_CACHE.toString()).load(); - JannovarGeneService service = JannovarGeneService.of(ASSEMBLY, jannovarData); + public void initialize() { + JannovarGeneService geneService = JannovarGeneService.of(genomicAssembly, jannovarData); + GenomicRegion region = GenomicRegion.of(genomicAssembly.contigByName("9"), Strand.POSITIVE, CoordinateSystem.zeroBased(), 133_356_484, 133_356_545); - GenomicRegion region = GenomicRegion.of(ASSEMBLY.contigByName("9"), Strand.POSITIVE, CoordinateSystem.zeroBased(), 133_352_000, 133_366_000); - List genes = service.overlappingGenes(region); + List genes = geneService.overlappingGenes(region); - assertThat(genes, hasSize(3)); - assertThat(genes.stream().map(Gene::geneSymbol).collect(Collectors.toSet()), hasItems("SURF1", "SURF2", "SURF4")); + assertThat(genes, hasSize(2)); + assertThat(genes.stream().map(Gene::geneSymbol).collect(Collectors.toSet()), hasItems("SURF1", "SURF2")); } } \ No newline at end of file From b9ecb230e7c36b05b078dab61918b3a805caebf1 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 11 Jul 2021 13:20:56 -0400 Subject: [PATCH 15/40] Add CI badges to README.md, remove the out-of-sync text --- README.md | 78 +++---------------------------------------------------- 1 file changed, 3 insertions(+), 75 deletions(-) diff --git a/README.md b/README.md index 98b9864e..021a7bca 100644 --- a/README.md +++ b/README.md @@ -1,83 +1,11 @@ # SvAnna +![Java CI with Maven](https://github.com/TheJacksonLaboratory/SvAnna/workflows/Java%20CI%20with%20Maven/badge.svg) +[![Documentation Status](https://readthedocs.org/projects/svanna/badge/?version=latest)](https://svanna.readthedocs.io/en/latest/?badge=latest) + Efficient and accurate pathogenicity prediction for coding and regulatory structural variants in long-read genome sequencing Most users should download the latest SvAnna distribution ZIP file from the [Releases page](https://github.com/TheJacksonLaboratory/SvAnna/releases). Please consult the Read the docs site for [detailed documentation](https://svanna.readthedocs.io/en/latest). - -## Attic - -**The text below is out of sync, and the most useful parts of the text will be moved to *Read the docs*.** - -**The documentation needs to be completed.** - -### Creating the Jannovar transcript file -[Jannovar](https://github.com/charite/jannovar) is a Java app/library for annotating -VCF files. Its main use case is for small variants and their intersection with -protein coding sequences. We will use it here to extract the positions of genes and -SVs, but it may be easier just to start with a gencode GFF file in the future. - -Jannovar downloads various files and creates a transcript file that it uses for VCF annotation. -At present, NCBI etc has changed the location of some files so that only the develop branch -of Jannovar works. Enter the following commands to create the transcript file - -``` -git clone -https://github.com/charite/jannovar.git -cd jannovar -git checkout develop -mvn package -java [-Xmx8g] -jar jannovar-cli-0.36-SNAPSHOT.jar download -d hg38/refseq_curated -``` -This command downloads various files and generates `data/hg38_refseq_curated.ser`. either move -this to the data subdirectory in this project or softlink it (from 'data', enter `ln -s `). -Thus, for now, this project expects the path `data/data/refseq_curated.ser`. - -## Running svann - -Enter the following command to see options. The LIRICAL file is the -LIRICAL TSV output file. The enhancers file is created by the -https://github.com/pnrobinson/tspec app. To use the enhancers file -it is required to also use an HPO term with the major phenotypic abnormality, -e.g., [Abnormality of the immune system](https://hpo.jax.org/app/browse/term/HP:0002715). - -``` -$ java -jar target/svann.jar annotate -h - Usage: svann annotate [-hV] [-e=] [-g=] - [-j=] [-t=] -v= - [-x=] - annotate VCF file - -e, --enhancer= - tspec enhancer file - -g, --gencode= - - -h, --help Show this help message and exit. - -j, --jannovar= - prefix for output files (default: - data/data/hg38_refseq_curated.ser ) - -t, --term= HPO term IDs (comma-separated list) - -v, --vcf= - -V, --version Print version information and exit. - -x, --prefix= prefix for output files (default: L2O ) -``` - - - - -# Documentation - -Generate the read the docs documentation locally by going to the ``docs`` subdirectory. -First generate a virtual environment and install the required sphinx packages. :: - - virtualenv p38 - source p38/bin/activate - pip install sphinx sphinx-rtd-theme - -To create the documentation, ensure you are using the ``p38`` environment and enter the following command. :: - - source p38/bin/activate - make html - -This will generate HTML pages under ``_build/html``. \ No newline at end of file From 0467f33986050472e7ddce1024984437d0e09176 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 11 Jul 2021 13:44:02 -0400 Subject: [PATCH 16/40] Change logging level from warn to debug when an ENTREZ id is missing --- .../reference/transcripts/JannovarGeneService.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/svanna-core/src/main/java/org/jax/svanna/core/reference/transcripts/JannovarGeneService.java b/svanna-core/src/main/java/org/jax/svanna/core/reference/transcripts/JannovarGeneService.java index ba5e59b2..70433e50 100644 --- a/svanna-core/src/main/java/org/jax/svanna/core/reference/transcripts/JannovarGeneService.java +++ b/svanna-core/src/main/java/org/jax/svanna/core/reference/transcripts/JannovarGeneService.java @@ -64,7 +64,7 @@ public static JannovarGeneService of(GenomicAssembly assembly, JannovarData... d ImmutableSortedMap altIds = tm.getAltGeneIDs(); if (!altIds.containsKey("ENTREZ_ID")) { - LogUtils.logWarn(LOGGER, "Missing entrez id for gene {}", tm.getGeneSymbol()); + LogUtils.logDebug(LOGGER, "Missing entrez id for gene {}", tm.getGeneSymbol()); return; } TermId geneAccessionId = TermId.of("NCBIGene", altIds.get("ENTREZ_ID")); @@ -122,14 +122,14 @@ public Gene bySymbol(String symbol) { private static Function, Optional> buildGeneIfPossible() { return entry -> { - Gene gene; try { - gene = entry.getValue().build(); + return Optional.of(entry.getValue().build()); } catch (Exception e) { - LogUtils.logWarn(LOGGER, "Unable to remap gene {}: {}", entry.getKey(), e.getMessage()); + // TODO - we may do a more granular catching, the code should mainly throw IllegalArgumentException + LogUtils.logDebug(LOGGER, "Unable to remap gene {}: {}", entry.getKey(), e.getMessage()); return Optional.empty(); } - return Optional.of(gene); + }; } From 63f2d29b5cc9114ac238c65ca7f3bfd497d1e365 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 11 Jul 2021 21:00:04 -0400 Subject: [PATCH 17/40] Implement VCF output format, remove MergedVariantParser --- .../benchmark/BenchmarkCaseCommand.java | 6 +- .../svanna/benchmark/CaseReportImporter.java | 6 +- .../jax/svanna/cli/cmd/PrioritizeCommand.java | 165 ++++++------ .../svanna/cli/writer/AnalysisResults.java | 8 +- .../cli/writer/ResultWriterFactory.java | 10 +- .../cli/writer/vcf/VcfResultWriter.java | 122 +++++++++ .../org/jax/svanna/io/FullSvannaVariant.java | 10 + .../io/parse}/BreakendedSvannaVariant.java | 63 +++-- .../io/parse}/DefaultSvannaVariant.java | 69 +++-- .../svanna/io/parse/MergedVariantParser.java | 252 ------------------ .../java/org/jax/svanna/io/parse/Utils.java | 15 -- .../io/parse/VariantCallAttributeParser.java | 8 +- .../jax/svanna/io/parse/VcfVariantParser.java | 48 ++-- .../io/parse/MergedVariantParserTest.java | 70 ----- .../parse/VariantCallAttributeParserTest.java | 14 +- .../svanna/io/parse/VcfVariantParserTest.java | 29 +- 16 files changed, 370 insertions(+), 525 deletions(-) create mode 100644 svanna-cli/src/main/java/org/jax/svanna/cli/writer/vcf/VcfResultWriter.java create mode 100644 svanna-io/src/main/java/org/jax/svanna/io/FullSvannaVariant.java rename {svanna-core/src/main/java/org/jax/svanna/core/reference => svanna-io/src/main/java/org/jax/svanna/io/parse}/BreakendedSvannaVariant.java (73%) rename {svanna-core/src/main/java/org/jax/svanna/core/reference => svanna-io/src/main/java/org/jax/svanna/io/parse}/DefaultSvannaVariant.java (74%) delete mode 100644 svanna-io/src/main/java/org/jax/svanna/io/parse/MergedVariantParser.java delete mode 100644 svanna-io/src/main/java/org/jax/svanna/io/parse/Utils.java delete mode 100644 svanna-io/src/test/java/org/jax/svanna/io/parse/MergedVariantParserTest.java diff --git a/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/BenchmarkCaseCommand.java b/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/BenchmarkCaseCommand.java index 5d112d89..a159f304 100644 --- a/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/BenchmarkCaseCommand.java +++ b/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/BenchmarkCaseCommand.java @@ -103,15 +103,15 @@ public Integer call() { GenomicAssembly genomicAssembly = context.getBean(GenomicAssembly.class); LogUtils.logInfo(LOGGER, "Reading variants from `{}`", vcfFile); - VariantParser parser = new VcfVariantParser(genomicAssembly, false); - List variants = parser.createVariantAlleleList(vcfFile); + VariantParser parser = new VcfVariantParser(genomicAssembly); + List variants = parser.createVariantAlleleList(vcfFile); LogUtils.logInfo(LOGGER, "Read {} variants", NF.format(variants.size())); LogUtils.logInfo(LOGGER, "Filtering out the variants with reciprocal overlap >{}% occurring in more than {}% probands", similarityThreshold, frequencyThreshold); LogUtils.logInfo(LOGGER, "Filtering out the variants where ALT allele is supported by less than {} reads", minAltReadSupport); AnnotationDataService annotationDataService = context.getBean(AnnotationDataService.class); PopulationFrequencyAndCoverageFilter filter = new PopulationFrequencyAndCoverageFilter(annotationDataService, similarityThreshold, frequencyThreshold, minAltReadSupport, maxLength); - List allVariants = filter.filter(variants); + List allVariants = filter.filter(variants); List filteredVariants = allVariants.stream() .filter(SvannaVariant::passedFilters) diff --git a/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/CaseReportImporter.java b/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/CaseReportImporter.java index a3184ac3..3b9565ff 100644 --- a/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/CaseReportImporter.java +++ b/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/CaseReportImporter.java @@ -3,7 +3,11 @@ import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.util.JsonFormat; import org.jax.svanna.core.exception.LogUtils; -import org.jax.svanna.core.reference.*; +import org.jax.svanna.core.reference.SvannaVariant; +import org.jax.svanna.core.reference.VariantCallAttributes; +import org.jax.svanna.core.reference.Zygosity; +import org.jax.svanna.io.parse.BreakendedSvannaVariant; +import org.jax.svanna.io.parse.DefaultSvannaVariant; import org.monarchinitiative.phenol.ontology.data.TermId; import org.monarchinitiative.svart.*; import org.monarchinitiative.svart.util.VariantTrimmer; diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java index 646fe5ff..8be29712 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java @@ -18,6 +18,7 @@ import org.jax.svanna.core.priority.*; import org.jax.svanna.core.reference.SvannaVariant; import org.jax.svanna.core.reference.Zygosity; +import org.jax.svanna.io.FullSvannaVariant; import org.jax.svanna.io.parse.VariantParser; import org.jax.svanna.io.parse.VcfVariantParser; import org.monarchinitiative.phenol.ontology.data.Term; @@ -37,7 +38,7 @@ import java.text.NumberFormat; import java.util.*; import java.util.concurrent.ExecutionException; -import java.util.function.Function; +import java.util.function.UnaryOperator; import java.util.stream.Collectors; @CommandLine.Command(name = "prioritize", @@ -49,20 +50,18 @@ footer = Main.FOOTER) public class PrioritizeCommand extends SvAnnaCommand { - private static final Logger LOGGER = LoggerFactory.getLogger(PrioritizeCommand.class); - protected static final NumberFormat NF = NumberFormat.getNumberInstance(); - - static { - NF.setMaximumFractionDigits(2); - } - + private static final Logger LOGGER = LoggerFactory.getLogger(PrioritizeCommand.class); /** * Gene contribution is multiplied by this factor if variant is heterozygous and the gene is not associated with * a dominant disease. */ private static final double MODE_OF_INHERITANCE_FACTOR = .5; + static { + NF.setMaximumFractionDigits(2); + } + /* * -------------- INPUT OPTIONS ------------- */ @@ -98,8 +97,8 @@ public class PrioritizeCommand extends SvAnnaCommand { description = "Do not prioritize variants longer than this (default: ${DEFAULT-VALUE})") public int maxLength = 250_000_000; - @CommandLine.Option(names={"--min-read-support"}, - description="Minimum number of ALT reads to prioritize (default: ${DEFAULT-VALUE})") + @CommandLine.Option(names = {"--min-read-support"}, + description = "Minimum number of ALT reads to prioritize (default: ${DEFAULT-VALUE})") public int minAltReadSupport = 3; @CommandLine.Option(names = {"--mode-of-inheritance"}, @@ -126,15 +125,78 @@ public class PrioritizeCommand extends SvAnnaCommand { description = "Prefix for output files (default: based on the input VCF name)") public String outPrefix = null; + @CommandLine.Option(names = {"--uncompressed-output"}, + description = "Write tabular and VCF output formats with no compression (default: ${DEFAULT-VALUE})") + public boolean uncompressed = false; + @CommandLine.Option(names = {"--report-top-variants"}, paramLabel = "100", description = "Report top n variants (default: ${DEFAULT-VALUE})") public int reportNVariants = 100; + private static Optional getVcfFilePath(Phenopacket phenopacket) { + // There should be exactly one VCF file + LinkedList vcfFiles = phenopacket.getHtsFilesList().stream() + .filter(htsFile -> htsFile.getHtsFormat().equals(HtsFile.HtsFormat.VCF)) + .distinct() + .collect(Collectors.toCollection(LinkedList::new)); + if (vcfFiles.size() != 1) { + LogUtils.logWarn(LOGGER, "Expected to find 1 VCF file, found {}", vcfFiles.size()); + return Optional.empty(); + } + + // The VCF file should have a proper URI + HtsFile vcf = vcfFiles.getFirst(); + try { + URI uri = new URI(vcf.getUri()); + return Optional.of(Path.of(uri)); + } catch (URISyntaxException e) { + LogUtils.logWarn(LOGGER, "Invalid URI `{}`: {}", vcf.getUri(), e.getMessage()); + return Optional.empty(); + } + } + + private static void performModeOfInheritanceReassignment(List filteredPrioritizedVariants, PhenotypeDataService phenotypeDataService) { + // TODO - find a better place for this code + LogUtils.logInfo(LOGGER, "Reassigning priorities"); + for (SvannaVariant variant : filteredPrioritizedVariants) { + if (variant.svPriority() instanceof GeneAwareSvPriority) { + Zygosity zygosity = variant.zygosity(); + if (zygosity.equals(Zygosity.UNKNOWN)) + continue; + + GeneAwareSvPriority priority = (GeneAwareSvPriority) variant.svPriority(); + Map map = new HashMap<>(); + for (String geneId : priority.geneIds()) { + double geneContribution = priority.geneContribution(geneId); + if (geneContribution < 1E-8) + continue; // no contribution, the gene is not affected by the variant + + Set diseases = phenotypeDataService.getDiseasesForGene(geneId); + if (zygosity.equals(Zygosity.HETEROZYGOUS)) { + boolean noDominantDisease = true; + for (HpoDiseaseSummary disease : diseases) { + if (disease.isCompatibleWithInheritance(ModeOfInheritance.AUTOSOMAL_DOMINANT) + || disease.isCompatibleWithInheritance(ModeOfInheritance.X_DOMINANT)) { + noDominantDisease = false; + break; + } + } + + if (noDominantDisease && !diseases.isEmpty()) + geneContribution *= MODE_OF_INHERITANCE_FACTOR; + } + map.put(geneId, geneContribution); + } + variant.setSvPriority(GeneAwareSvPriority.of(map)); + } + } + } + @Override - public Integer call(){ + public Integer call() { int status = checkArguments(); - if (status!=0) + if (status != 0) return status; Set phenotypeTermIds; @@ -174,7 +236,7 @@ public Integer call(){ protected int checkArguments() { if ((vcfFile == null) == (phenopacketPath == null)) { - LogUtils.logError(LOGGER,"Path to a VCF file or to a phenopacket must be supplied"); + LogUtils.logError(LOGGER, "Path to a VCF file or to a phenopacket must be supplied"); return 1; } @@ -228,8 +290,8 @@ private void runAnalysis(Collection patientTerms, Path vcfFile) throws I Set topLevelHpoTerms = phenotypeDataService.getTopLevelTerms(validatedPatientTerms); LogUtils.logInfo(LOGGER, "Reading variants from `{}`", vcfFile); - VariantParser parser = new VcfVariantParser(genomicAssembly, false); - List variants = parser.createVariantAlleleList(vcfFile); + VariantParser parser = new VcfVariantParser(genomicAssembly); + List variants = parser.createVariantAlleleList(vcfFile); LogUtils.logInfo(LOGGER, "Read {} variants", NF.format(variants.size())); // Filter @@ -237,7 +299,7 @@ private void runAnalysis(Collection patientTerms, Path vcfFile) throws I LogUtils.logInfo(LOGGER, "Filtering out the variants where ALT allele is supported by less than {} reads", minAltReadSupport); AnnotationDataService annotationDataService = context.getBean(AnnotationDataService.class); PopulationFrequencyAndCoverageFilter filter = new PopulationFrequencyAndCoverageFilter(annotationDataService, similarityThreshold, frequencyThreshold, minAltReadSupport, maxLength); - List filteredVariants = filter.filter(variants); + List filteredVariants = filter.filter(variants); // Prioritize SvPrioritizerFactory svPrioritizerFactory = context.getBean(SvPrioritizerFactory.class); @@ -246,14 +308,14 @@ private void runAnalysis(Collection patientTerms, Path vcfFile) throws I LogUtils.logInfo(LOGGER, "Prioritizing variants"); ProgressReporter priorityProgress = new ProgressReporter(5_000); - Function prioritizationFunction = v -> { + UnaryOperator prioritizationFunction = v -> { priorityProgress.logItem(v); SvPriority priority = prioritizer.prioritize(v); v.setSvPriority(priority); return v; }; - List filteredPrioritizedVariants = TaskUtils.executeBlocking(filteredVariants, prioritizationFunction, parallelism); + List filteredPrioritizedVariants = TaskUtils.executeBlocking(filteredVariants, prioritizationFunction, parallelism); if (modeOfInheritance) performModeOfInheritanceReassignment(filteredPrioritizedVariants, phenotypeDataService); @@ -264,13 +326,13 @@ private void runAnalysis(Collection patientTerms, Path vcfFile) throws I ResultWriterFactory resultWriterFactory = context.getBean(ResultWriterFactory.class); String prefix = resolveOutPrefix(vcfFile); for (OutputFormat outputFormat : outputFormats) { - ResultWriter writer = resultWriterFactory.resultWriterForFormat(outputFormat); + ResultWriter writer = resultWriterFactory.resultWriterForFormat(outputFormat, !uncompressed); if (writer instanceof HtmlResultWriter) { SvannaProperties svannaProperties = context.getBean(SvannaProperties.class); // TODO - is there a more elegant way to pass the HTML specific parameters into the writer? - HtmlResultWriter wrt = (HtmlResultWriter) writer; - wrt.setAnalysisParameters(getAnalysisParameters(svannaProperties)); - wrt.setDoNotReportBreakends(doNotReportBreakends); + HtmlResultWriter htmlWriter = (HtmlResultWriter) writer; + htmlWriter.setAnalysisParameters(getAnalysisParameters(svannaProperties)); + htmlWriter.setDoNotReportBreakends(doNotReportBreakends); } writer.write(results, prefix); } @@ -296,63 +358,4 @@ private AnalysisParameters getAnalysisParameters(SvannaProperties properties) { return analysisParameters; } - - private static Optional getVcfFilePath(Phenopacket phenopacket) { - // There should be exactly one VCF file - LinkedList vcfFiles = phenopacket.getHtsFilesList().stream() - .filter(htsFile -> htsFile.getHtsFormat().equals(HtsFile.HtsFormat.VCF)) - .distinct() - .collect(Collectors.toCollection(LinkedList::new)); - if (vcfFiles.size() != 1) { - LogUtils.logWarn(LOGGER, "Expected to find 1 VCF file, found {}", vcfFiles.size()); - return Optional.empty(); - } - - // The VCF file should have a proper URI - HtsFile vcf = vcfFiles.getFirst(); - try { - URI uri = new URI(vcf.getUri()); - return Optional.of(Path.of(uri)); - } catch (URISyntaxException e) { - LogUtils.logWarn(LOGGER, "Invalid URI `{}`: {}", vcf.getUri(), e.getMessage()); - return Optional.empty(); - } - } - - private static void performModeOfInheritanceReassignment(List filteredPrioritizedVariants, PhenotypeDataService phenotypeDataService) { - // TODO - find a better place for this code - LogUtils.logInfo(LOGGER, "Reassigning priorities"); - for (SvannaVariant variant : filteredPrioritizedVariants) { - if (variant.svPriority() instanceof GeneAwareSvPriority) { - Zygosity zygosity = variant.zygosity(); - if (zygosity.equals(Zygosity.UNKNOWN)) - continue; - - GeneAwareSvPriority priority = (GeneAwareSvPriority) variant.svPriority(); - Map map = new HashMap<>(); - for (String geneId : priority.geneIds()) { - double geneContribution = priority.geneContribution(geneId); - if (geneContribution < 1E-8) - continue; // no contribution, the gene is not affected by the variant - - Set diseases = phenotypeDataService.getDiseasesForGene(geneId); - if (zygosity.equals(Zygosity.HETEROZYGOUS)) { - boolean noDominantDisease = true; - for (HpoDiseaseSummary disease : diseases) { - if (disease.isCompatibleWithInheritance(ModeOfInheritance.AUTOSOMAL_DOMINANT) - || disease.isCompatibleWithInheritance(ModeOfInheritance.X_DOMINANT)) { - noDominantDisease = false; - break; - } - } - - if (noDominantDisease && !diseases.isEmpty()) - geneContribution *= MODE_OF_INHERITANCE_FACTOR; - } - map.put(geneId, geneContribution); - } - variant.setSvPriority(GeneAwareSvPriority.of(map)); - } - } - } } diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/AnalysisResults.java b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/AnalysisResults.java index 3b3069b1..528cd0c2 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/AnalysisResults.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/AnalysisResults.java @@ -1,6 +1,6 @@ package org.jax.svanna.cli.writer; -import org.jax.svanna.core.reference.SvannaVariant; +import org.jax.svanna.io.FullSvannaVariant; import org.monarchinitiative.phenol.ontology.data.Term; import java.util.List; @@ -15,12 +15,12 @@ public class AnalysisResults { private final Set topLevelPhenotypeTerms; - private final List variants; + private final List variants; public AnalysisResults(String variantSource, Set probandPhenotypeTerms, Set topLevelPhenotypeTerms, - List variants) { + List variants) { this.variantSource = Objects.requireNonNull(variantSource, "Variant source cannot be null"); this.probandPhenotypeTerms = Objects.requireNonNull(probandPhenotypeTerms, "Phenotype terms cannot be null"); this.topLevelPhenotypeTerms = Objects.requireNonNull(topLevelPhenotypeTerms, "Top level phenotype terms cannot be null"); @@ -35,7 +35,7 @@ public Set topLevelPhenotypeTerms() { return topLevelPhenotypeTerms; } - public List variants() { + public List variants() { return variants; } diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/ResultWriterFactory.java b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/ResultWriterFactory.java index c2069af0..f4853d11 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/ResultWriterFactory.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/ResultWriterFactory.java @@ -2,6 +2,7 @@ import org.jax.svanna.cli.writer.html.HtmlResultWriter; import org.jax.svanna.cli.writer.tabular.TabularResultWriter; +import org.jax.svanna.cli.writer.vcf.VcfResultWriter; import org.jax.svanna.core.hpo.PhenotypeDataService; import org.jax.svanna.core.landscape.AnnotationDataService; import org.jax.svanna.core.overlap.GeneOverlapper; @@ -19,17 +20,18 @@ public ResultWriterFactory(GeneOverlapper overlapper, AnnotationDataService anno } - public ResultWriter resultWriterForFormat(OutputFormat outputFormat) { + public ResultWriter resultWriterForFormat(OutputFormat outputFormat, boolean compress) { switch (outputFormat) { case HTML: return new HtmlResultWriter(overlapper, annotationDataService, phenotypeDataService); case TSV: - return new TabularResultWriter(OutputFormat.TSV.fileSuffix(), '\t', true); + return new TabularResultWriter(OutputFormat.TSV.fileSuffix(), '\t', compress); case CSV: - return new TabularResultWriter(OutputFormat.CSV.fileSuffix(), ',', true); + return new TabularResultWriter(OutputFormat.CSV.fileSuffix(), ',', compress); case VCF: + return new VcfResultWriter(compress); default: - throw new RuntimeException("Unsupported right now"); + throw new RuntimeException("The output format " + outputFormat + " is not supported"); } } diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/vcf/VcfResultWriter.java b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/vcf/VcfResultWriter.java new file mode 100644 index 00000000..cb4d0b87 --- /dev/null +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/vcf/VcfResultWriter.java @@ -0,0 +1,122 @@ +package org.jax.svanna.cli.writer.vcf; + +import htsjdk.samtools.util.BlockCompressedOutputStream; +import htsjdk.tribble.TribbleException; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; +import htsjdk.variant.vcf.*; +import org.jax.svanna.cli.writer.AnalysisResults; +import org.jax.svanna.cli.writer.OutputFormat; +import org.jax.svanna.cli.writer.ResultWriter; +import org.jax.svanna.core.exception.LogUtils; +import org.jax.svanna.core.priority.SvPriority; +import org.jax.svanna.core.reference.Prioritized; +import org.jax.svanna.io.FullSvannaVariant; +import org.monarchinitiative.svart.CoordinateSystem; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Comparator; +import java.util.Optional; +import java.util.function.Function; + +public class VcfResultWriter implements ResultWriter { + + private static final Logger LOGGER = LoggerFactory.getLogger(VcfResultWriter.class); + + private static final String SVANNA_TAD_SV_FIELD_NAME = "TADSV"; + private static final VCFInfoHeaderLine TADSV_LINE = new VCFInfoHeaderLine( + SVANNA_TAD_SV_FIELD_NAME, + VCFHeaderLineCount.A, + VCFHeaderLineType.Float, + "SvAnna TAD-SV score for the variant"); + + private final boolean compress; + + public VcfResultWriter(boolean compress) { + this.compress = compress; + } + + /** + * Extend the header with INFO fields that are being added in this command. + * + * @return the extended header + */ + private static VCFHeader prepareVcfHeader(Path inputVcfPath) { + VCFHeader header; + try (VCFFileReader reader = new VCFFileReader(inputVcfPath, false)) { + header = reader.getFileHeader(); + } catch (TribbleException.MalformedFeatureFile e) { + // This happens when the input variants were not read from a VCF file but from e.g. a CSV file. In case we + // add another variant source in future + LOGGER.info("Creating a stub VCF header"); + header = new VCFHeader(); + header.setVCFHeaderVersion(VCFHeaderVersion.VCF4_2); + } + + // TADSV - float + header.addMetaDataLine(TADSV_LINE); + + return header; + } + + private static Function> addInfoField() { + return sv -> { + SvPriority svPriority = sv.svPriority(); + + VariantContext vc = sv.variantContext(); + if (vc == null) { + LogUtils.logDebug(LOGGER, "Cannot write VCF line for variant '{}' because variant context is missing. {}:{}{}>{}", + sv.id(), sv.contigName(), sv.startWithCoordinateSystem(CoordinateSystem.oneBased()), sv.ref(), sv.alt()); + return Optional.empty(); + } + + VariantContextBuilder builder = new VariantContextBuilder(vc); + if (svPriority == null || Double.isNaN(svPriority.getPriority())) + return Optional.of(builder.make()); + + return Optional.of(builder.attribute(SVANNA_TAD_SV_FIELD_NAME, svPriority.getPriority()) + .make()); + }; + } + + @Override + public void write(AnalysisResults analysisResults, String prefix) throws IOException { + Path inputVcfPath = Paths.get(analysisResults.variantSource()); + VCFHeader header = prepareVcfHeader(inputVcfPath); + + Path outPath = Paths.get(prefix + OutputFormat.VCF.fileSuffix() + (compress ? ".gz" : "")); + LogUtils.logInfo(LOGGER, "Writing VCF results into {}", outPath.toAbsolutePath()); + + try (VariantContextWriter writer = new VariantContextWriterBuilder() + .setOutputVCFStream(openOutputStream(outPath)) + .setReferenceDictionary(header.getSequenceDictionary()) + .unsetOption(Options.INDEX_ON_THE_FLY) + .build()) { + writer.writeHeader(header); + + analysisResults.variants().stream() + .filter(sv -> !Double.isNaN(sv.svPriority().getPriority())) + .sorted(Comparator.comparing(Prioritized::svPriority).reversed()) + .map(addInfoField()) + .filter(Optional::isPresent) + .map(Optional::get) + .forEachOrdered(writer::add); + } + } + + + private BufferedOutputStream openOutputStream(Path outputPath) throws IOException { + return compress + ? new BufferedOutputStream(new BlockCompressedOutputStream(outputPath.toFile())) + : new BufferedOutputStream(Files.newOutputStream(outputPath)); + } +} diff --git a/svanna-io/src/main/java/org/jax/svanna/io/FullSvannaVariant.java b/svanna-io/src/main/java/org/jax/svanna/io/FullSvannaVariant.java new file mode 100644 index 00000000..50a598a7 --- /dev/null +++ b/svanna-io/src/main/java/org/jax/svanna/io/FullSvannaVariant.java @@ -0,0 +1,10 @@ +package org.jax.svanna.io; + +import htsjdk.variant.variantcontext.VariantContext; +import org.jax.svanna.core.reference.SvannaVariant; + +public interface FullSvannaVariant extends SvannaVariant { + + VariantContext variantContext(); + +} diff --git a/svanna-core/src/main/java/org/jax/svanna/core/reference/BreakendedSvannaVariant.java b/svanna-io/src/main/java/org/jax/svanna/io/parse/BreakendedSvannaVariant.java similarity index 73% rename from svanna-core/src/main/java/org/jax/svanna/core/reference/BreakendedSvannaVariant.java rename to svanna-io/src/main/java/org/jax/svanna/io/parse/BreakendedSvannaVariant.java index 2e893084..7bf45d86 100644 --- a/svanna-core/src/main/java/org/jax/svanna/core/reference/BreakendedSvannaVariant.java +++ b/svanna-io/src/main/java/org/jax/svanna/io/parse/BreakendedSvannaVariant.java @@ -1,8 +1,12 @@ -package org.jax.svanna.core.reference; +package org.jax.svanna.io.parse; +import htsjdk.variant.variantcontext.VariantContext; import org.jax.svanna.core.filter.FilterResult; import org.jax.svanna.core.filter.FilterType; import org.jax.svanna.core.priority.SvPriority; +import org.jax.svanna.core.reference.VariantCallAttributes; +import org.jax.svanna.core.reference.Zygosity; +import org.jax.svanna.io.FullSvannaVariant; import org.monarchinitiative.svart.BaseBreakendVariant; import org.monarchinitiative.svart.Breakend; @@ -11,12 +15,13 @@ import java.util.Set; import java.util.concurrent.atomic.AtomicReference; -public final class BreakendedSvannaVariant extends BaseBreakendVariant implements SvannaVariant { +public final class BreakendedSvannaVariant extends BaseBreakendVariant implements FullSvannaVariant { private final VariantCallAttributes variantCallAttributes; private final Set passedFilterTypes; private final Set failedFilterTypes; - private final AtomicReference priority = new AtomicReference<>(); + private final AtomicReference priority; + private final VariantContext variantContext; private BreakendedSvannaVariant(String eventId, Breakend left, @@ -25,28 +30,37 @@ private BreakendedSvannaVariant(String eventId, String alt, VariantCallAttributes variantCallAttributes, Set passedFilterTypes, - Set failedFilterTypes) { + Set failedFilterTypes, + AtomicReference priority, + VariantContext variantContext) { + super(eventId, left, right, ref, alt); this.variantCallAttributes = Objects.requireNonNull(variantCallAttributes); this.passedFilterTypes = passedFilterTypes; this.failedFilterTypes = failedFilterTypes; + this.priority = priority; + this.variantContext = variantContext; } private BreakendedSvannaVariant(Builder builder) { super(builder); - variantCallAttributes = builder.variantCallAttributes; + variantCallAttributes = Objects.requireNonNull(builder.variantCallAttributes); passedFilterTypes = builder.passedFilterTypes; failedFilterTypes = builder.failedFilterTypes; + priority = builder.priority; + variantContext = builder.variantContext; } public static BreakendedSvannaVariant of(String eventId, - Breakend left, - Breakend right, - String ref, - String alt, - VariantCallAttributes variantCallAttributes) { - return new BreakendedSvannaVariant(eventId, left, right, ref, alt, variantCallAttributes, new HashSet<>(), new HashSet<>()); + Breakend left, + Breakend right, + String ref, + String alt, + VariantCallAttributes variantCallAttributes, + SvPriority svPriority, + VariantContext variantContext) { + return new BreakendedSvannaVariant(eventId, left, right, ref, alt, variantCallAttributes, new HashSet<>(), new HashSet<>(), new AtomicReference<>(svPriority), variantContext); } public static Builder builder() { @@ -55,7 +69,7 @@ public static Builder builder() { @Override protected BreakendedSvannaVariant newBreakendVariantInstance(String eventId, Breakend left, Breakend right, String ref, String alt) { - return new BreakendedSvannaVariant(eventId, left, right, ref, alt, variantCallAttributes, passedFilterTypes, failedFilterTypes); + return new BreakendedSvannaVariant(eventId, left, right, ref, alt, variantCallAttributes, passedFilterTypes, failedFilterTypes, priority, variantContext); } /* @@ -123,18 +137,23 @@ public SvPriority svPriority() { return priority.get(); } + @Override + public VariantContext variantContext() { + return variantContext; + } + @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; BreakendedSvannaVariant that = (BreakendedSvannaVariant) o; - return Objects.equals(variantCallAttributes, that.variantCallAttributes) && Objects.equals(passedFilterTypes, that.passedFilterTypes) && Objects.equals(failedFilterTypes, that.failedFilterTypes) && Objects.equals(priority.get(), that.priority.get()); + return Objects.equals(variantCallAttributes, that.variantCallAttributes) && Objects.equals(passedFilterTypes, that.passedFilterTypes) && Objects.equals(failedFilterTypes, that.failedFilterTypes) && Objects.equals(priority.get(), that.priority.get()) && Objects.equals(variantContext, that.variantContext); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), variantCallAttributes, passedFilterTypes, failedFilterTypes, priority); + return Objects.hash(super.hashCode(), variantCallAttributes, passedFilterTypes, failedFilterTypes, priority, variantContext); } @Override @@ -144,15 +163,17 @@ public String toString() { ", passedFilterTypes=" + passedFilterTypes + ", failedFilterTypes=" + failedFilterTypes + ", priority=" + priority.get() + + ", variantContext=" + variantContext + "} " + super.toString(); } public static class Builder extends BaseBreakendVariant.Builder { - private VariantCallAttributes variantCallAttributes; private final Set passedFilterTypes = new HashSet<>(); private final Set failedFilterTypes = new HashSet<>(); - + private final AtomicReference priority = new AtomicReference<>(); + private VariantCallAttributes variantCallAttributes; + private VariantContext variantContext; public Builder variantCallAttributes(VariantCallAttributes variantCallAttributes) { this.variantCallAttributes = variantCallAttributes; @@ -170,6 +191,16 @@ public Builder addFilterResult(FilterResult filterResult) { return self(); } + public Builder priority(SvPriority svPriority) { + priority.set(svPriority); + return self(); + } + + public Builder variantContext(VariantContext variantContext) { + this.variantContext = variantContext; + return self(); + } + @Override public BreakendedSvannaVariant build() { return new BreakendedSvannaVariant(self()); diff --git a/svanna-core/src/main/java/org/jax/svanna/core/reference/DefaultSvannaVariant.java b/svanna-io/src/main/java/org/jax/svanna/io/parse/DefaultSvannaVariant.java similarity index 74% rename from svanna-core/src/main/java/org/jax/svanna/core/reference/DefaultSvannaVariant.java rename to svanna-io/src/main/java/org/jax/svanna/io/parse/DefaultSvannaVariant.java index 7516cd88..c1913379 100644 --- a/svanna-core/src/main/java/org/jax/svanna/core/reference/DefaultSvannaVariant.java +++ b/svanna-io/src/main/java/org/jax/svanna/io/parse/DefaultSvannaVariant.java @@ -1,8 +1,12 @@ -package org.jax.svanna.core.reference; +package org.jax.svanna.io.parse; +import htsjdk.variant.variantcontext.VariantContext; import org.jax.svanna.core.filter.FilterResult; import org.jax.svanna.core.filter.FilterType; import org.jax.svanna.core.priority.SvPriority; +import org.jax.svanna.core.reference.VariantCallAttributes; +import org.jax.svanna.core.reference.Zygosity; +import org.jax.svanna.io.FullSvannaVariant; import org.monarchinitiative.svart.*; import java.util.HashSet; @@ -10,12 +14,13 @@ import java.util.Set; import java.util.concurrent.atomic.AtomicReference; -public final class DefaultSvannaVariant extends BaseVariant implements SvannaVariant { +public final class DefaultSvannaVariant extends BaseVariant implements FullSvannaVariant { private final VariantCallAttributes variantCallAttributes; private final Set passedFilterTypes; private final Set failedFilterTypes; - private final AtomicReference priority = new AtomicReference<>(); + private final AtomicReference priority; + private final VariantContext variantContext; private DefaultSvannaVariant(Contig contig, String id, @@ -28,12 +33,16 @@ private DefaultSvannaVariant(Contig contig, int changeLength, VariantCallAttributes variantCallAttributes, Set passedFilterTypes, - Set failedFilterTypes) { + Set failedFilterTypes, + AtomicReference priority, + VariantContext variantContext) { // for creating a novel instance from an existing instance in `newVariantInstance` super(contig, id, strand, coordinateSystem, startPosition, endPosition, ref, alt, changeLength); this.variantCallAttributes = Objects.requireNonNull(variantCallAttributes); this.passedFilterTypes = passedFilterTypes; this.failedFilterTypes = failedFilterTypes; + this.priority = priority; + this.variantContext = variantContext; } private DefaultSvannaVariant(Builder builder) { @@ -41,6 +50,8 @@ private DefaultSvannaVariant(Builder builder) { variantCallAttributes = Objects.requireNonNull(builder.variantCallAttributes, "Variant call attributes cannot be null"); passedFilterTypes = builder.passedFilterTypes; failedFilterTypes = builder.failedFilterTypes; + priority = builder.priority; + variantContext = builder.variantContext; } public static Builder builder() { @@ -48,18 +59,20 @@ public static Builder builder() { } public static DefaultSvannaVariant of(Contig contig, - String id, - Strand strand, - CoordinateSystem coordinateSystem, - Position startPosition, - Position endPosition, - String ref, - String alt, - int changeLength, - VariantCallAttributes variantCallAttributes) { + String id, + Strand strand, + CoordinateSystem coordinateSystem, + Position startPosition, + Position endPosition, + String ref, + String alt, + int changeLength, + VariantCallAttributes variantCallAttributes, + SvPriority svPriority, + VariantContext variantContext) { return new DefaultSvannaVariant( contig, id, strand, coordinateSystem, startPosition, endPosition, ref, alt, changeLength, - variantCallAttributes, new HashSet<>(), new HashSet<>()); + variantCallAttributes, new HashSet<>(), new HashSet<>(), new AtomicReference<>(svPriority), variantContext); } @Override @@ -83,7 +96,9 @@ protected DefaultSvannaVariant newVariantInstance(Contig contig, changeLength, variantCallAttributes, passedFilterTypes, - failedFilterTypes); + failedFilterTypes, + priority, + variantContext); } @Override @@ -145,18 +160,23 @@ public SvPriority svPriority() { return priority.get(); } + @Override + public VariantContext variantContext() { + return variantContext; + } + @Override public boolean equals(Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; if (!super.equals(o)) return false; DefaultSvannaVariant that = (DefaultSvannaVariant) o; - return Objects.equals(variantCallAttributes, that.variantCallAttributes) && Objects.equals(passedFilterTypes, that.passedFilterTypes) && Objects.equals(failedFilterTypes, that.failedFilterTypes) && Objects.equals(priority, that.priority); + return Objects.equals(variantCallAttributes, that.variantCallAttributes) && Objects.equals(passedFilterTypes, that.passedFilterTypes) && Objects.equals(failedFilterTypes, that.failedFilterTypes) && Objects.equals(priority, that.priority) && Objects.equals(variantContext, that.variantContext); } @Override public int hashCode() { - return Objects.hash(super.hashCode(), variantCallAttributes, passedFilterTypes, failedFilterTypes, priority); + return Objects.hash(super.hashCode(), variantCallAttributes, passedFilterTypes, failedFilterTypes, priority, variantContext); } @Override @@ -166,14 +186,17 @@ public String toString() { ", passedFilterTypes=" + passedFilterTypes + ", failedFilterTypes=" + failedFilterTypes + ", priority=" + priority.get() + + ", variantContext=" + variantContext + "} " + super.toString(); } public static class Builder extends BaseVariant.Builder { - private VariantCallAttributes variantCallAttributes; private final Set passedFilterTypes = new HashSet<>(); private final Set failedFilterTypes = new HashSet<>(); + private final AtomicReference priority = new AtomicReference<>(); + private VariantCallAttributes variantCallAttributes; + private VariantContext variantContext; public Builder variantCallAttributes(VariantCallAttributes variantCallAttributes) { this.variantCallAttributes = variantCallAttributes; @@ -191,6 +214,16 @@ public Builder addFilterResult(FilterResult filterResult) { return self(); } + public Builder priority(SvPriority svPriority) { + priority.set(svPriority); + return self(); + } + + public Builder variantContext(VariantContext variantContext) { + this.variantContext = variantContext; + return self(); + } + @Override public DefaultSvannaVariant build() { return new DefaultSvannaVariant(selfWithEndIfMissing()); diff --git a/svanna-io/src/main/java/org/jax/svanna/io/parse/MergedVariantParser.java b/svanna-io/src/main/java/org/jax/svanna/io/parse/MergedVariantParser.java deleted file mode 100644 index 6a075541..00000000 --- a/svanna-io/src/main/java/org/jax/svanna/io/parse/MergedVariantParser.java +++ /dev/null @@ -1,252 +0,0 @@ -package org.jax.svanna.io.parse; - -import org.monarchinitiative.svart.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.text.NumberFormat; -import java.text.ParseException; -import java.util.*; -import java.util.function.Function; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -/** - * This parser expects to get path to a BED-like file with records that describe symbolic variants only. - */ -@Deprecated(forRemoval = true) -public class MergedVariantParser implements VariantParser { - - private static final Logger LOGGER = LoggerFactory.getLogger(MergedVariantParser.class); - - private static final int MISSING_DEPTH_PLACEHOLDER = -1; - - /** - * The parser supports these symbolic variant types. - */ - private static final Set SUPPORTED_TYPES = Set.of(VariantType.DEL, VariantType.DUP, VariantType.INV); - - - private static final NumberFormat NF = NumberFormat.getInstance(); - private static final String DELIMITER = "\t"; - private final GenomicAssembly assembly; - - public MergedVariantParser(GenomicAssembly assembly) { - this.assembly = assembly; - } - - private static Map parseData(String callerData, String infoColumnString) { - // callers: e.g. `pbsv:sniffles:svim` - // infoColumn: e.g. `pbsv.DEL.30582;15;PASS;1/1:96836;33;PASS;1/1:svim.DEL.24308;35;PASS;1/1` - String[] callers = callerData.split(":"); - String[] splits = infoColumnString.split(":"); - if (callers.length != splits.length) { - return Map.of(); - } - Map map = new HashMap<>(); - for (int i = 0; i < callers.length; i++) { - Caller caller = Caller.valueOf(callers[i]); - String[] fields = splits[i].split(";"); - RecordData recordData; - try { - recordData = new RecordData(fields[0], NF.parse(fields[1]).intValue(), fields[2], fields[3]); - } catch (ParseException e) { - LOGGER.warn("Invalid depth `{}`", fields[1]); - continue; - } - map.put(caller, recordData); - } - - return map; - } - - @Override - public Stream createVariantAlleles(Path filePath) throws IOException { - /* Tab-delimited file with entries like: - CM000663.2 1080765 1080860 DEL pbsv:sniffles:svim SAMPLE1 pbsv.DEL.50;10;PASS;0/1:122;24;PASS;1/1:svim.DEL.166;24;PASS;0/1 - CM000663.2 1099137 1099828 DUP sniffles:svim SAMPLE1 125;9;PASS;0/0:svim.DUP_TANDEM.17;5;PASS;./. - CM000663.2 2031552 2031782 INV sniffles:svim PID1048 245;4;PASS;0/0:svim.INV.3;3;PASS;./. - */ - BufferedReader reader = Files.newBufferedReader(filePath); - return reader.lines() - .map(toVariant()) - .filter(Optional::isPresent) - .map(Optional::get) - .onClose(() -> { - try { - reader.close(); - } catch (IOException e) { - LOGGER.warn("Error closing the file {}", filePath); - } - }); - } - - private Function> toVariant() { - return line -> { - // line like: - // CM000663.2 1080765 1080860 DEL pbsv:sniffles:svim SAMPLE1 pbsv.DEL.50;10;PASS;0/1:122;24;PASS;1/1:svim.DEL.166;24;PASS;0/1 - - String[] tokens = line.split(DELIMITER); - - return makeCoreCoords(tokens).map(cco -> { - int changeLength; - VariantType vt = cco.variantType; - switch (vt.baseType()) { - case DEL: - changeLength = cco.begin.pos() - cco.end.pos(); - break; - case DUP: - changeLength = cco.end.pos() - cco.begin.pos(); - break; - case INV: - changeLength = 0; - break; - default: - // shouldn't happen since we check in `makeCoreCoords()` - LOGGER.warn("Unsupported variant type {}", vt); - throw new RuntimeException(); - } - return Variant.of(cco.contig, cco.id, Strand.POSITIVE, CoordinateSystem.zeroBased(), cco.begin, cco.end, "", '<' + vt.baseType().name() + '>', changeLength); - }); - }; - - } - - private Optional makeCoreCoords(String[] tokens) { - Contig contig = assembly.contigByName(tokens[0]); - if (contig.isUnknown()) { - LOGGER.warn("Unknown contig {} in line {}", tokens[0], String.join(DELIMITER, tokens)); - return Optional.empty(); - } - - VariantType variantType = VariantType.parseType(tokens[3]); - if (!SUPPORTED_TYPES.contains(variantType.baseType())) { - LOGGER.warn("Unsupported variant type: `{}`", variantType); - return Optional.empty(); - } - - // coordinates - Position begin, end; - try { - int beginPos = NF.parse(tokens[1]).intValue(); - begin = Position.of(beginPos); - int endPos = NF.parse(tokens[2]).intValue(); - end = Position.of(endPos); - } catch (ParseException e) { - LOGGER.warn("Invalid begin/end coordinate in line {}", String.join(DELIMITER, tokens)); - return Optional.empty(); - } - Map data = parseData(tokens[4], tokens[6]); - - return Optional.of(new CoreCoords(contig, begin, end, data, variantType)); - } - - private enum Caller { - pbsv, - sniffles, - svim; - - public String shortName() { - // get `sv`, `sn`, `pb` - return name().substring(0, 2); - } - } - - - private static class CoreCoords { - private final Contig contig; - private final Position begin, end; - private final Map dataMap; - private final VariantType variantType; - private final String id; - - private CoreCoords(Contig contig, Position begin, Position end, Map dataMap, VariantType variantType) { - this.contig = contig; - this.begin = begin; - this.end = end; - this.dataMap = dataMap; - this.id = dataMap.entrySet().stream() - .map(e -> String.format("%s(%s)", e.getKey().shortName(), e.getValue().id())) - .sorted() - .collect(Collectors.joining(";")); - this.variantType = variantType; - } - - int medianDepth() { - List depths = dataMap.values().stream() - .map(RecordData::depth) - .sorted() - .collect(Collectors.toList()); - - if (depths.isEmpty()) { - return -1; - } - int middle = depths.size() / 2; - if (depths.size() % 2 == 0) { - // even number of depths - return rounded arithmetic mean of the two middle elements - return Math.round(((float) depths.get(middle - 1) + depths.get(middle)) / 2); - } else { - return depths.get(middle); - } - } - - int minDepth() { - return dataMap.values().stream() - .mapToInt(RecordData::depth) - .min() - .orElse(MISSING_DEPTH_PLACEHOLDER); - } - } - - private static class RecordData { - - private final String id; - private final int depth; - private final String filter; - private final String genotype; - - - private RecordData(String id, int depth, String filter, String genotype) { - this.id = id; - this.depth = depth; - this.filter = filter; - this.genotype = genotype; - } - - public String id() { - return id; - } - - public int depth() { - return depth; - } - - public String filter() { - return filter; - } - - public String genotype() { - return genotype; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - RecordData that = (RecordData) o; - return depth == that.depth && - Objects.equals(id, that.id) && - Objects.equals(filter, that.filter) && - Objects.equals(genotype, that.genotype); - } - - @Override - public int hashCode() { - return Objects.hash(id, depth, filter, genotype); - } - } -} diff --git a/svanna-io/src/main/java/org/jax/svanna/io/parse/Utils.java b/svanna-io/src/main/java/org/jax/svanna/io/parse/Utils.java deleted file mode 100644 index f5979e39..00000000 --- a/svanna-io/src/main/java/org/jax/svanna/io/parse/Utils.java +++ /dev/null @@ -1,15 +0,0 @@ -package org.jax.svanna.io.parse; - -import htsjdk.variant.variantcontext.VariantContext; - -class Utils { - - private Utils() { - // static utility class - } - - static String makeVariantRepresentation(VariantContext vc) { - return String.format("%s-%d:(%s)", vc.getContig(), vc.getStart(), vc.getID()); - } - -} diff --git a/svanna-io/src/main/java/org/jax/svanna/io/parse/VariantCallAttributeParser.java b/svanna-io/src/main/java/org/jax/svanna/io/parse/VariantCallAttributeParser.java index a8aa9bc0..fbda5a95 100644 --- a/svanna-io/src/main/java/org/jax/svanna/io/parse/VariantCallAttributeParser.java +++ b/svanna-io/src/main/java/org/jax/svanna/io/parse/VariantCallAttributeParser.java @@ -13,17 +13,11 @@ class VariantCallAttributeParser { private static final Logger LOGGER = LoggerFactory.getLogger(VariantCallAttributeParser.class); - private static final VariantCallAttributeParser INSTANCE = new VariantCallAttributeParser(); - - static VariantCallAttributeParser getInstance() { - return INSTANCE; - } - private VariantCallAttributeParser() { // private no-op } - VariantCallAttributes parseAttributes(Map attributes, Genotype genotype) { + static VariantCallAttributes parseAttributes(Map attributes, Genotype genotype) { VariantCallAttributes.Builder builder = VariantCallAttributes.builder(); // first, parse zygosity diff --git a/svanna-io/src/main/java/org/jax/svanna/io/parse/VcfVariantParser.java b/svanna-io/src/main/java/org/jax/svanna/io/parse/VcfVariantParser.java index 1e6043d3..fd5dd226 100644 --- a/svanna-io/src/main/java/org/jax/svanna/io/parse/VcfVariantParser.java +++ b/svanna-io/src/main/java/org/jax/svanna/io/parse/VcfVariantParser.java @@ -11,10 +11,8 @@ import org.jax.svanna.core.exception.LogUtils; import org.jax.svanna.core.filter.FilterResult; import org.jax.svanna.core.filter.FilterType; -import org.jax.svanna.core.reference.BreakendedSvannaVariant; -import org.jax.svanna.core.reference.DefaultSvannaVariant; -import org.jax.svanna.core.reference.SvannaVariant; import org.jax.svanna.core.reference.VariantCallAttributes; +import org.jax.svanna.io.FullSvannaVariant; import org.monarchinitiative.svart.*; import org.monarchinitiative.svart.util.VariantTrimmer; import org.monarchinitiative.svart.util.VcfConverter; @@ -32,31 +30,19 @@ import java.util.function.Function; import java.util.stream.Stream; -import static org.jax.svanna.io.parse.Utils.makeVariantRepresentation; - /** * Parse variants stored in a VCF file. The parser is NOT thread safe! */ -public class VcfVariantParser implements VariantParser { +public class VcfVariantParser implements VariantParser { private static final Logger LOGGER = LoggerFactory.getLogger(VcfVariantParser.class); private static final FilterResult FAILED_VARIANT_FILTER_RESULT = FilterResult.fail(FilterType.FAILED_VARIANT_FILTER); - private final VariantCallAttributeParser attributeParser; - private final VcfConverter vcfConverter; - private final boolean requireVcfIndex; - public VcfVariantParser(GenomicAssembly assembly) { - this(assembly, false); - } - - public VcfVariantParser(GenomicAssembly assembly, boolean requireVcfIndex) { - this.attributeParser = VariantCallAttributeParser.getInstance(); this.vcfConverter = new VcfConverter(assembly, VariantTrimmer.leftShiftingTrimmer(VariantTrimmer.removingCommonBase())); - this.requireVcfIndex = requireVcfIndex; } /** @@ -74,8 +60,12 @@ private static Function> toVariantContext(VCFCo }; } + private static String makeVariantRepresentation(VariantContext vc) { + return String.format("%s-%d:(%s)", vc.getContig(), vc.getStart(), vc.getID()); + } + @Override - public Stream createVariantAlleles(Path filePath) throws IOException { + public Stream createVariantAlleles(Path filePath) throws IOException { /* Sniffles VCF contains corrupted VCF records like @@ -93,12 +83,12 @@ public Stream createVariantAlleles(Path filePath) throws IOExcept So, this is a workaround that drops the corrupted lines: */ VCFHeader header; - try (VCFFileReader reader = new VCFFileReader(filePath, requireVcfIndex)) { + try (VCFFileReader reader = new VCFFileReader(filePath, false)) { header = reader.getHeader(); } VCFCodec codec = new VCFCodec(); - codec.setVCFHeader(header, header.getVCFHeaderVersion() == null ? VCFHeaderVersion.VCF4_1: header.getVCFHeaderVersion()); + codec.setVCFHeader(header, header.getVCFHeaderVersion() == null ? VCFHeaderVersion.VCF4_1 : header.getVCFHeaderVersion()); BufferedReader reader; if (filePath.toFile().getName().endsWith(".gz")) @@ -124,7 +114,7 @@ public Stream createVariantAlleles(Path filePath) throws IOExcept * * @return function that maps variant context to collection of {@link Variant}s */ - Function> toVariants() { + protected Function> toVariants() { return vc -> { Contig contig = vcfConverter.parseContig(vc.getContig()); if (contig.isUnknown()) { @@ -158,8 +148,8 @@ Function> toVariants() { }; } - private Optional parseSequenceVariantAllele(VariantContext vc, Contig contig) { - VariantCallAttributes attrs = attributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(0)); + private Optional parseSequenceVariantAllele(VariantContext vc, Contig contig) { + VariantCallAttributes attrs = VariantCallAttributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(0)); DefaultSvannaVariant.Builder builder = vcfConverter.convert(DefaultSvannaVariant.builder(), contig, vc.getID(), vc.getStart(), @@ -170,10 +160,10 @@ private Optional parseSequenceVariantAllele(VariantCont if (!vc.getFilters().isEmpty()) builder.addFilterResult(FAILED_VARIANT_FILTER_RESULT); - return Optional.of(builder.variantCallAttributes(attrs).build()); + return Optional.of(builder.variantCallAttributes(attrs).variantContext(vc).build()); } - private Optional parseSymbolicVariantAllele(VariantContext vc, Contig contig) { + private Optional parseSymbolicVariantAllele(VariantContext vc, Contig contig) { // parse start pos and CIPOS ConfidenceInterval cipos; List cp = vc.getAttributeAsIntList("CIPOS", 0); @@ -234,17 +224,17 @@ private Optional parseSymbolicVariantAllele(VariantCont end = end.shift(-1); } - VariantCallAttributes variantCallAttributes = attributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(0)); + VariantCallAttributes variantCallAttributes = VariantCallAttributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(0)); DefaultSvannaVariant.Builder builder = vcfConverter.convertSymbolic(DefaultSvannaVariant.builder(), contig, vc.getID(), start, end, ref, alt, svlen); // we assume that `PASS` is not added in between variant context's filters by HtsJDK, // and all the other values denote low quality variants if (!vc.getFilters().isEmpty()) builder.addFilterResult(FAILED_VARIANT_FILTER_RESULT); - return Optional.of(builder.variantCallAttributes(variantCallAttributes).build()); + return Optional.of(builder.variantCallAttributes(variantCallAttributes).variantContext(vc).build()); } - private Optional parseBreakendAllele(VariantContext vc, Contig contig) { + private Optional parseBreakendAllele(VariantContext vc, Contig contig) { // sanity checks if (vc.getAlternateAlleles().size() > 1) { if (LOGGER.isWarnEnabled()) @@ -280,7 +270,7 @@ private Optional parseBreakendAllele(VariantContext vc, String mateId = vc.getAttributeAsString("MATEID", ""); String eventId = vc.getAttributeAsString("EVENT", ""); - VariantCallAttributes attrs = attributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(0)); + VariantCallAttributes attrs = VariantCallAttributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(0)); BreakendedSvannaVariant.Builder builder = vcfConverter.convertBreakend(BreakendedSvannaVariant.builder(), contig, vc.getID(), pos, vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), @@ -291,7 +281,7 @@ private Optional parseBreakendAllele(VariantContext vc, if (!vc.getFilters().isEmpty()) builder.addFilterResult(FilterResult.fail(FilterType.FAILED_VARIANT_FILTER)); - return Optional.of(builder.variantCallAttributes(attrs).build()); + return Optional.of(builder.variantCallAttributes(attrs).variantContext(vc).build()); } } diff --git a/svanna-io/src/test/java/org/jax/svanna/io/parse/MergedVariantParserTest.java b/svanna-io/src/test/java/org/jax/svanna/io/parse/MergedVariantParserTest.java deleted file mode 100644 index 4dd6f4e7..00000000 --- a/svanna-io/src/test/java/org/jax/svanna/io/parse/MergedVariantParserTest.java +++ /dev/null @@ -1,70 +0,0 @@ -package org.jax.svanna.io.parse; - -import org.jax.svanna.io.TestDataConfig; -import org.junit.jupiter.api.Test; -import org.monarchinitiative.svart.*; -import org.springframework.boot.test.context.SpringBootTest; - -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.List; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.equalTo; -import static org.hamcrest.Matchers.hasSize; - -@SpringBootTest(classes = TestDataConfig.class) -public class MergedVariantParserTest { - - private static final Path SV_EXAMPLE_PATH = Paths.get("src/test/resources/org/jax/svanna/io/parse/sv_small_merged.bed"); - - private final GenomicAssembly GRCh38p13= GenomicAssemblies.GRCh38p13(); - - @Test - public void createVariantAlleleList() throws Exception { - MergedVariantParser instance = new MergedVariantParser(GRCh38p13); - - List alleles = instance.createVariantAlleleList(SV_EXAMPLE_PATH); - - assertThat(alleles, hasSize(3)); - - Variant deletion = alleles.get(0); - assertThat(deletion.contig(), equalTo(GRCh38p13.contigByName("4"))); - assertThat(deletion.id(), equalTo("pb(pbsv.DEL.30582);sn(96836);sv(svim.DEL.24308)")); - assertThat(deletion.strand(), equalTo(Strand.POSITIVE)); - assertThat(deletion.coordinateSystem(), equalTo(CoordinateSystem.zeroBased())); - assertThat(deletion.startPosition(), equalTo(Position.of(87_615_924))); - assertThat(deletion.endPosition(), equalTo(Position.of(87_616_059))); - - assertThat(deletion.ref(), equalTo("")); - assertThat(deletion.alt(), equalTo("")); - assertThat(deletion.variantType(), equalTo(VariantType.DEL)); - assertThat(deletion.length(), equalTo(87_616_059-87_615_924)); - assertThat(deletion.changeLength(), equalTo(87_615_924 - 87_616_059)); - - Variant duplication = alleles.get(1); - assertThat(duplication.contigName(), equalTo("4")); - assertThat(duplication.startPosition().pos(), equalTo(89_180_724)); - assertThat(duplication.startPosition().isPrecise(), equalTo(true)); - assertThat(duplication.endPosition().pos(), equalTo(89_186_234)); - assertThat(duplication.endPosition().isPrecise(), equalTo(true)); - assertThat(duplication.id(), equalTo("pb(pbsv.SPLIT.DUP.30605);sn(96896);sv(svim.DUP_TANDEM.6669)")); - assertThat(duplication.ref(), equalTo("")); - assertThat(duplication.alt(), equalTo("")); - assertThat(duplication.variantType(), equalTo(VariantType.DUP)); - assertThat(duplication.length(), equalTo(89_186_234 - 89_180_724)); - assertThat(duplication.changeLength(), equalTo(89_186_234 - 89_180_724)); - - Variant inversion = alleles.get(2); - assertThat(inversion.contigName(), equalTo("4")); - assertThat(inversion.startPosition().pos(), equalTo(95_391_950)); - assertThat(inversion.startPosition().isPrecise(), equalTo(true)); - assertThat(inversion.endPosition().pos(), equalTo(95_392_557)); - assertThat(inversion.endPosition().isPrecise(), equalTo(true)); - assertThat(inversion.id(), equalTo("sn(97077);sv(svim.INV.1900)")); - assertThat(inversion.ref(), equalTo("")); - assertThat(inversion.alt(), equalTo("")); - assertThat(inversion.variantType(), equalTo(VariantType.INV)); - assertThat(inversion.changeLength(), equalTo(0)); - } -} \ No newline at end of file diff --git a/svanna-io/src/test/java/org/jax/svanna/io/parse/VariantCallAttributeParserTest.java b/svanna-io/src/test/java/org/jax/svanna/io/parse/VariantCallAttributeParserTest.java index 6801dc55..68f54fe7 100644 --- a/svanna-io/src/test/java/org/jax/svanna/io/parse/VariantCallAttributeParserTest.java +++ b/svanna-io/src/test/java/org/jax/svanna/io/parse/VariantCallAttributeParserTest.java @@ -7,7 +7,6 @@ import org.jax.svanna.core.reference.VariantCallAttributes; import org.jax.svanna.core.reference.Zygosity; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; @@ -26,8 +25,6 @@ public class VariantCallAttributeParserTest { /* VCF header with all fields that are present in VCF files generated by Sniffles, SVIM, and PBSV. */ private static final Path SV_EXAMPLE_PATH = Paths.get("src/test/resources/org/jax/svanna/io/parse/uber_header.vcf"); - private VariantCallAttributeParser parser; - @BeforeAll public static void beforeAll() { try (VCFFileReader reader = new VCFFileReader(SV_EXAMPLE_PATH, REQUIRE_INDEX)) { @@ -35,16 +32,11 @@ public static void beforeAll() { } } - @BeforeEach - public void setUp() { - parser = VariantCallAttributeParser.getInstance(); - } - @ParameterizedTest @MethodSource("provideSnifflesVariants") public void parseAttributes_Sniffles(VariantContext vc, int sampleIdx, Zygosity zygosity, int dp, int refDepth, int altDepth) { - VariantCallAttributes attributes = parser.parseAttributes(vc.getAttributes(), vc.getGenotype(sampleIdx)); + VariantCallAttributes attributes = VariantCallAttributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(sampleIdx)); assertThat(attributes.zygosity(), equalTo(zygosity)); assertThat(attributes.dp(), equalTo(dp)); @@ -65,7 +57,7 @@ private static Stream provideSnifflesVariants() { @MethodSource("providePbsvVariants") public void parseAttributes_Pbsv(VariantContext vc, int sampleIdx, Zygosity zygosity, int dp, int refDepth, int altDepth, int copyNumber) { - VariantCallAttributes attributes = parser.parseAttributes(vc.getAttributes(), vc.getGenotype(sampleIdx)); + VariantCallAttributes attributes = VariantCallAttributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(sampleIdx)); assertThat(attributes.zygosity(), equalTo(zygosity)); assertThat(attributes.dp(), equalTo(dp)); @@ -88,7 +80,7 @@ private static Stream providePbsvVariants() { @MethodSource("provideSvimVariants") public void parseAttributes_Svim(VariantContext vc, int sampleIdx, Zygosity zygosity, int dp, int refDepth, int altDepth, int copyNumber) { - VariantCallAttributes attributes = parser.parseAttributes(vc.getAttributes(), vc.getGenotype(sampleIdx)); + VariantCallAttributes attributes = VariantCallAttributeParser.parseAttributes(vc.getAttributes(), vc.getGenotype(sampleIdx)); assertThat(attributes.zygosity(), equalTo(zygosity)); assertThat(attributes.dp(), equalTo(dp)); diff --git a/svanna-io/src/test/java/org/jax/svanna/io/parse/VcfVariantParserTest.java b/svanna-io/src/test/java/org/jax/svanna/io/parse/VcfVariantParserTest.java index be185a68..dc8721a3 100644 --- a/svanna-io/src/test/java/org/jax/svanna/io/parse/VcfVariantParserTest.java +++ b/svanna-io/src/test/java/org/jax/svanna/io/parse/VcfVariantParserTest.java @@ -6,6 +6,7 @@ import htsjdk.variant.vcf.VCFHeaderVersion; import org.jax.svanna.core.reference.SvannaVariant; import org.jax.svanna.core.reference.Zygosity; +import org.jax.svanna.io.FullSvannaVariant; import org.jax.svanna.io.TestDataConfig; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.DisplayName; @@ -46,7 +47,7 @@ public class RealLife { @Test public void createVariantList() throws Exception { - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); List variants = instance.createVariantAlleleList(SV_EXAMPLE_PATH); @@ -80,9 +81,9 @@ public void createVariantList() throws Exception { @Test public void createVariantList_Pbsv() throws Exception{ - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); - List variants = instance.createVariantAlleleList(Paths.get("src/test/resources/org/jax/svanna/io/parse/pbsv.vcf")); + List variants = instance.createVariantAlleleList(Paths.get("src/test/resources/org/jax/svanna/io/parse/pbsv.vcf")); assertThat(variants, hasSize(6)); assertThat(variants.stream().map(Variant::variantType).collect(toSet()), @@ -143,9 +144,9 @@ public void createVariantList_Pbsv() throws Exception{ @Test public void createVariantList_Svim() throws Exception { - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); - List variants = instance.createVariantAlleleList(Paths.get("src/test/resources/org/jax/svanna/io/parse/svim.vcf")); + List variants = instance.createVariantAlleleList(Paths.get("src/test/resources/org/jax/svanna/io/parse/svim.vcf")); assertThat(variants, hasSize(6)); assertThat(variants.stream().map(Variant::variantType).collect(toSet()), @@ -200,9 +201,9 @@ public void createVariantList_Svim() throws Exception { @Test public void createVariantList_Sniffles() throws Exception { - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); - List variants = instance.createVariantAlleleList(Paths.get("src/test/resources/org/jax/svanna/io/parse/sniffles.vcf")); + List variants = instance.createVariantAlleleList(Paths.get("src/test/resources/org/jax/svanna/io/parse/sniffles.vcf")); assertThat(variants, hasSize(6)); assertThat(variants.stream().map(Variant::variantType).collect(toSet()), @@ -238,7 +239,7 @@ public void createVariantList_Sniffles() throws Exception { @Test public void toVariants_multiallelicBreakendVariant() { - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); String line = "2\t321681\tbnd_W\tG\tG]17:198982],C\t6\tPASS\tSVTYPE=BND;MATEID=bnd_Y;EVENT=tra1\tGT\t./."; VariantContext vc = VCF_CODEC.decode(line); @@ -249,7 +250,7 @@ public void toVariants_multiallelicBreakendVariant() { @Test public void toVariants_multiallelicSymbolicVariant() { - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); String line = "2\t321682\tdel0\tT\t,C\t6\tPASS\tSVTYPE=DEL;END=321887;SVLEN=-205;CIPOS=-56,20;CIEND=-10,62\tGT:GQ:DP\t0/1:12:11"; VariantContext vc = VCF_CODEC.decode(line); @@ -260,7 +261,7 @@ public void toVariants_multiallelicSymbolicVariant() { @Test public void toVariants_symbolic_unknownContig() { - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); String line = "bacon\t12665100\tdup0\tA\t\t14\tPASS\tSVTYPE=DUP;END=12686200;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500;DP=5\tGT:GQ:CN:CNQ\t./.:0:3:16.2"; VariantContext vc = VCF_CODEC.decode(line); @@ -271,7 +272,7 @@ public void toVariants_symbolic_unknownContig() { @Test public void toVariants_sequence_unknownContig() { - VcfVariantParser instance = new VcfVariantParser(GRCh38p13, false); + VcfVariantParser instance = new VcfVariantParser(GRCh38p13); String line = "bacon\t14370\trs6054257\tG\tA\t29\tPASS\tDP=14;AF=0.5;DB\tGT:GQ:DP\t1/1:43:5"; VariantContext vc = VCF_CODEC.decode(line); @@ -288,7 +289,7 @@ public class ToyTests { @Test public void toVariants_symbolicDeletion() { GenomicAssembly assembly = testAssembly(List.of(TestContig.of(1, 10), TestContig.of(2, 20))); - VcfVariantParser parser = new VcfVariantParser(assembly, false); + VcfVariantParser parser = new VcfVariantParser(assembly); String line = "1\t2\tdel0\tT\t\t6\tPASS\tSVTYPE=DEL;END=7;SVLEN=-5;CIPOS=-1,2;CIEND=-2,1\tGT:GQ:DP:AD\t0/1:12:11:6,5"; @@ -321,7 +322,7 @@ public void toVariants_symbolicDeletion() { @Test public void toVariants_sequenceVariant() { GenomicAssembly assembly = testAssembly(List.of(TestContig.of(1, 10), TestContig.of(2, 20))); - VcfVariantParser parser = new VcfVariantParser(assembly, false); + VcfVariantParser parser = new VcfVariantParser(assembly); String line = "1\t2\tdel1\tTTC\tTT\t6\tPASS\tAF=0.5\tGT:GQ:DP:AD\t0/1:12:11:6,5"; VariantContext vc = VCF_CODEC.decode(line); @@ -354,7 +355,7 @@ public void toVariants_sequenceVariant() { @Test public void toVariants_breakendVariant() { GenomicAssembly assembly = testAssembly(List.of(TestContig.of(1, 10), TestContig.of(2, 20))); - VcfVariantParser parser = new VcfVariantParser(assembly, false); + VcfVariantParser parser = new VcfVariantParser(assembly); String line = "1\t3\tbnd_V\tT\t]2:16]AAGT\t6\tPASS\tSVTYPE=BND;CIPOS=-1,2;CIEND=-2,1;MATEID=bnd_U;EVENT=tra2\tGT\t./."; VariantContext vc = VCF_CODEC.decode(line); From e9b98d3a1e8fa0a69f9a6c7f2801201949e819df Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 11 Jul 2021 21:08:00 -0400 Subject: [PATCH 18/40] Fix branch merging error --- .../src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java | 1 + 1 file changed, 1 insertion(+) diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java index 6f77cc70..db707e26 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java @@ -19,6 +19,7 @@ import org.jax.svanna.core.reference.SvannaVariant; import org.jax.svanna.core.reference.Zygosity; import org.jax.svanna.io.FullSvannaVariant; +import org.jax.svanna.io.parse.VariantParser; import org.jax.svanna.io.parse.VcfVariantParser; import org.monarchinitiative.phenol.ontology.data.Term; import org.monarchinitiative.phenol.ontology.data.TermId; From d781d3dc1d60fea6d0be40598e6680e44bd4a0c1 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 11 Jul 2021 22:42:21 -0400 Subject: [PATCH 19/40] Describe output formats, change header of the tabular files, deprecate max length filter --- docs/index.rst | 7 +- docs/outputformats.rst | 65 ++++++++++++++++++- docs/quickstart.rst | 60 ++++++++++++----- docs/running.rst | 7 +- docs/setup.rst | 3 +- .../java/org/jax/svanna/benchmark/Main.java | 4 +- .../main/java/org/jax/svanna/cli/Main.java | 2 +- .../jax/svanna/cli/cmd/PrioritizeCommand.java | 3 +- .../writer/tabular/TabularResultWriter.java | 2 +- .../jax/svanna/core/filter/FilterType.java | 1 + 10 files changed, 126 insertions(+), 28 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 4fe976fc..81dcfd4c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,7 +16,8 @@ of germline variants. setup running outputformats - structuralvariation - enhancers - BND + +.. structuralvariation +.. enhancers +.. BND \ No newline at end of file diff --git a/docs/outputformats.rst b/docs/outputformats.rst index 8fce4e1c..0efa2b5c 100644 --- a/docs/outputformats.rst +++ b/docs/outputformats.rst @@ -4,4 +4,67 @@ Output formats ============== -.. TODO - Write sections +SvAnna supports storing results in 4 output formats: *HTML*, *VCF* *CSV*, and *TSV*. Use the ``--output-format`` option +to select one or more of the desired output formats (e.g. ``--output-format html,vcf``). + +HTML output format +^^^^^^^^^^^^^^^^^^ + +SvAnna creates an *HTML* file with the analysis summary and with variants sorted by the :math:`TAD_{SV}` score +in descending order. +By default, top 100 variants are included into the report. The number of the reported variants can be adjusted by +the ``--report-top-variants`` option. + +The report consists of several parts: + +* *Analysis summary* - Details of HPO terms of the proband, paths of the input files, and the analysis parameters. +* *Variant counts* - Breakdown of the number of the variant types of the different categories. +* *Prioritized SVs* - Visualizations of the prioritized variants. + +.. TODO - write more about the HTML report + +.. note:: + Only the variants that passed all the filters are visualized in the *Prioritized SVs* section + +The ``--no-breakends`` excludes breakend/translocation variants from the report. + +VCF output format +^^^^^^^^^^^^^^^^^ +When including ``vcf`` into the ``--output-format`` option, a VCF file with all input variants is created. +The prioritization adds a novel *INFO* field to each variant: + +* ``TADSV`` - an *INFO* field containing :math:`TAD_{SV}` score for the variant. + +.. note:: + * ``--report-top-variants`` option has no effect for the *VCF* output format. + * add ``--uncompressed-output`` flag if you want to get uncompressed VCF file + + +CSV/TSV output format +^^^^^^^^^^^^^^^^^^^^^ +To write *n* most deleterious variants into a *CSV* (or *TSV*) file, use ``csv`` (``tsv``) in the ``--output-format`` option. + +The results are written into a tabular file with the following columns: + +* *contig* - name of the contig/chromosome (e.g. ``1``, ``2``, ``X``) +* *start* - 0-based start coordinate (excluded) of the variant on positive strand +* *end* - 0-based end coordinate (included) of the variant on positive strand +* *id* - variant ID as it was present in the input VCF file +* *vtype* - variant type, one of {``DEL``, ``DUP``, ``INV``, ``INS``, ``BND``, ``CNV``} +* *failed_filters* - the names of filters that the variant failed to pass. The names are separated by semicolon (``;``) + * ``filter`` - the variant failed previous VCF filters - at least one filter flag is present in the variant VCF line, except for ``PASS``. + * ``coverage`` - the variant is supported by less reads than specified by ``--min-read-support`` option +* *tadsv* - the :math:`TAD_{SV}` score value + +.. table:: Tabular output + + ======== ========= ========== ====== ======= ================= ===================== + contig start end id vtype failed_filters tadsv + ======== ========= ========== ====== ======= ================= ===================== + 11 31130456 31671718 abcd DEL 109.75766900764305 + 18 46962113 46969912 efgh DUP filter;coverage 3.2 + ... ... ... ... ... ... ... + ======== ========= ========== ====== ======= ================= ===================== + +.. note:: + add ``--uncompressed-output`` flag if you want to get uncompressed tabular file \ No newline at end of file diff --git a/docs/quickstart.rst b/docs/quickstart.rst index 7c5a7c4b..00ddf11f 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -10,7 +10,7 @@ Prerequisites ^^^^^^^^^^^^^ SvAnna is written in Java 11 and needs Java 11+ to be present in the runtime environment. Please verify that you are -running Java 11+ by running:: +using Java 11+ by running:: java -version @@ -18,16 +18,23 @@ running Java 11+ by running:: SvAnna setup ^^^^^^^^^^^^ -Run the following commands to setup SvAnna. +SvAnna is install by running the following three steps. -Download and extract SvAnna distribution ZIP archive from `here `_. Then, run:: +1. Download SvAnna distribution ZIP +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Download and extract SvAnna distribution ZIP archive from `here `_. +Expand the *Assets* menu and download the ``svanna-cli-{version}-distribution.zip``. Choose the latest stable version, +or a release candidate (RC). + +After unzipping the distribution archive, run the following command to display the help message:: java -jar svanna-cli-1.0.0-RC1.jar --help .. note:: - If things went well, the command above should print the following help message:: + If things went OK, the command above will print the following help message:: - Structural variant annotation + Structural variant prioritization Usage: svanna-cli.jar [-hV] [COMMAND] -h, --help Show this help message and exit. -V, --version Print version information and exit. @@ -36,38 +43,59 @@ Download and extract SvAnna distribution ZIP archive from `here `_) +receives the highest :math:`TAD_{SV}` score of 25.61, and the variant is placed on rank 1. -SvAnna will prioritize the variants and store the results in HTML file next to the VCF file. +SvAnna stores prioritization results in *HTML*, *CSV*, and *VCF* output formats next to the input VCF file. -Read the :ref:`rstrunning` section to learn about additional options SvAnna offers. +Read the :ref:`rstsetup` and :ref:`rstrunning` sections to learn all details regarding setting up and running SvAnna. diff --git a/docs/running.rst b/docs/running.rst index 8b1f6a0e..8230f00e 100644 --- a/docs/running.rst +++ b/docs/running.rst @@ -22,9 +22,9 @@ stored in one or more :ref:`rstoutputformats`. To prioritize variants in the `example.vcf`_ file (an example VCF file with 8 variants stored in SvAnna repository), run:: - java -jar svanna-cli.jar prioritize --config svanna-config.yaml --vcf example.vcf --term HP:0011890 --term HP:0000978 --term HP:0012147 + java -jar svanna-cli.jar prioritize --config svanna-config.yaml --vcf example.vcf --term HP:0011890 --term HP:0000978 --term HP:0012147 --prefix /path/to/output -After the annotation, the results are stored at ``output.html``. +After the annotation, the results are stored at ``/path/to/output.html``. Mandatory arguments ~~~~~~~~~~~~~~~~~~~ @@ -57,6 +57,9 @@ SvAnna allows to fine-tune the prioritization using the following *optional* opt * ``--output-format`` - comma separated list of output formats to use for writing the results. See :ref:`rstoutputformats` section for available output formats (default ``html``) * ``--prefix`` - prefix for output files (default: based on the input VCF name) * ``--report-top-variants`` - report top *n* variants in the HTML report (default: ``100``) +* ``--uncompressed-output`` - the tabular and VCF output files are compressed by default. Use this flag if you want to disable compressing the output files (default: ``false``) +See the next section to learn more about the SvAnna :ref:`rstoutputformats`. + .. _example.vcf: https://github.com/TheJacksonLaboratory/Squirls/blob/development/squirls-cli/src/examples/example.vcf \ No newline at end of file diff --git a/docs/setup.rst b/docs/setup.rst index 55522618..78604f7a 100644 --- a/docs/setup.rst +++ b/docs/setup.rst @@ -152,7 +152,8 @@ Open the file in your favorite text editor and provide the following three bits Optional parameters ~~~~~~~~~~~~~~~~~~~ -The optional parameters do not necessarily need to be touched unless you know what you're doing. +The optional parameters do not necessarily need to be touched unless you know what you're doing. We will provide the +details later. .. TODO - describe the optional parameters. diff --git a/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/Main.java b/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/Main.java index 5a484182..2bd0abd4 100644 --- a/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/Main.java +++ b/svanna-benchmark/src/main/java/org/jax/svanna/benchmark/Main.java @@ -8,14 +8,14 @@ import static picocli.CommandLine.Help.Ansi.Style.*; @CommandLine.Command(name = "svanna-benchmark.jar", - header = "Structural variant annotation", + header = "Structural variant prioritization", mixinStandardHelpOptions = true, version = Main.VERSION, usageHelpWidth = Main.WIDTH, footer = Main.FOOTER) public class Main implements Callable { - public static final String VERSION = "svanna v0.3.1-SNAPSHOT"; + public static final String VERSION = "svanna-benchmark v1.0.0-RC1-SNAPSHOT"; public static final int WIDTH = 120; diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/Main.java b/svanna-cli/src/main/java/org/jax/svanna/cli/Main.java index 76fcd49d..df7ed673 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/Main.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/Main.java @@ -11,7 +11,7 @@ import static picocli.CommandLine.Help.Ansi.Style.*; @CommandLine.Command(name = "svanna-cli.jar", - header = "Structural variant annotation", + header = "Structural variant prioritization", mixinStandardHelpOptions = true, version = Main.VERSION, usageHelpWidth = Main.WIDTH, diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java index db707e26..a033efb0 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/cmd/PrioritizeCommand.java @@ -93,7 +93,8 @@ public class PrioritizeCommand extends SvAnnaCommand { description = "Frequency threshold as a percentage [0-100] (default: ${DEFAULT-VALUE})") public float frequencyThreshold = 1.F; - @CommandLine.Option(names = {"max-length"}, + @Deprecated + @CommandLine.Option(names = {"--max-length"}, description = "Do not prioritize variants longer than this (default: ${DEFAULT-VALUE})") public int maxLength = 250_000_000; diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/tabular/TabularResultWriter.java b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/tabular/TabularResultWriter.java index 39f30b13..09e466c6 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/tabular/TabularResultWriter.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/tabular/TabularResultWriter.java @@ -47,7 +47,7 @@ public TabularResultWriter(String suffix, char columnSeparator, boolean compress public void write(AnalysisResults analysisResults, String prefix) throws IOException { try (BufferedWriter writer = openWriter(prefix)) { CSVPrinter printer = CSVFormat.DEFAULT.withDelimiter(columnSeparator) - .withHeader("CONTIG", "START", "END", "ID", "VTYPE", "FAILED_FILTERS", "PRIORITY") + .withHeader("contig", "start", "end", "id", "vtype", "failed_filters", "tadsv") .print(writer); analysisResults.variants().stream() .filter(sv -> !Double.isNaN(sv.svPriority().getPriority())) diff --git a/svanna-core/src/main/java/org/jax/svanna/core/filter/FilterType.java b/svanna-core/src/main/java/org/jax/svanna/core/filter/FilterType.java index f11e5d4b..d93649de 100644 --- a/svanna-core/src/main/java/org/jax/svanna/core/filter/FilterType.java +++ b/svanna-core/src/main/java/org/jax/svanna/core/filter/FilterType.java @@ -47,6 +47,7 @@ public enum FilterType { @Deprecated REPETITIVE_REGION_FILTER("repeats", "Repetitive region"), COVERAGE_FILTER("coverage", "Failed required coverage depth filter"), + @Deprecated MAX_LENGTH_FILTER("max-length", "Failed required maximum length filter"); private final String vcfValue; From 2e1f6e65503825642bc62107cd3e5aa53dad62b9 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Sun, 11 Jul 2021 23:00:11 -0400 Subject: [PATCH 20/40] Update HTML output template --- .../writer/html/template/HpoHtmlComponent.java | 5 ----- .../html/template/MetaDataHtmlComponent.java | 5 ----- .../cli/writer/html/template/svannHTML.ftl | 18 ++++++++++++------ 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/HpoHtmlComponent.java b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/HpoHtmlComponent.java index c6f906d4..6f727933 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/HpoHtmlComponent.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/HpoHtmlComponent.java @@ -16,11 +16,6 @@ public HpoHtmlComponent(Map originalHpoTerms) { "
\n" + originalTermsTable(originalHpoTerms) + "
\n" + - "\n" + - "
\n" + - "

svanna performs simple phenotype-base prioritization by semantic similarity analysis (Resnik)," + - " corresponding to the OSS approach in Köhler et al (2009).

\n" + "
\n"; } diff --git a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/MetaDataHtmlComponent.java b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/MetaDataHtmlComponent.java index eb377cba..031119c2 100644 --- a/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/MetaDataHtmlComponent.java +++ b/svanna-cli/src/main/java/org/jax/svanna/cli/writer/html/template/MetaDataHtmlComponent.java @@ -19,11 +19,6 @@ public MetaDataHtmlComponent(Map originalHpoTerms, AnalysisParam "
\n" + analysisHtmlComponent.getHtml() + "
\n" + - "\n" + - "
\n" + - "

svanna performs simple phenotype-base prioritization by semantic similarity analysis (Resnik)," + - " corresponding to the OSS approach in Köhler et al (2009).

\n" + "
\n"; } diff --git a/svanna-cli/src/main/resources/org/jax/svanna/cli/writer/html/template/svannHTML.ftl b/svanna-cli/src/main/resources/org/jax/svanna/cli/writer/html/template/svannHTML.ftl index 62fb6a88..8bdeda15 100644 --- a/svanna-cli/src/main/resources/org/jax/svanna/cli/writer/html/template/svannHTML.ftl +++ b/svanna-cli/src/main/resources/org/jax/svanna/cli/writer/html/template/svannHTML.ftl @@ -376,8 +376,8 @@ a[name="othergenes"] table.vartab a::first-letter { footer { - background-color: #05396b; - color: white; + background-color: #ffffff; + color: black; padding: 1rem 2rem; } @@ -473,7 +473,7 @@ a.svg:hover, a.svg:active {
- +
${analysisMetadata?no_esc}

Phenopackets file: ${phenopacket_file} @@ -511,9 +511,11 @@ a.svg:hover, a.svg:active {

- +

About

+

SvAnna performs phenotype-driven prioritization using semantic similarity (Resnik), corresponding to the + OSS approach in Köhler et al (2009).

SvAnna shows candidate SVs that affect genes associated with the top candidates.

@@ -522,7 +524,11 @@ a.svg:hover, a.svg:active {