diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 55374904..875a8a96 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -2,6 +2,15 @@ Changelog ========= +------ +v1.0.4 +------ + +- ``SvAnna`` + - update dependency versions. +- ``svanna-io`` + - Fix issue with VCF files compressed by newer compressor versions + ------ v1.0.3 ------ diff --git a/README.md b/README.md index 092e39fc..b2cb8884 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,51 @@ Efficient and accurate pathogenicity prediction for coding and regulatory struct Most users should download the latest SvAnna distribution ZIP file from the [Releases page](https://github.com/TheJacksonLaboratory/SvAnna/releases). -Please consult the Read the docs site for detailed documentation: +## Example use + +SvAnna is a standalone command-line Java application and can be run as follows: + +```shell +java -jar svanna-cli.jar -d path/to/svanna/data \ + -t HP:0008330 \ + --vcf example.vcf.gz \ + --output-format html,csv,vcf +``` + +The analysis will filter out common SVs and perform phenotype-driven prioritization of the remaining SVs. +The SVs are assigned with *"Pathogenicity of Structural variation"* (PSV) score and written into +one of several output formats, such as CSV table, a VCF file, or a detailed HTML report. + +### HTML report + +The HTML report includes a header with the analysis summary and the SVs ordered by the PSV score +with the best scores on top. + +### Analysis summary + +The summary presents the clinical features encoded into terms of Human Phenotype Ontology (HPO) as well as +the other analysis parameters. 
+ +![Analysis summary](img/analysis-summary.png) + +### Variant counts + +The report further breaks down SVs into several categories: + +![Variant counts](img/variant-counts.png) + +### Structural variants + +Last, each SV is presented in the context of the overlapping genes and transcripts: +![Variant transcript summary](img/variant-tx-summary.png) + +We also show the variant in the context of the neighboring repetitive regions and genes/transcripts: +![Variant context](img/variant-tx-context.png) + + +## Read more + +Please consult the Read the docs site for detailed documentation: - [stable version](https://svanna.readthedocs.io/en/master) describing the latest release at the *Releases page*, or - [latest version](https://svanna.readthedocs.io/en/latest) summarizing the latest development on `development` branch. diff --git a/docs/conf.py b/docs/conf.py index 2d13836b..4c8fcfd5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -56,7 +56,7 @@ # The short X.Y version. version = u'1.0' # The full version, including alpha/beta/rc tags. -release = u'1.0.3' +release = u'1.0.4' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/img/analysis-summary.png b/img/analysis-summary.png new file mode 100644 index 00000000..6f41f32b Binary files /dev/null and b/img/analysis-summary.png differ diff --git a/img/variant-counts.png b/img/variant-counts.png new file mode 100644 index 00000000..8afc1dd8 Binary files /dev/null and b/img/variant-counts.png differ diff --git a/img/variant-tx-context.png b/img/variant-tx-context.png new file mode 100644 index 00000000..04b316f4 Binary files /dev/null and b/img/variant-tx-context.png differ diff --git a/img/variant-tx-summary.png b/img/variant-tx-summary.png new file mode 100644 index 00000000..15df9668 Binary files /dev/null and b/img/variant-tx-summary.png differ diff --git a/pom.xml b/pom.xml index 8f156007..dc76016b 100644 --- a/pom.xml +++ b/pom.xml @@ -16,7 +16,7 @@ svanna-benchmark org.monarchinitiative.svanna - 1.0.3 + 1.0.4 SvAnna @@ -34,7 +34,13 @@ 2.0.0 1.0.0-RC2 + 2.0.2 0.2.5 + 3.0.5 + 1.7 + 2.8.0 + 3.8.0 + 1.24.0 1.4.200 @@ -111,7 +117,7 @@ org.phenopackets phenopacket-schema - 2.0.2 + ${phenopacket-schema.version} com.google.protobuf @@ -131,7 +137,7 @@ com.github.samtools htsjdk - 3.0.5 + ${htsjdk.version} org.tukaani @@ -143,22 +149,22 @@ org.apache.commons commons-csv - 1.7 + ${commons-csv.version} org.apache.commons commons-compress - 1.21 + ${commons-compress.version} commons-io commons-io - 2.8.0 + ${commons-io.version} commons-net commons-net - 3.8.0 + ${commons-net.version} info.picocli diff --git a/svanna-benchmark/pom.xml b/svanna-benchmark/pom.xml index b8b58f4b..a0dbff76 100644 --- a/svanna-benchmark/pom.xml +++ b/svanna-benchmark/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 diff --git a/svanna-benchmark/src/main/java/org/monarchinitiative/svanna/benchmark/cmd/benchmark_case/BenchmarkCaseCommand.java b/svanna-benchmark/src/main/java/org/monarchinitiative/svanna/benchmark/cmd/benchmark_case/BenchmarkCaseCommand.java index fea1a5b6..b080d5d9 100644 --- 
a/svanna-benchmark/src/main/java/org/monarchinitiative/svanna/benchmark/cmd/benchmark_case/BenchmarkCaseCommand.java +++ b/svanna-benchmark/src/main/java/org/monarchinitiative/svanna/benchmark/cmd/benchmark_case/BenchmarkCaseCommand.java @@ -1,6 +1,5 @@ package org.monarchinitiative.svanna.benchmark.cmd.benchmark_case; -import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVPrinter; import org.monarchinitiative.svanna.benchmark.cmd.BaseBenchmarkCommand; @@ -34,6 +33,7 @@ import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; +import java.util.zip.GZIPOutputStream; @CommandLine.Command(name = "benchmark-case", aliases = {"BC"}, @@ -191,7 +191,7 @@ private void writeOutResults(File output, BenchmarkResults results, Set // "case_name", "background_vcf", "variant_id", "rank", "vtype", "is_causal", "priority" LOGGER.info("Writing the results for `{}`", results.caseName()); - try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GzipCompressorOutputStream(new FileOutputStream(output))))) { + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(output))))) { CSVPrinter printer = CSVFormat.DEFAULT .withHeader("case_name", "background_vcf", "variant_id", "rank", "vtype", "is_causal", "priority") .print(writer); diff --git a/svanna-cli/pom.xml b/svanna-cli/pom.xml index a48d43ae..ccdc2702 100644 --- a/svanna-cli/pom.xml +++ b/svanna-cli/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 svanna-cli diff --git a/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/Main.java b/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/Main.java index e16da4d3..4fed0ab5 100644 --- a/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/Main.java +++ b/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/Main.java @@ -17,7 
+17,7 @@ footer = Main.FOOTER) public class Main implements Callable { - public static final String VERSION = "svanna-cli v1.0.3"; + public static final String VERSION = "svanna-cli v1.0.4"; public static final int WIDTH = 120; diff --git a/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/writer/tabular/TabularResultWriter.java b/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/writer/tabular/TabularResultWriter.java index 0a620cb6..c7db9f1c 100644 --- a/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/writer/tabular/TabularResultWriter.java +++ b/svanna-cli/src/main/java/org/monarchinitiative/svanna/cli/writer/tabular/TabularResultWriter.java @@ -1,6 +1,5 @@ package org.monarchinitiative.svanna.cli.writer.tabular; -import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVPrinter; import org.monarchinitiative.svanna.cli.writer.AnalysisResults; @@ -27,6 +26,7 @@ import java.util.LinkedList; import java.util.List; import java.util.function.Consumer; +import java.util.zip.GZIPOutputStream; public class TabularResultWriter implements ResultWriter { @@ -63,7 +63,7 @@ private BufferedWriter openWriter(Path output, String prefix) throws IOException Path outPath = output.resolve(prefix + suffix + (compress ? ".gz" : "")); LogUtils.logInfo(LOGGER, "Writing tabular results into {}", outPath.toAbsolutePath()); return compress - ? new BufferedWriter(new OutputStreamWriter(new GzipCompressorOutputStream(new FileOutputStream(outPath.toFile())))) + ? 
new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(outPath.toFile())))) : Files.newBufferedWriter(outPath); } diff --git a/svanna-configuration/pom.xml b/svanna-configuration/pom.xml index 1dc947de..9d311188 100644 --- a/svanna-configuration/pom.xml +++ b/svanna-configuration/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 diff --git a/svanna-core/pom.xml b/svanna-core/pom.xml index eb949207..3ba7de55 100644 --- a/svanna-core/pom.xml +++ b/svanna-core/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 diff --git a/svanna-db/pom.xml b/svanna-db/pom.xml index 52878476..2becf335 100644 --- a/svanna-db/pom.xml +++ b/svanna-db/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 diff --git a/svanna-ingest/pom.xml b/svanna-ingest/pom.xml index 8933ac32..15635041 100644 --- a/svanna-ingest/pom.xml +++ b/svanna-ingest/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 svanna-ingest diff --git a/svanna-ingest/src/main/java/org/monarchinitiative/svanna/ingest/cmd/BuildDb.java b/svanna-ingest/src/main/java/org/monarchinitiative/svanna/ingest/cmd/BuildDb.java index 7df25179..eeb86f7f 100644 --- a/svanna-ingest/src/main/java/org/monarchinitiative/svanna/ingest/cmd/BuildDb.java +++ b/svanna-ingest/src/main/java/org/monarchinitiative/svanna/ingest/cmd/BuildDb.java @@ -7,8 +7,6 @@ import org.apache.commons.codec.digest.MessageDigestAlgorithms; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; -import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; -import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; @@ -87,6 +85,8 @@ import java.util.regex.Pattern; import 
java.util.stream.Collectors; import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; @CommandLine.Command(name = "build-db", aliases = "B", @@ -369,7 +369,7 @@ private static List downloadAndPreprocessGenes(GeneProper GeneParser jsonParser = parserFactory.forFormat(SerializationFormat.JSON); Path destination = buildDir.resolve("gencode.v38.genes.json.gz"); LOGGER.info("Serializing the genes to {}", destination.toAbsolutePath()); - try (OutputStream os = new BufferedOutputStream(new GzipCompressorOutputStream(Files.newOutputStream(destination)))) { + try (OutputStream os = new BufferedOutputStream(new GZIPOutputStream(Files.newOutputStream(destination)))) { jsonParser.write(genes, os); } @@ -563,7 +563,7 @@ private static Map parseNcbiToHgncTable(String ncbiGeneToHgnc) private static BufferedReader openForReading(Path tablePath) throws IOException { return (tablePath.toFile().getName().endsWith(".gz")) - ? new BufferedReader(new InputStreamReader(new GzipCompressorInputStream(Files.newInputStream(tablePath)))) + ? 
new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(tablePath)))) : Files.newBufferedReader(tablePath); } diff --git a/svanna-io/pom.xml b/svanna-io/pom.xml index d103020c..d2698911 100644 --- a/svanna-io/pom.xml +++ b/svanna-io/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 diff --git a/svanna-io/src/main/java/org/monarchinitiative/svanna/io/parse/VcfVariantParser.java b/svanna-io/src/main/java/org/monarchinitiative/svanna/io/parse/VcfVariantParser.java index e0637599..deb03cec 100644 --- a/svanna-io/src/main/java/org/monarchinitiative/svanna/io/parse/VcfVariantParser.java +++ b/svanna-io/src/main/java/org/monarchinitiative/svanna/io/parse/VcfVariantParser.java @@ -7,7 +7,6 @@ import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderVersion; -import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.monarchinitiative.svanna.core.LogUtils; import org.monarchinitiative.svanna.core.filter.FilterResult; import org.monarchinitiative.svanna.core.filter.FilterType; @@ -23,15 +22,16 @@ import org.slf4j.LoggerFactory; import java.io.BufferedReader; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; import java.util.Optional; import java.util.function.Function; import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; /** * Parse variants stored in a VCF file. The parser is NOT thread safe! @@ -97,20 +97,39 @@ public Stream createVariantAlleles(Path filePath) throws IOEx VCFCodec codec = new VCFCodec(); codec.setVCFHeader(header, header.getVCFHeaderVersion() == null ? 
VCFHeaderVersion.VCF4_1 : header.getVCFHeaderVersion()); - BufferedReader reader; - if (filePath.toFile().getName().endsWith(".gz")) - reader = new BufferedReader(new InputStreamReader(new GzipCompressorInputStream(new FileInputStream(filePath.toFile())))); - else - reader = Files.newBufferedReader(filePath); + BufferedReader reader = openFileForReading(filePath); return reader.lines() - .onClose(() -> {try {reader.close();} catch (IOException ignored) {}}) + .onClose(closeReader(reader)) .map(toVariantContext(codec)) .flatMap(Optional::stream) .map(toVariants()) .flatMap(Optional::stream); } + private static BufferedReader openFileForReading(Path filePath) throws IOException { + BufferedReader reader; + if (filePath.toFile().getName().endsWith(".gz")) + reader = new BufferedReader( + new InputStreamReader( + new GZIPInputStream(Files.newInputStream(filePath)), + StandardCharsets.UTF_8)); + else + reader = Files.newBufferedReader(filePath, StandardCharsets.UTF_8); + return reader; + } + + private static Runnable closeReader(BufferedReader reader) { + return () -> { + try { + LOGGER.trace("Closing VCF file"); + reader.close(); + } catch (IOException e) { + LOGGER.warn("Error while closing the VCF file", e); + } + }; + } + /** * One variant context might represent multiple sequence variants or a single symbolic variant/breakend. * This function melts the variant context to a collection of variants. 
diff --git a/svanna-io/src/main/java/org/monarchinitiative/svanna/io/service/SilentGenesGeneService.java b/svanna-io/src/main/java/org/monarchinitiative/svanna/io/service/SilentGenesGeneService.java index 58fd046a..d21a3d2b 100644 --- a/svanna-io/src/main/java/org/monarchinitiative/svanna/io/service/SilentGenesGeneService.java +++ b/svanna-io/src/main/java/org/monarchinitiative/svanna/io/service/SilentGenesGeneService.java @@ -1,6 +1,5 @@ package org.monarchinitiative.svanna.io.service; -import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.monarchinitiative.svanna.core.service.GeneService; import org.monarchinitiative.svanna.core.service.QueryResult; import org.monarchinitiative.svanna.io.service.jannovar.IntervalArray; @@ -23,6 +22,7 @@ import java.nio.file.Path; import java.util.*; import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; public class SilentGenesGeneService implements GeneService { @@ -64,7 +64,7 @@ public static SilentGenesGeneService of(GenomicAssembly assembly, Path silentGen private static InputStream openForReading(Path silentGenesJsonPath) throws IOException { if (silentGenesJsonPath.toFile().getName().endsWith(".gz")) { LOGGER.debug("Assuming the file is gzipped"); - return new BufferedInputStream(new GzipCompressorInputStream(Files.newInputStream(silentGenesJsonPath))); + return new BufferedInputStream(new GZIPInputStream(Files.newInputStream(silentGenesJsonPath))); } else { return new BufferedInputStream(Files.newInputStream(silentGenesJsonPath)); } diff --git a/svanna-io/src/test/java/org/monarchinitiative/svanna/io/parse/VcfVariantParserTest.java b/svanna-io/src/test/java/org/monarchinitiative/svanna/io/parse/VcfVariantParserTest.java index e95dda80..fc8297d6 100644 --- a/svanna-io/src/test/java/org/monarchinitiative/svanna/io/parse/VcfVariantParserTest.java +++ b/svanna-io/src/test/java/org/monarchinitiative/svanna/io/parse/VcfVariantParserTest.java @@ -4,15 +4,12 @@ import 
htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeaderVersion; +import org.junit.jupiter.api.*; import org.monarchinitiative.svanna.core.reference.SvannaVariant; import org.monarchinitiative.svanna.core.reference.VariantAware; import org.monarchinitiative.svanna.core.reference.Zygosity; import org.monarchinitiative.svanna.io.FullSvannaVariant; import org.monarchinitiative.svanna.io.TestDataConfig; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.DisplayName; -import org.junit.jupiter.api.Nested; -import org.junit.jupiter.api.Test; import org.monarchinitiative.svart.*; import org.monarchinitiative.svart.assembly.GenomicAssembly; import org.monarchinitiative.svart.assembly.GenomicAssemblies; @@ -31,7 +28,8 @@ @SpringBootTest(classes = TestDataConfig.class) public class VcfVariantParserTest { - private static final Path SV_EXAMPLE_PATH = Paths.get("src/test/resources/org/monarchinitiative/svanna/io/parse/sv_example.vcf"); + private static final Path TEST_VCF_DIR = Paths.get("src/test/resources/org/monarchinitiative/svanna/io/parse"); + private static final Path SV_EXAMPLE_PATH = TEST_VCF_DIR.resolve("sv_example.vcf"); private static final VCFCodec VCF_CODEC = new VCFCodec(); @BeforeAll @@ -426,6 +424,41 @@ public void toVariants_breakendVariant() { } } + /** + * Per issue 235, + * HTSlib >=1.17 produces a gzipped file that cannot be read by commons-compress's `GzipCompressorInputStream`. + * As a fix, the class was replaced by JRE's {@link java.util.zip.GZIPInputStream}. + *

+ * Here we test that both older and newer VCFs can be correctly read by SvAnna's code. + */ + @Nested + public class GzipQuirkTests { + + private final GenomicAssembly GRCh38p13 = GenomicAssemblies.GRCh38p13(); + private VcfVariantParser instance; + + @BeforeEach + public void setUp() { + instance = new VcfVariantParser(GRCh38p13); + } + + @Test + public void loadHtslibLeq16() throws Exception { + Path input = TEST_VCF_DIR.resolve("htslib_16.vcf.gz"); + List alleles = instance.createVariantAlleleList(input); + + assertThat(alleles, hasSize(8)); + } + + @Test + public void loadHtslibGeq17() throws Exception { + Path input = TEST_VCF_DIR.resolve("htslib_17.vcf.gz"); + List alleles = instance.createVariantAlleleList(input); + + assertThat(alleles, hasSize(8)); + } + } + private static GenomicAssembly testAssembly(List contigs) { return GenomicAssembly.of("toy", "Wookie", "9999", "Han Solo", "2100-01-01", "GB1", "RS1", contigs); diff --git a/svanna-io/src/test/resources/org/monarchinitiative/svanna/io/parse/htslib_16.vcf.gz b/svanna-io/src/test/resources/org/monarchinitiative/svanna/io/parse/htslib_16.vcf.gz new file mode 100644 index 00000000..07c50b3f Binary files /dev/null and b/svanna-io/src/test/resources/org/monarchinitiative/svanna/io/parse/htslib_16.vcf.gz differ diff --git a/svanna-io/src/test/resources/org/monarchinitiative/svanna/io/parse/htslib_17.vcf.gz b/svanna-io/src/test/resources/org/monarchinitiative/svanna/io/parse/htslib_17.vcf.gz new file mode 100644 index 00000000..5da69e82 Binary files /dev/null and b/svanna-io/src/test/resources/org/monarchinitiative/svanna/io/parse/htslib_17.vcf.gz differ diff --git a/svanna-model/pom.xml b/svanna-model/pom.xml index 87eca9af..9698f940 100644 --- a/svanna-model/pom.xml +++ b/svanna-model/pom.xml @@ -3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0 diff --git a/svanna-test/pom.xml b/svanna-test/pom.xml index c5d40d9c..519ac549 100644 --- a/svanna-test/pom.xml +++ b/svanna-test/pom.xml @@ 
-3,7 +3,7 @@ SvAnna org.monarchinitiative.svanna - 1.0.3 + 1.0.4 4.0.0