diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java index e6ab3ad9a..c8c28e4fa 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/VariantContextConverter.java @@ -27,7 +27,10 @@ import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.tools.commons.Converter; +import org.opencb.biodata.tools.variant.converters.avro.VariantAvroToVariantContextConverter; import org.opencb.commons.datastore.core.ObjectMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.text.DecimalFormat; import java.util.*; @@ -38,6 +41,7 @@ * Created by jtarraga on 07/02/17. */ public abstract class VariantContextConverter implements Converter { + private final Logger logger = LoggerFactory.getLogger(VariantAvroToVariantContextConverter.class); public static final DecimalFormat DECIMAL_FORMAT_7 = new DecimalFormat("#.#######"); public static final DecimalFormat DECIMAL_FORMAT_3 = new DecimalFormat("#.###"); @@ -246,7 +250,7 @@ protected double getQuality(List> fileAttributes) { } } catch (NumberFormatException e) { // Nothing to do - e.getMessage(); + logger.warn("Invalid QUAL value found: " + attrs.get(StudyEntry.QUAL)); } } } @@ -254,9 +258,10 @@ protected double getQuality(List> fileAttributes) { return (qual == Double.MAX_VALUE ? VariantContext.NO_LOG10_PERROR : (-0.1 * qual)); } - protected List getGenotypes(List alleleList, List sampleDataKeys, BiFunction getSampleData) { + protected List getGenotypes(List alleleList, List sampleDataKeys, BiFunction getSampleData, Set duplicatedAlleles) { String refAllele = alleleList.get(0); Set noCallAlleles = getNoCallAlleleIdx(alleleList); + Map finalAlleleMap = getDedupAlleleMap(alleleList); List genotypes = new ArrayList<>(); if (this.sampleNames != null) { @@ -289,6 +294,14 @@ protected List getGenotypes(List alleleList, List samp case "AD": if (StringUtils.isNotEmpty(value) && !value.equals(".")) { int[] ad = getInts(value); + if (!duplicatedAlleles.isEmpty() && ad.length == alleleList.size()) { + int[] finalAD = new int[finalAlleleMap.size()]; + for (int i = 0; i < ad.length; i++) { + finalAD[finalAlleleMap.get(alleleList.get(i))] += ad[i]; + } + genotypeBuilder.AD(finalAD); + ad = finalAD; + } genotypeBuilder.AD(ad); } else { genotypeBuilder.noAD(); @@ -341,7 +354,7 @@ private int[] getInts(String value) { return ints; } - protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List alleleList, boolean isNoVariation, Set filters, double qual, ObjectMap attributes, List genotypes) { + protected VariantContext makeVariantContext(String chromosome, int start, int end, String idForVcf, List alleleList, boolean isNoVariation, Set filters, double qual, ObjectMap attributes, List genotypes, Set duplicatedAlleles) { String refAllele = alleleList.get(0); VariantContextBuilder variantContextBuilder = new VariantContextBuilder() .chr(chromosome) @@ -355,7 +368,12 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en if (isNoVariation && alleleList.get(1).isEmpty()) { variantContextBuilder.alleles(refAllele); } else { - variantContextBuilder.alleles(alleleList.stream().filter(a -> !a.equals(NO_CALL_ALLELE)).collect(Collectors.toList())); + List finalAlleles = alleleList.stream() + .filter(a -> !a.equals(NO_CALL_ALLELE)) + .collect(Collectors.toList()); + // Remove first occurrence of duplicated allele + duplicatedAlleles.forEach(finalAlleles::remove); + variantContextBuilder.alleles(finalAlleles); } if (genotypes.isEmpty()) { @@ -372,6 +390,34 @@ protected VariantContext makeVariantContext(String chromosome, int start, int en return variantContextBuilder.make(); } + protected Map getDedupAlleleMap(List alleleList) { + Map finalAlleleIdxMap = new HashMap<>(); + for (String allele : alleleList) { + // Assign an index to each unique allele + finalAlleleIdxMap.putIfAbsent(allele, finalAlleleIdxMap.size()); + } + return finalAlleleIdxMap; + } + + protected Set getDuplicatedAlleles(String chromosome, int start, List alleleList) { + Set duplicatedAlleles; + if (alleleList.size() > 2 && new HashSet<>(alleleList).size() != alleleList.size()) { + Set allelesSet = new HashSet<>(); + + duplicatedAlleles = new HashSet<>(); + for (String allele : alleleList) { + if (!allelesSet.add(allele)) { + duplicatedAlleles.add(allele); + } + } + logger.warn("Duplicated alleles found in variant " + chromosome + ":" + start + " : Denormalized alleles" + alleleList + + " , duplicated alleles: " + duplicatedAlleles); + } else { + duplicatedAlleles = Collections.emptySet(); + } + return duplicatedAlleles; + } + protected abstract Object getStudy(T variant); protected abstract Iterator getStudiesId(T variant); diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java index 78fde6101..74c24b072 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/avro/VariantAvroToVariantContextConverter.java @@ -63,6 +63,7 @@ public VariantContext convert(Variant variant) { int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); List alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles); + Set duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList); boolean isNoVariation = type.equals(VariantType.NO_VARIATION); // ID @@ -102,9 +103,9 @@ public VariantContext convert(Variant variant) { } // SAMPLES - List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData); + List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeys(), getSampleData, duplicatedAlleles); - return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes); + return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles); } /** diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java index 7904f8e23..61d862452 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/converters/proto/VariantProtoToVariantContextConverter.java @@ -60,6 +60,7 @@ public VariantContext convert(VariantProto.Variant variant) { int start = adjustedStartEndPositions.getLeft(); int end = adjustedStartEndPositions.getRight(); List alleleList = buildAlleles(variant, adjustedStartEndPositions, referenceAlleles); + Set duplicatedAlleles = getDuplicatedAlleles(chromosome, start, alleleList); boolean isNoVariation = type.equals(VariantProto.VariantType.NO_VARIATION); // ID @@ -96,9 +97,9 @@ public VariantContext convert(VariantProto.Variant variant) { // SAMPLES BiFunction getSampleData = (sampleName, id) -> getSampleData(studyEntry, formatPositions, sampleName, id); - List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData); + List genotypes = getGenotypes(alleleList, studyEntry.getSampleDataKeysList(), getSampleData, duplicatedAlleles); - return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes); + return makeVariantContext(chromosome, start, end, idForVcf, alleleList, isNoVariation, filters, qual, attributes, genotypes, duplicatedAlleles); } public String getSampleData(VariantProto.StudyEntry studyEntry, Map formatPositions, String sampleName, String field) { diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java index 942090f10..e54c5141c 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/converters/VariantContextConverterTest.java @@ -1,23 +1,27 @@ package org.opencb.biodata.tools.variant.converters; -import org.apache.commons.lang3.StringUtils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; import org.apache.commons.lang3.tuple.Pair; import org.junit.Test; +import org.opencb.biodata.formats.variant.vcf4.VcfUtils; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.AlternateCoordinate; -import org.opencb.biodata.models.variant.avro.FileEntry; import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.biodata.tools.variant.converters.avro.VariantAvroToVariantContextConverter; +import org.opencb.biodata.tools.variant.merge.VariantMerger; +import java.io.ByteArrayOutputStream; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.*; /** * Created on 29/11/17. @@ -26,6 +30,51 @@ */ public class VariantContextConverterTest { + @Test + public void testDuplicatedAllele() throws NonStandardCompliantSampleField { + String studyId = "s"; + Variant variant = Variant.newBuilder("1", 1000, null, "AGTATATTGT", "A") + .setStudyId(studyId) + .setSampleDataKeys("GT", "AD") + .addSample("s1", "1/1", "10,10") + .addSample("s2", "0/1", "0,10") + .build(); + Variant variant2 = Variant.newBuilder("1", 1002, null, "TATATTGTGT", "TT,T") + .setStudyId(studyId) + .setSampleDataKeys("GT", "AD") + .addSample("s3", "0/2", "1,1,10") + .addSample("s4", "1/1", "1,10,1") + .build(); + + + Variant normalized = new VariantNormalizer().normalize(Collections.singletonList(variant), false).get(0); + Variant normalized2 = new VariantNormalizer().normalize(Collections.singletonList(variant2), false).get(0); + + Variant merged = new VariantMerger().merge(normalized, normalized2); + +// System.out.println("merged = " + merged.toJson()); + + // Convert to VariantContext + List sampleNames = merged.getSampleNames(studyId); + VariantAvroToVariantContextConverter converter = new VariantAvroToVariantContextConverter(studyId, sampleNames, Collections.emptyList()); + VariantContext context = converter.convert(merged); + +// System.out.println("context = " + context); + + // Print as VCF + ByteArrayOutputStream os = new ByteArrayOutputStream(); + VariantContextWriter writer = VcfUtils.createVariantContextWriter(os, null, Options.ALLOW_MISSING_FIELDS_IN_HEADER); + writer.setHeader(new VCFHeader(Collections.emptySet(), sampleNames)); + writer.add(context); + writer.close(); + System.out.println(os); + assertArrayEquals(new String[]{"1", "1000", ".", "AGTATATTGTG", "AG,AGT", ".", ".", ".", "GT:AD", + "1/1:10,10,0", + "0/1:0,10,0", + "0/1:1,10,1", + "2/2:1,1,10"}, os.toString().trim().split("\t")); + } + @Test public void adjustedVariantStart() throws Exception { testBuildAllele("1:824337:TGC:TC,TG");