Skip to content

Commit

Permalink
Code review comments.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Jun 3, 2024
1 parent 19be0c7 commit 669fd7e
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
* {@link #fetchReferenceBases(int)} or {@link #fetchReferenceBasesByRegion(int, int, int)}. It caches the bases
* from the previous request, along with metadata about the (0-based) start offset, and length of the
* cached bases.
*
* NOTE: this class is not thread-safe/safe for concurrent access across threads.
*/
public class CRAMReferenceRegion {
private static final Log log = Log.getInstance(CRAMReferenceRegion.class);
Expand Down Expand Up @@ -113,11 +115,7 @@ public void fetchReferenceBases(final int referenceIndex) {

// Re-resolve the reference bases if we don't have a current region or if the region we have
// doesn't span the *entire* contig requested.
final SAMSequenceRecord newSequenceRecord = sequenceDictionary.getSequence(referenceIndex);
if (newSequenceRecord == null) {
throw new IllegalArgumentException(
String.format("Requested reference sequence index %d not found", referenceIndex));
}
final SAMSequenceRecord newSequenceRecord = getSAMSequenceRecord(referenceIndex);
if ((referenceIndex != this.referenceIndex) ||
regionStart != 0 ||
(regionLength != newSequenceRecord.getSequenceLength())) {
Expand All @@ -128,7 +126,7 @@ public void fetchReferenceBases(final int referenceIndex) {
String.format("A reference must be supplied (reference sequence %s not found).", sequenceRecord));
}
regionStart = 0;
regionLength = sequenceRecord.getSequenceLength();
regionLength = newSequenceRecord.getSequenceLength();
}
}

Expand Down
27 changes: 26 additions & 1 deletion src/test/java/htsjdk/samtools/CRAMAllEncodingStrategiesTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,23 @@
import htsjdk.samtools.util.Tuple;
import htsjdk.utils.SamtoolsTestUtils;
import org.testng.Assert;
import org.testng.SkipException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.*;
import java.util.*;

/**
* Test roundtripping files through the GATK writer using both the default HTSJDK encoding strategy, plus a variety
* of alternative encoding strategies, in order to stress test the writer implementation. Compares the results with
* the original file, and then roundtrips the newly written CRAM through the samtools writer, validating that samtools
* can consume the HTSJDK-written files with the expected level of roundtrip fidelity (CRAMs don't always roundtrip
* with complete bit-level fidelity, i.e, samtools will resurrect NM/MD tags whether they were present in the original
* file or not unless they are specifically excluded, etc.). So in some case, you can't use full SAMRecord comparisons,
* in which case we fall back to lenient equality and restrict the comparison to read names, bases, alignment start/stop,
* and quality scores.
*/
public class CRAMAllEncodingStrategiesTest extends HtsjdkTest {

private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools/cram");
Expand All @@ -24,22 +35,29 @@ public class CRAMAllEncodingStrategiesTest extends HtsjdkTest {
@DataProvider(name="defaultStrategyRoundTripTestFiles")
public Object[][] defaultStrategyRoundTripTestFiles() {
return new Object[][] {
// a test file with artificially small slices and containers to force multiple slices and containers
{ new File(TEST_DATA_DIR, "NA12878.20.21.1-100.100-SeqsPerSlice.500-unMapped.cram"),
new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"),
false, false },
// the same file without the artificially small container constraints
{ new File(TEST_DATA_DIR, "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.cram"),
new File("src/test/resources/htsjdk/samtools/reference/human_g1k_v37.20.21.fasta.gz"),
false, false },
// a test file with only unmapped reads
{ new File(TEST_DATA_DIR, "NA12878.unmapped.cram"),
new File(TEST_DATA_DIR, "human_g1k_v37.20.21.1-100.fasta"),
false, false },
// generated with samtools 1.19 from the gatk bam file CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam
{ new File(TEST_DATA_DIR, "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram"),
new File("src/test/resources/htsjdk/samtools/reference/human_g1k_v37.20.21.fasta.gz"),
true, false },
// these tests use lenient equality to only validate read names, bases and qual scores

// these tests use lenient equality to only validate read names, bases, alignment start/stop, and qual scores

// a user-contributed file with reads aligned only to the mito contig that has been rewritten (long ago) with GATK
{ new File(TEST_DATA_DIR, "mitoAlignmentStartTestGATKGen.cram"),
new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false },
// the original user-contributed file with reads aligned only to the mito contig
{ new File(TEST_DATA_DIR, "mitoAlignmentStartTest.cram"),
new File(TEST_DATA_DIR, "mitoAlignmentStartTest.fa"), true, false },
// files created by rewriting the htsjdk test file src/test/resources/htsjdk/samtools/cram/mitoAlignmentStartTest.cram
Expand All @@ -59,11 +77,13 @@ public final void testRoundTripDefaultEncodingStrategy(
final File referenceFile,
final boolean lenientEquality,
final boolean emitDetail) throws IOException {
// test the default encoding strategy
final CRAMEncodingStrategy testStrategy = new CRAMEncodingStrategy();
final File tempOutCRAM = File.createTempFile("testRoundTrip", ".cram");
tempOutCRAM.deleteOnExit();
CRAMTestUtils.writeToCRAMWithEncodingStrategy(testStrategy, sourceFile, tempOutCRAM, referenceFile);
assertRoundTripFidelity(sourceFile, tempOutCRAM, referenceFile, lenientEquality, emitDetail);
// test interop with samtools using this encoding
assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile, lenientEquality, emitDetail);
}

Expand All @@ -86,6 +106,7 @@ public final void testAllEncodingStrategyCombinations(
tempOutCRAM.deleteOnExit();
CRAMTestUtils.writeToCRAMWithEncodingStrategy(testStrategy.b, cramSourceFile, tempOutCRAM, referenceFile);
assertRoundTripFidelity(cramSourceFile, tempOutCRAM, referenceFile, lenientEquality, emitDetail);
// test interop with samtools using this encoding
assertRoundtripFidelityWithSamtools(tempOutCRAM, referenceFile, lenientEquality, emitDetail);
}
}
Expand Down Expand Up @@ -136,6 +157,8 @@ public void assertRoundTripFidelity(
final SAMRecord sourceRec = sourceIterator.next();
final SAMRecord targetRec = targetIterator.next();
Assert.assertEquals(targetRec.getReadName(), sourceRec.getReadName());
Assert.assertEquals(targetRec.getAlignmentStart(), sourceRec.getAlignmentStart());
Assert.assertEquals(targetRec.getAlignmentEnd(), sourceRec.getAlignmentEnd());
Assert.assertEquals(targetRec.getReadBases(), sourceRec.getReadBases());
Assert.assertEquals(targetRec.getBaseQualities(), sourceRec.getBaseQualities());
} else if (emitDetail) {
Expand Down Expand Up @@ -166,6 +189,8 @@ private void assertRoundtripFidelityWithSamtools(
referenceFile,
"--input-fmt-option decode_md=0 --output-fmt-option store_md=0 --output-fmt-option store_nm=0");
assertRoundTripFidelity(sourceCRAM, samtoolsOutFile, referenceFile, lenientEquality, emitDetail);
} else {
throw new SkipException("samtools is not installed, skipping test");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,11 @@ public void testGetReferenceBasesByRegionPositive(
Assert.assertEquals(bases, Arrays.copyOfRange(fullContigBases, requestedOffset, requestedOffset + requestedLength));
}

// simulate the state transitions that occur when writing a CRAM file
// Simulate the state transitions that occur when writing a CRAM file; specifically, use transitions that mirror
// the ones that occur when writing a CRAM with the conditions that affect (and that fail without the fix to)
// https://github.com/broadinstitute/gatk/issues/8768, i.e., a container with one or more containers with reads
// aligned to position 1 of a given contig, followed by one or more containers with reads aligned past position 1
// of the same contig.
@Test
public void testSerialStateTransitions() {
// start with an entire reference sequence
Expand All @@ -183,8 +187,9 @@ public void testSerialStateTransitions() {
cramReferenceRegion.fetchReferenceBasesByRegion(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO, 0, SHORT_FRAGMENT_LENGTH);
Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH);

// now transition back to the full sequence
// now transition back to the full sequence; this is where the bug previously would have occurred
cramReferenceRegion.fetchReferenceBases(CRAMStructureTestHelper.REFERENCE_SEQUENCE_ZERO);
// this assert would fail without the fix
Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength);

// transition to a shorter region fragment length using fetchReferenceBasesByRegion(AlignmentContext), then back to the full region
Expand Down

0 comments on commit 669fd7e

Please sign in to comment.