Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wire up CRAM 3.1 codecs for reading. #1736

Draft
wants to merge 36 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
306459c
Implementation of CRAM 3.1 codecs.
Jan 14, 2022
dd9b17e
Many FQZComp fixes, with roundtrip tests working.
cmnbroad Nov 13, 2024
9534f3d
Name tokenization codec fixes.
cmnbroad Nov 13, 2024
36d8622
Interop test data from htslib.
cmnbroad Nov 13, 2024
0f7c991
Remove unnecessary mac files.
cmnbroad Nov 25, 2024
ce7430f
Use shared rans decoder in interop tests.
cmnbroad Nov 25, 2024
ca64055
Update useArith type in name tokenizer, remove unecessary object crea…
cmnbroad Nov 26, 2024
56b8322
Remove sketchy exception suppression, make test encoding params type-…
cmnbroad Nov 26, 2024
d354ab2
Code cleanup.
cmnbroad Dec 5, 2024
f7a4ee6
Store token stream in arrays instead of lists.
cmnbroad Dec 5, 2024
ea0b8b2
More naming, removal of unnecessary code, switch sanitization.
cmnbroad Dec 9, 2024
8559902
Temp update.
cmnbroad Dec 9, 2024
561595d
Precompile regular expression patterns, optimize some string operatio…
cmnbroad Dec 10, 2024
7777e24
Checkpoint 1.
cmnbroad Dec 12, 2024
fdc2153
Checkpoint 2.
cmnbroad Dec 12, 2024
d7f2095
Repair haphazard stream management.
cmnbroad Dec 15, 2024
d39ddc0
Consolidate and optimize, remove unecessary code.
cmnbroad Dec 16, 2024
70665d1
Fix spotbugs issue.
cmnbroad Dec 17, 2024
22e0dfa
Remove unnecessary code.
cmnbroad Jan 6, 2025
2f3ce74
Fix sketchy byte conversion to use UTF-8 charset for names.
cmnbroad Jan 6, 2025
a8a7e55
Remove obsolete comment.
cmnbroad Jan 6, 2025
a71f9a9
Upgrade to samtools 1.21.
cmnbroad Jan 6, 2025
bdf9338
Remove redundant/duplicate tests.
cmnbroad Jan 6, 2025
bad9ece
Eliminate intermediate String representation for decoded names.
cmnbroad Jan 7, 2025
91c1cce
Standardize input/output read name buffer separator.
cmnbroad Jan 7, 2025
2b4ffd8
Update name separator handling.
cmnbroad Jan 7, 2025
b543c57
Comment update.
cmnbroad Jan 7, 2025
7a50f40
Remove old interop data.
cmnbroad Jan 14, 2025
71e7145
Add updated interop test files.
cmnbroad Jan 14, 2025
a37b995
Remove .DS_Store files.
cmnbroad Jan 14, 2025
282322a
Conform to updated interop test structure.
cmnbroad Jan 14, 2025
b9ff1a9
Code review comments.
cmnbroad Jan 28, 2025
b65918a
Wire up CRAM 3.1 codecs for reading.
cmnbroad Jan 22, 2025
d9de160
Fixes for issues exposed by CI tests.
cmnbroad Jan 22, 2025
c6c7ce5
Remove old code.
cmnbroad Jan 28, 2025
f52f2fb
Add 3.1 sanmtools interop tests.
cmnbroad Jan 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ htsjdk.iws
atlassian-ide-plugin.xml
/htsjdk.version.properties
/test-output/
.DS_Store

#intellij
.idea/
Expand Down
2 changes: 1 addition & 1 deletion scripts/install-samtools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ sudo apt-get upgrade
sudo apt-get install -y libncurses-dev libbz2-dev liblzma-dev

#install from the github tar
export SAMTOOLS_VERSION=1.19.1
export SAMTOOLS_VERSION=1.21
wget https://github.com/samtools/samtools/releases/download/${SAMTOOLS_VERSION}/samtools-${SAMTOOLS_VERSION}.tar.bz2
tar -xjvf samtools-${SAMTOOLS_VERSION}.tar.bz2
cd samtools-${SAMTOOLS_VERSION} && ./configure --prefix=/usr && make && sudo make install
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package htsjdk.beta.codecs.reads.cram.cramV3_1;

import htsjdk.beta.codecs.reads.cram.CRAMCodec;
import htsjdk.beta.codecs.reads.cram.CRAMDecoder;
import htsjdk.beta.codecs.reads.cram.CRAMEncoder;
import htsjdk.beta.exception.HtsjdkIOException;
import htsjdk.beta.io.bundle.Bundle;
import htsjdk.beta.io.bundle.SignatureStream;
import htsjdk.beta.plugin.HtsVersion;
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;
import htsjdk.beta.plugin.reads.ReadsEncoderOptions;
import htsjdk.samtools.cram.structure.CramHeader;

import java.io.IOException;
import java.util.Arrays;

/**
* CRAM v3.1 codec
*/
public class CRAMCodecV3_1 extends CRAMCodec {
public static final HtsVersion VERSION_3_1 = new HtsVersion(3, 0, 1);
private static final String CRAM_MAGIC_3_1 = new String(CramHeader.MAGIC) + "\3\1";

@Override
public HtsVersion getVersion() {
return VERSION_3_1;
}

@Override
public int getSignatureLength() {
return CRAM_MAGIC_3_1.length();
}

@Override
public boolean canDecodeSignature(final SignatureStream signatureStream, final String sourceName) {
try {
final byte[] signatureBytes = new byte[getSignatureLength()];
final int numRead = signatureStream.read(signatureBytes);
if (numRead < getSignatureLength()) {
throw new HtsjdkIOException(String.format("Failure reading content from stream for %s", sourceName));
}
return Arrays.equals(signatureBytes, getSignatureString().getBytes());
} catch (IOException e) {
throw new HtsjdkIOException(String.format("Failure reading content from stream for %s", sourceName));
}
}

@Override
public CRAMDecoder getDecoder(final Bundle inputBundle, final ReadsDecoderOptions readsDecoderOptions) {
return new CRAMDecoderV3_1(inputBundle, readsDecoderOptions);
}

@Override
public CRAMEncoder getEncoder(final Bundle outputBundle, final ReadsEncoderOptions readsEncoderOptions) {
return new CRAMEncoderV3_1(outputBundle, readsEncoderOptions);
}

@Override
protected String getSignatureString() { return CRAM_MAGIC_3_1; }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package htsjdk.beta.codecs.reads.cram.cramV3_1;

import htsjdk.beta.codecs.reads.cram.CRAMDecoder;
import htsjdk.beta.io.bundle.Bundle;
import htsjdk.beta.io.bundle.BundleResourceType;
import htsjdk.beta.plugin.HtsVersion;
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;

/**
* CRAM v3.1 decoder.
*/
public class CRAMDecoderV3_1 extends CRAMDecoder {

/**
* Create a new CRAM V3.1 decoder. The primary resource in the input
* bundle must have content type {@link BundleResourceType#CT_ALIGNED_READS} (to find a decoder for a bundle,
* see {@link htsjdk.beta.plugin.registry.ReadsResolver}).
*
* @param bundle input {@link Bundle} to decode
* @param readsDecoderOptions {@link ReadsDecoderOptions} to use
*/
public CRAMDecoderV3_1(final Bundle bundle, final ReadsDecoderOptions readsDecoderOptions) {
super(bundle, readsDecoderOptions);
}

@Override
public HtsVersion getVersion() {
return CRAMCodecV3_1.VERSION_3_1;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package htsjdk.beta.codecs.reads.cram.cramV3_1;

import htsjdk.beta.codecs.reads.cram.CRAMEncoder;
import htsjdk.beta.io.bundle.Bundle;
import htsjdk.beta.io.bundle.BundleResourceType;
import htsjdk.beta.plugin.HtsVersion;
import htsjdk.beta.plugin.reads.ReadsEncoderOptions;

/**
* CRAM v3.1 encoder.
*/
public class CRAMEncoderV3_1 extends CRAMEncoder {

/**
* Create a new CRAM v3.1 encoder for the given output bundle. The primary resource in the
* bundle must have content type {@link BundleResourceType#CT_ALIGNED_READS} (to find an encoder for a bundle,
* see {@link htsjdk.beta.plugin.registry.ReadsResolver}).
*
* @param outputBundle output {@link Bundle} to encode
* @param readsEncoderOptions {@link ReadsEncoderOptions} to use
*/
public CRAMEncoderV3_1(final Bundle outputBundle, final ReadsEncoderOptions readsEncoderOptions) {
super(outputBundle, readsEncoderOptions);
}

@Override
public HtsVersion getVersion() {
return CRAMCodecV3_1.VERSION_3_1;
}

}
2 changes: 2 additions & 0 deletions src/main/java/htsjdk/samtools/cram/common/CramVersions.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
public final class CramVersions {
public static final CRAMVersion CRAM_v2_1 = new CRAMVersion(2, 1);
public static final CRAMVersion CRAM_v3 = new CRAMVersion(3, 0);
public static final CRAMVersion CRAM_v3_1 = new CRAMVersion(3, 1);

final static Set<CRAMVersion> supportedCRAMVersions = new HashSet<CRAMVersion>() {{
add(CRAM_v2_1);
add(CRAM_v3);
add(CRAM_v3_1);
}};

/**
Expand Down
178 changes: 178 additions & 0 deletions src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.CRAMException;
import htsjdk.samtools.cram.compression.rans.Constants;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

public class CompressionUtils {
public static void writeUint7(final int i, final ByteBuffer cp) {
int s = 0;
int X = i;
do {
s += 7;
X >>= 7;
} while (X > 0);
do {
s -= 7;
//writeByte
final int s_ = (s > 0) ? 1 : 0;
cp.put((byte) (((i >> s) & 0x7f) + (s_ << 7)));
} while (s > 0);
}

public static int readUint7(final ByteBuffer cp) {
int i = 0;
int c;
do {
//read byte
c = cp.get();
i = (i << 7) | (c & 0x7f);
} while ((c & 0x80) != 0);
return i;
}

public static ByteBuffer encodePack(
final ByteBuffer inBuffer,
final ByteBuffer outBuffer,
final int[] frequencyTable,
final int[] packMappingTable,
final int numSymbols){
final int inSize = inBuffer.remaining();
final ByteBuffer encodedBuffer;
if (numSymbols <= 1) {
encodedBuffer = CompressionUtils.allocateByteBuffer(0);
} else if (numSymbols <= 2) {

// 1 bit per value
final int encodedBufferSize = (int) Math.ceil((double) inSize/8);
encodedBuffer = CompressionUtils.allocateByteBuffer(encodedBufferSize);
int j = -1;
for (int i = 0; i < inSize; i ++) {
if (i % 8 == 0) {
encodedBuffer.put(++j, (byte) 0);
}
encodedBuffer.put(j, (byte) (encodedBuffer.get(j) + (packMappingTable[inBuffer.get(i) & 0xFF] << (i % 8))));
}
} else if (numSymbols <= 4) {

// 2 bits per value
final int encodedBufferSize = (int) Math.ceil((double) inSize/4);
encodedBuffer = CompressionUtils.allocateByteBuffer(encodedBufferSize);
int j = -1;
for (int i = 0; i < inSize; i ++) {
if (i % 4 == 0) {
encodedBuffer.put(++j, (byte) 0);
}
encodedBuffer.put(j, (byte) (encodedBuffer.get(j) + (packMappingTable[inBuffer.get(i) & 0xFF] << ((i % 4) * 2))));
}
} else {

// 4 bits per value
final int encodedBufferSize = (int) Math.ceil((double)inSize/2);
encodedBuffer = CompressionUtils.allocateByteBuffer(encodedBufferSize);
int j = -1;
for (int i = 0; i < inSize; i ++) {
if (i % 2 == 0) {
encodedBuffer.put(++j, (byte) 0);
}
encodedBuffer.put(j, (byte) (encodedBuffer.get(j) + (packMappingTable[inBuffer.get(i) & 0xFF] << ((i % 2) * 4))));
}
}

// write numSymbols
outBuffer.put((byte) numSymbols);

// write mapping table "packMappingTable" that converts mapped value to original symbol
for(int i = 0; i < Constants.NUMBER_OF_SYMBOLS; i ++) {
if (frequencyTable[i] > 0) {
outBuffer.put((byte) i);
}
}

// write the length of data
CompressionUtils.writeUint7(encodedBuffer.limit(), outBuffer);
return encodedBuffer; // Here position = 0 since we have always accessed the data buffer using index
}

public static ByteBuffer decodePack(
final ByteBuffer inBuffer,
final byte[] packMappingTable,
final int numSymbols,
final int uncompressedPackOutputLength) {
final ByteBuffer outBufferPack = CompressionUtils.allocateByteBuffer(uncompressedPackOutputLength);
int j = 0;
if (numSymbols <= 1) {
for (int i=0; i < uncompressedPackOutputLength; i++){
outBufferPack.put(i, packMappingTable[0]);
}
}

// 1 bit per value
else if (numSymbols <= 2) {
int v = 0;
for (int i=0; i < uncompressedPackOutputLength; i++){
if (i % 8 == 0){
v = inBuffer.get(j++);
}
outBufferPack.put(i, packMappingTable[v & 1]);
v >>=1;
}
}

// 2 bits per value
else if (numSymbols <= 4){
int v = 0;
for(int i=0; i < uncompressedPackOutputLength; i++){
if (i % 4 == 0){
v = inBuffer.get(j++);
}
outBufferPack.put(i, packMappingTable[v & 3]);
v >>=2;
}
}

// 4 bits per value
else if (numSymbols <= 16){
int v = 0;
for(int i=0; i < uncompressedPackOutputLength; i++){
if (i % 2 == 0){
v = inBuffer.get(j++);
}
outBufferPack.put(i, packMappingTable[v & 15]);
v >>=4;
}
}
return outBufferPack;
}

public static ByteBuffer allocateOutputBuffer(final int inSize) {
// This calculation is identical to the one in samtools rANS_static.c
// Presumably the frequency table (always big enough for order 1) = 257*257,
// then * 3 for each entry (byte->symbol, 2 bytes -> scaled frequency),
// + 9 for the header (order byte, and 2 int lengths for compressed/uncompressed lengths).
final int compressedSize = (int) (inSize + 257 * 257 * 3 + 9);
final ByteBuffer outputBuffer = allocateByteBuffer(compressedSize);
if (outputBuffer.remaining() < compressedSize) {
throw new CRAMException("Failed to allocate sufficient buffer size for RANS coder.");
}
return outputBuffer;
}

// returns a new LITTLE_ENDIAN ByteBuffer of size = bufferSize
//TODO: rename this to allocateLittleEndianByteBuffer
public static ByteBuffer allocateByteBuffer(final int bufferSize){
return ByteBuffer.allocate(bufferSize).order(ByteOrder.LITTLE_ENDIAN);
}

// returns a LITTLE_ENDIAN ByteBuffer that is created by wrapping a byte[]
public static ByteBuffer wrap(final byte[] inputBytes){
return ByteBuffer.wrap(inputBytes).order(ByteOrder.LITTLE_ENDIAN);
}

// returns a LITTLE_ENDIAN ByteBuffer that is created by inputBuffer.slice()
public static ByteBuffer slice(final ByteBuffer inputBuffer){
return inputBuffer.slice().order(ByteOrder.LITTLE_ENDIAN);
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
package htsjdk.samtools.cram.compression;

import htsjdk.samtools.cram.compression.rans.RANS;
import htsjdk.samtools.cram.compression.fqzcomp.FQZCompDecode;
import htsjdk.samtools.cram.compression.fqzcomp.FQZCompEncode;
import htsjdk.samtools.cram.compression.fqzcomp.FQZCompExternalCompressor;
import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode;
import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationEncode;
import htsjdk.samtools.cram.compression.nametokenisation.NameTokeniserExternalCompressor;
import htsjdk.samtools.cram.compression.range.RangeDecode;
import htsjdk.samtools.cram.compression.range.RangeEncode;
import htsjdk.samtools.cram.compression.range.RangeExternalCompressor;
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Decode;
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Encode;
import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Encode;
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
import htsjdk.utils.ValidationUtils;

Expand Down Expand Up @@ -71,8 +83,24 @@ public static ExternalCompressor getCompressorForMethod(

case RANS:
return compressorSpecificArg == NO_COMPRESSION_ARG ?
new RANSExternalCompressor(new RANS()) :
new RANSExternalCompressor(compressorSpecificArg, new RANS());
new RANS4x8ExternalCompressor(new RANS4x8Encode(), new RANS4x8Decode()) :
new RANS4x8ExternalCompressor(compressorSpecificArg, new RANS4x8Encode(), new RANS4x8Decode());

case RANSNx16:
return compressorSpecificArg == NO_COMPRESSION_ARG ?
new RANSNx16ExternalCompressor(new RANSNx16Encode(), new RANSNx16Decode()) :
new RANSNx16ExternalCompressor(compressorSpecificArg, new RANSNx16Encode(), new RANSNx16Decode());

case ADAPTIVE_ARITHMETIC:
return compressorSpecificArg == NO_COMPRESSION_ARG ?
new RangeExternalCompressor(new RangeEncode(), new RangeDecode()) :
new RangeExternalCompressor(compressorSpecificArg, new RangeEncode(), new RangeDecode());

case NAME_TOKENISER:
return new NameTokeniserExternalCompressor(new NameTokenisationEncode(), new NameTokenisationDecode());

case FQZCOMP:
return new FQZCompExternalCompressor(new FQZCompEncode(), new FQZCompDecode());

case BZIP2:
ValidationUtils.validateArg(
Expand All @@ -85,5 +113,4 @@ public static ExternalCompressor getCompressorForMethod(
}
}

}

}
Loading