Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

revise: removes custom header checks #33

Merged
merged 1 commit into from
Mar 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/utils/formats/bam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,7 @@ where
// (3) Parse the header and reference sequences.
debug!("parsing the header and reference sequences");
let raw_header = reader.read_header().with_context(|| "reading BAM header")?;
let parsed_header =
super::sam::parse_header(raw_header.clone()).with_context(|| "parsing BAM header")?;
let parsed_header = raw_header.parse().with_context(|| "parsing BAM header")?;
let reference_sequences = reader
.read_reference_sequences()
.with_context(|| "reading BAM reference sequences")?;
Expand Down Expand Up @@ -209,8 +208,7 @@ where
.read_header()
.await
.with_context(|| "reading BAM header")?;
let parsed_header =
super::sam::parse_header(raw_header.clone()).with_context(|| "parsing BAM header")?;
let parsed_header = raw_header.parse().with_context(|| "parsing BAM header")?;
let reference_sequences = reader
.read_reference_sequences()
.await
Expand Down
6 changes: 2 additions & 4 deletions src/utils/formats/cram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,7 @@ where
let raw_header = reader
.read_file_header()
.with_context(|| "reading CRAM header")?;
let parsed_header =
super::sam::parse_header(raw_header.clone()).with_context(|| "parsing CRAM header")?;
let parsed_header = raw_header.parse().with_context(|| "parsing CRAM header")?;

// (4) Return the result.
Ok(ParsedCRAMFile {
Expand Down Expand Up @@ -207,8 +206,7 @@ where
.read_file_header()
.await
.with_context(|| "reading CRAM header")?;
let parsed_header =
super::sam::parse_header(raw_header.clone()).with_context(|| "parsing CRAM header")?;
let parsed_header = raw_header.parse().with_context(|| "parsing CRAM header")?;

// (4) Return the result.
Ok(ParsedAsyncCRAMFile {
Expand Down
54 changes: 2 additions & 52 deletions src/utils/formats/sam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,44 +6,12 @@ use std::path::Path;
use anyhow::bail;
use anyhow::Context;
use noodles::sam;
use regex::Captures;
use regex::Regex;
use tracing::debug;

use crate::utils::formats::utils::RawAndParsedHeaders;

use super::BioinformaticsFileFormat;

//=================//
// Utility Methods //
//=================//

/// Corrects common header mistakes in raw SAM/BAM/CRAM header text.
///
/// Currently a single correction is applied: the value of every `PL`
/// (platform) field is rewritten in uppercase, so that e.g. `PL:illumina`
/// becomes the expected `PL:ILLUMINA`.
///
/// Only the `PL` value itself is touched. The value ends at the next tab or
/// line break, so fields that follow `PL` on the same line (sample names,
/// library names, ...) are preserved exactly as written.
///
/// Takes ownership of the raw header text and returns the corrected text.
pub fn correct_common_header_mistakes(header: String) -> String {
    // A field is introduced by a tab followed by its two-letter tag and a colon.
    const PLATFORM_TAG: &str = "\tPL:";

    let mut corrected = String::with_capacity(header.len());
    let mut remaining = header.as_str();

    while let Some(tag_start) = remaining.find(PLATFORM_TAG) {
        // Copy everything up to and including the tag verbatim.
        let value_start = tag_start + PLATFORM_TAG.len();
        corrected.push_str(&remaining[..value_start]);

        // The value runs until the next field separator or the end of the line.
        let after_tag = &remaining[value_start..];
        let value_len = after_tag
            .find(|c: char| matches!(c, '\t' | '\r' | '\n'))
            .unwrap_or(after_tag.len());
        corrected.push_str(&after_tag[..value_len].to_uppercase());

        remaining = &after_tag[value_len..];
    }

    // Whatever follows the last `PL` field (or the whole header when there is
    // none) is copied through unchanged.
    corrected.push_str(remaining);
    corrected
}

/// Builds a [`sam::Header`] from raw SAM/BAM/CRAM header text.
///
/// The text is first passed through [`correct_common_header_mistakes`] and the
/// corrected text is then parsed.
///
/// # Errors
///
/// Returns an error with the context `could not parse SAM/BAM/CRAM header`
/// when the corrected text cannot be parsed as a header.
pub fn parse_header(header: String) -> anyhow::Result<sam::Header> {
    correct_common_header_mistakes(header)
        .parse::<sam::Header>()
        .with_context(|| "could not parse SAM/BAM/CRAM header")
}

//====================================//
// Sequence Alignment Map (SAM) files //
//====================================//
Expand Down Expand Up @@ -102,7 +70,7 @@ where
// (2) Parse the header.
debug!("parsing the header");
let raw_header = reader.read_header()?;
let parsed_header = parse_header(raw_header.clone()).with_context(|| "parsing SAM header")?;
let parsed_header = raw_header.parse().with_context(|| "parsing SAM header")?;

// (3) Return the result.
Ok(ParsedSAMFile {
Expand Down Expand Up @@ -176,7 +144,7 @@ where
// (2) Parse the header.
debug!("parsing the header");
let raw_header = reader.read_header().await?;
let parsed_header = parse_header(raw_header.clone()).with_context(|| "parsing SAM header")?;
let parsed_header = raw_header.parse().with_context(|| "parsing SAM header")?;

// (3) Return the result.
Ok(ParsedAsyncSAMFile {
Expand All @@ -187,21 +155,3 @@ where
},
})
}

//=======//
// Tests //
//=======//

#[cfg(test)]
mod tests {

    use super::*;

    /// A lowercase `PL` value in a read group line comes back in uppercase.
    #[test]
    pub fn test_illumina_lowercase_fix() {
        let corrected =
            correct_common_header_mistakes(String::from("@RG\tID:rg0\tPL:illumina\n"));

        assert_eq!(corrected, "@RG\tID:rg0\tPL:ILLUMINA\n");
    }
}
3 changes: 1 addition & 2 deletions src/view/cram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ use tokio::io;
use tokio::io::AsyncWriteExt;
use tracing::debug;

use crate::utils::formats::sam::parse_header;
use crate::utils::pathbuf::AppendExtension;
use crate::view::command::Mode;

Expand Down Expand Up @@ -82,7 +81,7 @@ pub async fn view(
}

// (7) Parses the header text.
let header = parse_header(ht).with_context(|| "parsing CRAM header")?;
let header = ht.parse().with_context(|| "parsing CRAM header")?;

// (8) Writes the records to the output stream.
let mut writer = sam::AsyncWriter::new(handle);
Expand Down