-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: introduces liftover capabilities
This commit is rebased and comprised of the following commits: * feat: introduces liftover (sans correct negative strand lookups) * feat: adds in correct negative strand lookups * docs: updates documentation and slightly reorganizes modules
- Loading branch information
1 parent
5cd0539
commit 4516665
Showing
21 changed files
with
3,516 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,223 @@ | ||
//! This example is a very simple approach to testing the validity of a chain | ||
//! file. To run the example, you'll need a reference FASTA, a query FASTA, as | ||
//! well as a chain file lifting over from the reference to the query. You can | ||
//! call the program like so: | ||
//! | ||
//! ``` | ||
//! cargo run --release --example chain_check <CHAIN> <REFERENCE_FASTA> <QUERY_FASTA> | ||
//! ``` | ||
//! | ||
//! If all goes well, the program will print out the total number of positions | ||
//! compared, the total number of positions with mismatches, and the overall | ||
//! match rate between the two genomes (via the chain file). In some cases, you | ||
//! will experience errors: this indicates a malformed chain file or incorrect | ||
//! pairings of chain file/reference FASTA/query FASTA. | ||
//! | ||
//! Note that this program somewhat simplifies what a "match" between the two | ||
//! genomes means. For example, all nucleotides within both FASTA files are | ||
//! mapped to 'A', 'C', 'G', 'T', or 'Other'. Lowercase and uppercase letters in | ||
//! the FASTA file have semantic meaning, but for the purposes of this | ||
//! comparison, they are considered the same. | ||
use std::collections::HashMap; | ||
use std::env; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
use std::iter::zip; | ||
|
||
use chain::core::Coordinate; | ||
use chain::core::Interval; | ||
use chain::core::Position; | ||
use chain::core::Strand; | ||
use chain::liftover; | ||
use chainfile as chain; | ||
use flate2::read::GzDecoder; | ||
use noodles::fasta; | ||
|
||
fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
let mut total_positions = 0usize; | ||
let mut mismatches = 0usize; | ||
|
||
let chain_path = env::args().nth(1).expect("missing chain path"); | ||
let reference_path = env::args().nth(2).expect("missing reference path"); | ||
let query_path = env::args().nth(3).expect("missing query path"); | ||
|
||
let chain = File::open(chain_path) | ||
.map(GzDecoder::new) | ||
.map(BufReader::new) | ||
.map(chain::Reader::new)?; | ||
|
||
let machine = liftover::machine::builder::Builder::default().try_build_from(chain)?; | ||
|
||
let mut reference = HashMap::new(); | ||
for result in fasta::reader::Builder::default() | ||
.build_from_path(reference_path)? | ||
.records() | ||
{ | ||
let record = result?; | ||
reference.insert(record.name().to_string(), record.sequence().clone()); | ||
} | ||
|
||
let mut query = HashMap::new(); | ||
for result in fasta::reader::Builder::default() | ||
.build_from_path(query_path)? | ||
.records() | ||
{ | ||
let record = result?; | ||
query.insert(record.name().to_string(), record.sequence().clone()); | ||
} | ||
|
||
for (name, reference_sequence) in reference { | ||
let interval = Interval::try_new( | ||
Coordinate::try_new(name.clone(), 0, Strand::Positive)?, | ||
Coordinate::try_new(name, reference_sequence.len(), Strand::Positive)?, | ||
)?; | ||
|
||
let results = match machine.liftover(&interval) { | ||
Some(results) => results, | ||
None => { | ||
println!( | ||
"INFO: region {} is not included in the query genome.", | ||
interval | ||
); | ||
continue; | ||
} | ||
}; | ||
|
||
for region in results { | ||
let query_sequence = query.get(region.query().contig()).unwrap_or_else(|| { | ||
panic!( | ||
"query fasta did not contain necessary contig: {}. \ | ||
Are the FASTA files and chain file correct?", | ||
region.query().contig() | ||
) | ||
}); | ||
|
||
let reference = get_sequence_for_interval(&reference_sequence, region.reference()); | ||
let query = get_sequence_for_interval(query_sequence, region.query()); | ||
|
||
assert_eq!(reference.len(), query.len()); | ||
total_positions += reference.len(); | ||
mismatches += count_mismatches(&reference, &query); | ||
} | ||
} | ||
|
||
let result = (1.0 - (mismatches as f64 / total_positions as f64)) * 100.0; | ||
println!("Total mismatches found: {}", mismatches); | ||
println!("Total positions checked: {}", total_positions); | ||
println!("Percentage match for lifted regions: {:.1}%", result); | ||
|
||
Ok(()) | ||
} | ||
|
||
fn get_sequence_for_interval( | ||
sequence: &fasta::record::Sequence, | ||
interval: &chain::core::Interval, | ||
) -> Vec<Nucleotide> { | ||
let result = sequence | ||
.slice(parse_interval(interval)) | ||
.expect("sequence lookup to succeed. Are the FASTA files and chain file correct?") | ||
.as_ref() | ||
.iter() | ||
.map(Nucleotide::from) | ||
.collect::<Vec<_>>(); | ||
|
||
match interval.strand() { | ||
Strand::Positive => result, | ||
Strand::Negative => result | ||
.into_iter() | ||
.rev() | ||
.map(|n| n.complement()) | ||
.collect::<Vec<_>>(), | ||
} | ||
} | ||
|
||
fn parse_interval(interval: &chain::core::Interval) -> noodles::core::region::Interval { | ||
let (start, end) = match interval.strand() { | ||
Strand::Positive => { | ||
let start = match interval.start().position() { | ||
Position::ZeroBased(position) => { | ||
noodles::core::Position::try_from(*position + 1).unwrap() | ||
} | ||
Position::NegativeBound => unreachable!(), | ||
}; | ||
|
||
let end = match interval.end().position() { | ||
Position::ZeroBased(position) => { | ||
noodles::core::Position::try_from(*position).unwrap() | ||
} | ||
Position::NegativeBound => unreachable!(), | ||
}; | ||
|
||
(start, end) | ||
} | ||
Strand::Negative => { | ||
let start = match interval.end().position() { | ||
Position::ZeroBased(position) => { | ||
noodles::core::Position::try_from(*position + 2).unwrap() | ||
} | ||
Position::NegativeBound => noodles::core::Position::try_from(1).unwrap(), | ||
}; | ||
|
||
let end = match interval.start().position() { | ||
Position::ZeroBased(position) => { | ||
noodles::core::Position::try_from(*position + 1).unwrap() | ||
} | ||
Position::NegativeBound => unreachable!(), | ||
}; | ||
|
||
(start, end) | ||
} | ||
}; | ||
|
||
noodles::core::region::Interval::from(start..=end) | ||
} | ||
|
||
#[derive(Debug, Eq, PartialEq)] | ||
pub enum Nucleotide { | ||
A, | ||
C, | ||
G, | ||
T, | ||
Other, | ||
} | ||
|
||
impl Nucleotide { | ||
pub fn complement(self) -> Nucleotide { | ||
match self { | ||
Nucleotide::A => Nucleotide::T, | ||
Nucleotide::C => Nucleotide::G, | ||
Nucleotide::G => Nucleotide::C, | ||
Nucleotide::T => Nucleotide::A, | ||
Nucleotide::Other => Nucleotide::Other, | ||
} | ||
} | ||
} | ||
|
||
impl From<&u8> for Nucleotide { | ||
fn from(value: &u8) -> Self { | ||
match value { | ||
b'A' => Nucleotide::A, | ||
b'a' => Nucleotide::A, | ||
b'C' => Nucleotide::C, | ||
b'c' => Nucleotide::C, | ||
b'G' => Nucleotide::G, | ||
b'g' => Nucleotide::G, | ||
b'T' => Nucleotide::T, | ||
b't' => Nucleotide::T, | ||
_ => Nucleotide::Other, | ||
} | ||
} | ||
} | ||
|
||
pub fn count_mismatches(reference: &[Nucleotide], query: &[Nucleotide]) -> usize { | ||
let mut result = 0; | ||
|
||
for (r, q) in zip(reference.iter(), query.iter()) { | ||
if r != q { | ||
result += 1; | ||
} | ||
} | ||
|
||
result | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
use std::env; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
|
||
use chain::core::Coordinate; | ||
use chain::core::Interval; | ||
use chainfile as chain; | ||
use flate2::read::GzDecoder; | ||
|
||
fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
let interval = env::args().nth(1).expect("missing interval"); | ||
let src = env::args().nth(2).expect("missing src"); | ||
|
||
let interval = interval.parse::<Interval>()?; | ||
|
||
let mut reader = File::open(src) | ||
.map(GzDecoder::new) | ||
.map(BufReader::new) | ||
.map(chain::Reader::new)?; | ||
|
||
for result in reader.sections() { | ||
let section = result?; | ||
let section_interval = Interval::try_new( | ||
Coordinate::try_new( | ||
section | ||
.header() | ||
.reference_sequence() | ||
.chromosome_name() | ||
.clone(), | ||
section.header().reference_sequence().alignment_start(), | ||
section.header().reference_sequence().strand().clone(), | ||
)?, | ||
Coordinate::try_new( | ||
section | ||
.header() | ||
.reference_sequence() | ||
.chromosome_name() | ||
.clone(), | ||
section.header().reference_sequence().alignment_end(), | ||
section.header().reference_sequence().strand().clone(), | ||
)?, | ||
)?; | ||
|
||
if section_interval.contains(interval.start()) { | ||
println!("{}", section.header()); | ||
} | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
use std::env; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
|
||
use chainfile as chain; | ||
use flate2::read::GzDecoder; | ||
|
||
fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
let src = env::args().nth(1).expect("missing src"); | ||
|
||
let mut reader = File::open(src) | ||
.map(GzDecoder::new) | ||
.map(BufReader::new) | ||
.map(chain::Reader::new)?; | ||
|
||
for result in reader.sections() { | ||
let section = result?; | ||
|
||
for result in section.stepthrough()? { | ||
let pair = result?; | ||
println!("{:?} -> {:?}", pair.reference(), pair.query()); | ||
} | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
use std::env; | ||
use std::fs::File; | ||
use std::io::BufReader; | ||
|
||
use chain::core::Interval; | ||
use chain::liftover; | ||
use chainfile as chain; | ||
use flate2::read::GzDecoder; | ||
|
||
fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
let interval = env::args().nth(1).expect("missing interval"); | ||
let src = env::args().nth(2).expect("missing src"); | ||
|
||
let interval = interval.parse::<Interval>()?; | ||
|
||
let reader = File::open(src) | ||
.map(GzDecoder::new) | ||
.map(BufReader::new) | ||
.map(chain::Reader::new)?; | ||
|
||
let machine = liftover::machine::builder::Builder::default().try_build_from(reader)?; | ||
let results = machine.liftover(&interval); | ||
|
||
match results { | ||
Some(results) => { | ||
for result in results { | ||
println!("{}", result); | ||
} | ||
} | ||
None => println!("Does not exist in new genome"), | ||
} | ||
|
||
Ok(()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
//! Core functionality used across the crate. | ||
pub mod coordinate; | ||
pub mod interval; | ||
pub mod strand; | ||
|
||
pub use coordinate::Contig; | ||
pub use coordinate::Coordinate; | ||
pub use coordinate::Position; | ||
pub use interval::Interval; | ||
pub use strand::Strand; |
Oops, something went wrong.