Skip to content

Commit

Permalink
feat: add vcf generate
Browse files Browse the repository at this point in the history
  • Loading branch information
natir committed Feb 2, 2024
1 parent 49c5bef commit 8fc300d
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ license-file = "LICENSE"
[features]
fasta = []
fastq = []
vcf = []
derive = ["dep:biotest_derive"]

[dependencies]
Expand Down
14 changes: 14 additions & 0 deletions src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,20 @@ pub const PHRED64: [u8; 40] = gen_array::<40, 64>();
/// Alphabets with [ \ ] ^ _ `
pub const ALPHABETS: [u8; 58] = gen_array::<58, 65>();

/// Some different possible chromosomes name
pub const CHROMOSOMES: [&[u8]; 10] = [
b"chr1",
b"23",
b"93",
b"chrMT",
b"X",
b"NC_000015.10",
b"ENA|LT795502|LT795502.1",
b"NC_016845.1",
b"YAR028W",
b"1",
];

#[cfg(test)]
mod tests {
/* project use */
Expand Down
3 changes: 3 additions & 0 deletions src/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,7 @@ pub mod fasta;
#[cfg(feature = "fastq")]
pub mod fastq;

#[cfg(feature = "vcf")]
pub mod vcf;

/* projet use */
194 changes: 194 additions & 0 deletions src/format/vcf.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
//! VCF generation
/* std use */

/* crates use */
use rand::seq::SliceRandom as _;
use rand::Rng as _;

/* projet use */
use crate::constants;
use crate::error;

/// Generate vcf header
fn header<W>(output: &mut W) -> error::Result<()>
where
W: std::io::Write,
{
output.write_all(b"##fileformat=VCFv4.3\n")?;
for chr in constants::CHROMOSOMES {
output.write_all(b"##contig=<ID=")?;
output.write_all(chr)?;
output.write_all(b",length=2147483648,species=\"random\">\n")?;
}

output.write_all(b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")?;

Ok(())
}

/// Generate vcf record
fn record<W>(output: &mut W, rng: &mut rand::rngs::StdRng) -> error::Result<()>
where
W: std::io::Write,
{
// chromosomes
output.write_all(constants::CHROMOSOMES.choose(rng).unwrap())?;
output.write_all(b"\t")?;

// position
output.write_all(
&rng.gen_range(0..i32::MAX)
.to_string()
.bytes()
.collect::<Vec<u8>>(),
)?;

// identifiant
output.write_all(b"\t.\t")?;

// reference
output.write_all(&[*constants::NUCLEOTIDES.choose(rng).unwrap()])?;
output.write_all(b"\t")?;

// alternatif
let alt_len = rng.gen_range(1..5);
crate::sequence(output, rng, alt_len)?;
output.write_all(b"\t")?;

// quality
output.write_all(
&rng.gen_range(0..255)
.to_string()
.bytes()
.collect::<Vec<u8>>(),
)?;
output.write_all(b"\t")?;

// filter
output.write_all(b".\t")?;

// info
output.write_all(b"AC=")?;
output.write_all(
&rng.gen_range(0..i32::MAX)
.to_string()
.bytes()
.collect::<Vec<u8>>(),
)?;

Ok(())
}

/// Write multiple record
pub fn records<W>(
output: &mut W,
rng: &mut rand::rngs::StdRng,
num_record: usize,
) -> error::Result<()>
where
W: std::io::Write,
{
for _ in 0..num_record {
record(output, rng)?;
output.write_all(&[b'\n'])?;
}

Ok(())
}

/// Create a vcf file
pub fn create<P>(path: P, rng: &mut rand::rngs::StdRng, num_record: usize) -> error::Result<()>
where
P: std::convert::AsRef<std::path::Path>,
{
let mut output = std::fs::File::create(&path)?;

header(&mut output)?;
records(&mut output, rng, num_record)?;

Ok(())
}

#[cfg(test)]
mod tests {
/* std use */
use std::io::Read as _;

/* project use */
use super::*;

const TRUTH: &[u8] = b"##fileformat=VCFv4.3
##contig=<ID=chr1,length=2147483648,species=\"random\">
##contig=<ID=23,length=2147483648,species=\"random\">
##contig=<ID=93,length=2147483648,species=\"random\">
##contig=<ID=chrMT,length=2147483648,species=\"random\">
##contig=<ID=X,length=2147483648,species=\"random\">
##contig=<ID=NC_000015.10,length=2147483648,species=\"random\">
##contig=<ID=ENA|LT795502|LT795502.1,length=2147483648,species=\"random\">
##contig=<ID=NC_016845.1,length=2147483648,species=\"random\">
##contig=<ID=YAR028W,length=2147483648,species=\"random\">
##contig=<ID=1,length=2147483648,species=\"random\">
#CHROM POS ID REF ALT QUAL FILTER INFO
YAR028W 509242864 . a ATg 6 . AC=730431288
NC_016845.1 127722615 . t GTTA 97 . AC=549947617
chr1 993087666 . a cAcg 122 . AC=1985475776
chr1 223251961 . a cT 204 . AC=1940638485
23 1676586806 . a AG 248 . AC=1909766224
";

#[test]
fn header_() -> error::Result<()> {
let mut output = Vec::new();

header(&mut output)?;

assert_eq!(output, TRUTH[..628]);

Ok(())
}

#[test]
fn record_() -> error::Result<()> {
let mut output = Vec::new();
let mut rng = crate::rand();

record(&mut output, &mut rng)?;

assert_eq!(output, TRUTH[628..670]);

Ok(())
}

#[test]
fn records_() -> error::Result<()> {
let mut output = Vec::new();
let mut rng = crate::rand();

records(&mut output, &mut rng, 5)?;

assert_eq!(output, TRUTH[628..]);

Ok(())
}

#[test]
fn create_() -> error::Result<()> {
let mut rng = crate::rand();

let temp_dir = tempfile::tempdir()?;
let temp_path = temp_dir.path();

let temp_file = temp_path.join("tmp.vcf");

create(&temp_file, &mut rng, 5)?;

let mut data = Vec::new();
let mut input = std::fs::File::open(&temp_file)?;
input.read_to_end(&mut data)?;

assert_eq!(data, TRUTH);

Ok(())
}
}

0 comments on commit 8fc300d

Please sign in to comment.