From 69509a98557f0f7e69f382e6c35311dc9db43f3d Mon Sep 17 00:00:00 2001 From: Pierre Marijon Date: Fri, 2 Feb 2024 13:51:49 +0100 Subject: [PATCH] feat: add generation function for fast[a|q] format --- .copier-answers.yml | 2 +- .github/workflows/test.yml | 5 +- Cargo.toml | 15 +++- Readme.md | 23 +++--- biotest_derive/src/lib.rs | 2 +- examples/biotest.rs | 50 ------------ src/constants.rs | 70 ++++++++++++++++ src/error.rs | 5 +- src/format.rs | 14 ++++ src/format/fasta.rs | 147 ++++++++++++++++++++++++++++++++++ src/format/fastq.rs | 159 +++++++++++++++++++++++++++++++++++++ src/lib.rs | 71 ++++++++++++++--- 12 files changed, 482 insertions(+), 81 deletions(-) delete mode 100644 examples/biotest.rs create mode 100644 src/constants.rs create mode 100644 src/format.rs create mode 100644 src/format/fasta.rs create mode 100644 src/format/fastq.rs diff --git a/.copier-answers.yml b/.copier-answers.yml index 50b463a..7c79182 100644 --- a/.copier-answers.yml +++ b/.copier-answers.yml @@ -13,5 +13,5 @@ forge_namespace: natir forge_repo_name: biotest msrv: '1.75' proc_macro: true -project_description: Many function to generate test data for bioinformatics data +project_description: Generate random test data for bioinformatics project_name: biotest diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7a19d37..879ea66 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,6 +24,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: check + args: --all-features test: runs-on: ${{ matrix.os }} @@ -56,7 +57,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: test - args: --no-fail-fast + args: --no-fail-fast --all-features coverage: runs-on: ubuntu-latest @@ -79,6 +80,6 @@ jobs: run: cargo tarpaulin --all-features --timeout 600 --out Xml -- --test-threads 1 - name: Upload coverage to codecov - uses: codecov/codecov-action@v1.0.3 + uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} diff --git a/Cargo.toml b/Cargo.toml index b226d13..56481d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ name = "biotest" version = "0.1.0" authors = ["Pierre Marijon "] edition = "2021" -description = "Many function to generate test data for bioinformatics data" +description = "Generate random test data for bioinformatics" rust-version = "1.75" homepage = "https://github.com/natir/biotest" @@ -14,13 +14,17 @@ readme = "Readme.md" license-file = "LICENSE" -[dependencies] +[features] +fasta = [] +fastq = [] +derive = ["dep:biotest_derive"] +[dependencies] +rand = { version = "0.8" } # Error management thiserror = { version = "1" } - # Logging and error management log = { version = "0.4" } @@ -30,12 +34,15 @@ biotest_derive = { path = "biotest_derive", optional = true } [dev-dependencies] criterion = { version = "0.5" } +tempfile = { version = "3" } # CLI management clap = { version = "4", features = ["derive"] } # Logging management stderrlog = { version = "0.5" } + + [profile.release] lto = 'thin' opt-level = 3 @@ -46,4 +53,4 @@ incremental = false [profile.profiling] inherits = "release" -debug = true \ No newline at end of file +debug = true diff --git a/Readme.md b/Readme.md index aec86e8..95192bb 100644 --- a/Readme.md +++ b/Readme.md @@ -1,22 +1,21 @@

biotest

-[![License](https://img.shields.io/badge/license-MIT-green)](https:///natir/biotest/blob/master/LICENSE) +[![License](https://img.shields.io/badge/license-MIT-green)](https://github.com/natir/biotest/blob/master/LICENSE) +![Test](https://github.com/natir/natir/workflows/Test/badge.svg) +![Lints](https://github.com/natir/natir/workflows/Lints/badge.svg) +![MSRV](https://github.com/natir/natir/workflows/MSRV/badge.svg) +[![CodeCov](https://codecov.io/gh/natir/natir/branch/master/graph/badge.svg)](https://codecov.io/gh/natir/natir) +[![Documentation](https://github.com/natir/natir/workflows/Documentation/badge.svg)](https://natir.github.io/natir/natir) +Generate random test data for bioinformatics -Many function to generate test data for bioinformatics data - -## Installation - -### From source +## Usage -```bash -git clone https:///natir/biotest.git -cd biotest -cargo install --path . +In your Cargo.toml add +```toml +biotest = { url = "https:///natir/biotest.git" } ``` -## Usage - ## Minimum supported Rust version Currently the minimum supported Rust version is 1.75. diff --git a/biotest_derive/src/lib.rs b/biotest_derive/src/lib.rs index 3afa2c1..861f13d 100644 --- a/biotest_derive/src/lib.rs +++ b/biotest_derive/src/lib.rs @@ -1,4 +1,4 @@ -//! Many function to generate test data for bioinformatics data procedural macro crate +//! Generate random test data for bioinformatics" #![warn(missing_docs)] diff --git a/examples/biotest.rs b/examples/biotest.rs deleted file mode 100644 index e201d6d..0000000 --- a/examples/biotest.rs +++ /dev/null @@ -1,50 +0,0 @@ -//! Many function to generate test data for bioinformatics data - -#![warn(missing_docs)] - -/* std use */ - -/* crate use */ - -use clap::Parser as _; - -/* project use */ -use biotest::error; - -/// Example: Many function to generate test data for bioinformatics data -#[derive(clap::Parser, std::fmt::Debug)] -#[clap( - name = "biotest", - version = "0.1", - author = "Pierre Marijon " -)] -pub struct Command { - /// Silence all output - #[clap(short = 'q', long = "quiet")] - pub quiet: bool, - - /// Verbose mode (-v, -vv, -vvv, etc) - #[clap(short = 'v', long = "verbosity", action = clap::ArgAction::Count)] - pub verbosity: usize, - - /// Timestamp (sec, ms, ns, none) - #[clap(short = 'T', long = "timestamp")] - pub ts: Option, -} - -fn main() -> error::Result<()> { - // parse cli - let params = Command::parse(); - - // Setup logger - stderrlog::new() - .quiet(params.quiet) - .verbosity(params.verbosity) - .timestamp(params.ts.unwrap_or(stderrlog::Timestamp::Off)) - .init() - .unwrap(); - - log::trace!("Hello, word!"); - - Ok(()) -} diff --git a/src/constants.rs b/src/constants.rs new file mode 100644 index 0000000..5d8ca55 --- /dev/null +++ b/src/constants.rs @@ -0,0 +1,70 @@ +//! Declarations of some constants value + +/* std use */ + +/* crates use */ + +/* projet use */ + +const fn gen_array() -> [u8; N] { + let mut array = [0; N]; + + let mut i = 0; + while i < N { + array[i] = (B + i) as u8; + i += 1; + } + + array +} + +/// Fixed random seed +pub const SEED: [u8; 32] = [42; 32]; + +/// Nucleotides with any case +pub const NUCLEOTIDES: [u8; 8] = *b"ACTGactg"; + +/// Nucleotides lower +pub const NUCLEOTIDES_LOWER: [u8; 4] = *b"actg"; + +/// Nucleotides upper +pub const NUCLEOTIDES_UPPER: [u8; 4] = *b"ACTG"; + +/// All possible phred 33 value +pub const PHRED33: [u8; 40] = gen_array::<40, 33>(); + +/// All possible phred 64 value +pub const PHRED64: [u8; 40] = gen_array::<40, 64>(); + +/// Alphabets with [ \ ] ^ _ ` +pub const ALPHABETS: [u8; 58] = gen_array::<58, 65>(); + +#[cfg(test)] +mod tests { + /* project use */ + use super::*; + + #[test] + fn phred33() { + assert_eq!( + gen_array::<40, 33>().to_vec(), + b"!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGH".to_vec() + ); + } + + #[test] + fn phred64() { + assert_eq!( + gen_array::<40, 64>().to_vec(), + b"@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefg".to_vec() + ); + } + + #[test] + fn alphapets() { + assert_eq!( + gen_array::<58, 65>().to_vec(), + b"ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz".to_vec() + ); + } +} diff --git a/src/error.rs b/src/error.rs index 3244199..4b8e9f4 100644 --- a/src/error.rs +++ b/src/error.rs @@ -6,7 +6,10 @@ use thiserror; /// Enum to manage error #[derive(std::fmt::Debug, thiserror::Error)] pub enum Error { - } + /// std::io::Error error + #[error(transparent)] + StdIo(#[from] std::io::Error), +} /// Alias of result pub type Result = core::result::Result; diff --git a/src/format.rs b/src/format.rs new file mode 100644 index 0000000..602578a --- /dev/null +++ b/src/format.rs @@ -0,0 +1,14 @@ +//! Format data generation + +/* std use */ + +/* crates use */ + +/* module declaration */ +#[cfg(feature = "fasta")] +pub mod fasta; + +#[cfg(feature = "fastq")] +pub mod fastq; + +/* projet use */ diff --git a/src/format/fasta.rs b/src/format/fasta.rs new file mode 100644 index 0000000..04621e0 --- /dev/null +++ b/src/format/fasta.rs @@ -0,0 +1,147 @@ +//! Fasta generation + +/* std use */ + +/* crates use */ + +/* projet use */ +use crate::error; + +fn description( + output: &mut W, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, +) -> error::Result<()> +where + W: std::io::Write, +{ + output.write_all(&[b'>'])?; + crate::text(output, rng, id)?; + output.write_all(&[b' '])?; + crate::text(output, rng, comment)?; + + Ok(()) +} + +/// Write record +pub fn record( + output: &mut W, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, + seq_len: usize, +) -> error::Result<()> +where + W: std::io::Write, +{ + description(output, rng, id, comment)?; + output.write_all(&[b'\n'])?; + crate::sequence(output, rng, seq_len)?; + + Ok(()) +} + +/// Write multiple record +pub fn records( + output: &mut W, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, + seq_len: usize, + num_record: usize, +) -> error::Result<()> +where + W: std::io::Write, +{ + for _ in 0..num_record { + record(output, rng, id, comment, seq_len)?; + output.write_all(&[b'\n'])?; + } + + Ok(()) +} + +/// Create a fasta file +pub fn create

( + path: P, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, + seq_len: usize, + num_record: usize, +) -> error::Result<()> +where + P: std::convert::AsRef, +{ + let mut output = std::fs::File::create(&path)?; + + records(&mut output, rng, id, comment, seq_len, num_record)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + /* std use */ + use std::io::Read; + + /* project use */ + use super::*; + + const TRUTH: &[u8] = b">oNi_P dzwC[tBTlD +tCGCgtGTTAGTTAagccAcggtAatGcTtgtaCgcAGgAtaTcgAAtTa +>rQ_[V S^RtSvzMeT +ttGCtCatGtctgCTGGTACtgTgcaaaagggGAGacAtgCtGCAAtTac +>HYNm[ QBCgL`Scxx +GGtatTCaTCctcTGgAActTgCGAcaAgaAAtaTCCcAgagggaCcttC +>gNXcb hRd]QWyFOg +gAACcTtCttAacGtTtAtGTgACAGCCaCGctGagattTGtgCttaAGg +>ppugI LwOFhYRxBZ +CTGTCCACgTTTGagtGaGCatAGGACAAaacTaTTagagGtatAGCcTa +"; + + #[test] + fn record_() -> error::Result<()> { + let mut output = Vec::new(); + let mut rng = crate::rand(); + + record(&mut output, &mut rng, 5, 10, 50)?; + + assert_eq!(output, TRUTH.to_vec()[..68]); + + Ok(()) + } + + #[test] + fn records_() -> error::Result<()> { + let mut output = Vec::new(); + let mut rng = crate::rand(); + + records(&mut output, &mut rng, 5, 10, 50, 5)?; + + assert_eq!(output, TRUTH); + + Ok(()) + } + + #[test] + fn create_() -> error::Result<()> { + let mut rng = crate::rand(); + + let temp_dir = tempfile::tempdir()?; + let temp_path = temp_dir.path(); + + let temp_file = temp_path.join("tmp.fasta"); + + create(&temp_file, &mut rng, 5, 10, 50, 5)?; + + let mut data = Vec::new(); + let mut input = std::fs::File::open(&temp_file)?; + input.read_to_end(&mut data)?; + + assert_eq!(data, TRUTH); + + Ok(()) + } +} diff --git a/src/format/fastq.rs b/src/format/fastq.rs new file mode 100644 index 0000000..f610ba2 --- /dev/null +++ b/src/format/fastq.rs @@ -0,0 +1,159 @@ +//! Fastq generation + +/* std use */ + +/* crates use */ + +/* projet use */ +use crate::error; + +fn description( + output: &mut W, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, +) -> error::Result<()> +where + W: std::io::Write, +{ + output.write_all(&[b'@'])?; + crate::text(output, rng, id)?; + output.write_all(&[b' '])?; + crate::text(output, rng, comment)?; + + Ok(()) +} + +/// Write record +pub fn record( + output: &mut W, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, + seq_len: usize, +) -> error::Result<()> +where + W: std::io::Write, +{ + description(output, rng, id, comment)?; + output.write_all(&[b'\n'])?; + crate::sequence(output, rng, seq_len)?; + output.write_all(b"\n+\n")?; + crate::quality(output, rng, seq_len)?; + + Ok(()) +} + +/// Write multiple record +pub fn records( + output: &mut W, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, + seq_len: usize, + num_record: usize, +) -> error::Result<()> +where + W: std::io::Write, +{ + for _ in 0..num_record { + record(output, rng, id, comment, seq_len)?; + output.write_all(&[b'\n'])?; + } + + Ok(()) +} + +/// Create a fasta file +pub fn create

( + path: P, + rng: &mut rand::rngs::StdRng, + id: usize, + comment: usize, + seq_len: usize, + num_record: usize, +) -> error::Result<()> +where + P: std::convert::AsRef, +{ + let mut output = std::fs::File::create(&path)?; + + records(&mut output, rng, id, comment, seq_len, num_record)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + /* std use */ + use std::io::Read; + + /* project use */ + use super::*; + + const TRUTH: &[u8] = b"@oNi_P dzwC[tBTlD +tCGCgtGTTAGTTAagccAcggtAatGcTtgtaCgcAGgAtaTcgAAtTa ++ +,30C5-D.$.=A@2/&=\'6A0A$@D&4,1+=!/\'@ED:C577DF%\"%>.0 +@k_sGD gZcCc]tIGE +tGCAAtTacCGtTAAcaGGtatTCaTCctcTGgAActTgCGAcaAgaAAt ++ +6+9#(7E7-\"=BH3?\"6;%13=A-?!2FH!>\"1\'%)) +@K`HVk goY`vkxarZ +ttTGtgCttaAGggTcCTGcGTAGCTGTCCACgTTTGagtGaGCatAGGA ++ +\'!H\":,=$*$6*-95FH5D2?BA,+@58%75BH0D?G0+@E&?D>\")&,B +@F_fww GspJRS\\aPw +TCAGgCtaGTtcCCTcgcTgAgGgAtCAAatTCTATTGTaggcGCaCcCG ++ +A=(@9!DA+-D/,:*B7C+\'=07$C&&C9%H;B=!6&>1\"AD6+2#?54/ +@Uz^rc VndZg_IpyM +tGcTAGCCAgaTTgcAaTtaTGgACTTagGgtATACCtcTctCAtgCGCa ++ +D,\',GB55&(!**$F=@0?3G183F?>6<.C$$6AB2FH4#E<1?-@$.+ +"; + + #[test] + fn record_() -> error::Result<()> { + let mut output = Vec::new(); + let mut rng = crate::rand(); + + record(&mut output, &mut rng, 5, 10, 50)?; + + assert_eq!(output, TRUTH.to_vec()[..121]); + + Ok(()) + } + + #[test] + fn records_() -> error::Result<()> { + let mut output = Vec::new(); + let mut rng = crate::rand(); + + records(&mut output, &mut rng, 5, 10, 50, 5)?; + + assert_eq!(output, TRUTH.to_vec()); + + Ok(()) + } + + #[test] + fn create_() -> error::Result<()> { + let mut rng = crate::rand(); + + let temp_dir = tempfile::tempdir()?; + let temp_path = temp_dir.path(); + + let temp_file = temp_path.join("tmp.fasta"); + + create(&temp_file, &mut rng, 5, 10, 50, 5)?; + + let mut data = Vec::new(); + let mut input = std::fs::File::open(&temp_file)?; + input.read_to_end(&mut data)?; + + assert_eq!(data, TRUTH.to_vec()); + + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index fe2f77e..8ea101f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,24 +1,75 @@ -//! Many function to generate test data for bioinformatics data +//! Generate random test data for bioinformatics #![warn(missing_docs)] /* std use */ /* crate use */ +use rand::seq::SliceRandom; +use rand::SeedableRng; /* project use */ #[cfg(feature = "derive")] -pub use biommap_derive as derive; - +pub use biotest_derive as derive; /* mod declaration */ +pub mod constants; pub mod error; +pub mod format; + +/// Generate random generator +pub fn rand() -> rand::rngs::StdRng { + rand::rngs::StdRng::from_seed(constants::SEED) +} + +/// Write random text of length in output +pub fn text(output: &mut W, rng: &mut rand::rngs::StdRng, length: usize) -> error::Result<()> +where + W: std::io::Write, +{ + output.write_all( + &(0..length) + .map(|_| { + *constants::ALPHABETS + .choose(rng) + .unwrap_or_else(|| unreachable!()) + }) + .collect::>(), + )?; + + Ok(()) +} + +fn sequence(output: &mut W, rng: &mut rand::rngs::StdRng, length: usize) -> error::Result<()> +where + W: std::io::Write, +{ + output.write_all( + &(0..length) + .map(|_| { + *constants::NUCLEOTIDES + .choose(rng) + .unwrap_or_else(|| unreachable!()) + }) + .collect::>(), + )?; + + Ok(()) +} + +fn quality(output: &mut W, rng: &mut rand::rngs::StdRng, length: usize) -> error::Result<()> +where + W: std::io::Write, +{ + output.write_all( + &(0..length) + .map(|_| { + *constants::PHRED33 + .choose(rng) + .unwrap_or_else(|| unreachable!()) + }) + .collect::>(), + )?; -#[cfg(test)] -mod tests { - #[test] - fn it_works() { - let result = 2 + 2; - assert_eq!(result, 4); - } + Ok(()) }