From 8f9d90754fa28774ad7e048953e6513995368bb9 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:22:11 -0700 Subject: [PATCH 01/13] chore: cleaning --- CONTRIBUTING.md | 2 +- docs/ft--help.md | 14 ++----- docs/ft-add-nucleosomes-help.md | 21 ++++------ docs/ft-center-help.md | 60 +++++++++++++++++++--------- docs/ft-extract-help.md | 70 ++++++++++++++++++++++++--------- docs/ft-fire-help.md | 43 +++++++------------- docs/ft-footprint-help.md | 22 +++++++++++ docs/ft-predict-m6a-help.md | 3 +- docs/ft-strip-basemods-help.md | 3 +- docs/make_help_docs.sh | 2 +- src/cli.rs | 63 +++++++++++++++++------------ src/main.rs | 27 ++++++++----- 12 files changed, 198 insertions(+), 132 deletions(-) create mode 100644 docs/ft-footprint-help.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cca8e2ca..e3c62328 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,7 +3,7 @@ Please feel free to open PRs! But first make sure you code passes tests, and ple ```bash cargo test --all-features ``` -Also format your code and check it with clingy: +Also format your code and check it with clippy before submitting a PR: ```bash cargo fmt cargo clippy --workspace diff --git a/docs/ft--help.md b/docs/ft--help.md index 887d9886..9c8f1e8e 100644 --- a/docs/ft--help.md +++ b/docs/ft--help.md @@ -4,18 +4,12 @@ Fiber-seq toolkit in rust Usage: ft [OPTIONS] Commands: - predict-m6a Predict m6A positions using HiFi kinetics data and encode the results in the MM and ML bam tags. Also - adds nucleosome (nl, ns) and MTase sensitive patches (al, as) [aliases: m6A, m6a] + predict-m6a Predict m6A positions using HiFi kinetics data and encode the results in the MM and ML bam tags. 
Also adds nucleosome (nl, ns) and MTase sensitive patches (al, as) [aliases: m6A, m6a] add-nucleosomes Add nucleosomes to a bam file with m6a predictions fire Add FIREs (Fiber-seq Inferred Regulatory Elements) to a bam file with m6a predictions - footprint Add footprints to a bam file with m6a predictions - extract Extract fiberseq data into plain text files. See - https://fiberseq.github.io/fibertools-rs/docs/extract.html for a description of the outputs [aliases: - ex, e] - center This command centers fiberseq data around given reference positions. This is useful for making - aggregate m6A and CpG observations, as well as visualization of SVs. See - https://fiberseq.github.io/fibertools-rs/docs/center.html for a description of the output [aliases: - c, ct] + extract Extract fiberseq data into plain text files [aliases: ex, e] + center This command centers fiberseq data around given reference positions. This is useful for making aggregate m6A and CpG observations, as well as visualization of SVs [aliases: c, ct] + footprint Infer footprints from fiberseq data track-decorators Make decorated bed files for fiberseq data clear-kinetics Remove HiFi kinetics tags from the input bam file strip-basemods Strip out select base modifications diff --git a/docs/ft-add-nucleosomes-help.md b/docs/ft-add-nucleosomes-help.md index 316445ba..693a15c9 100644 --- a/docs/ft-add-nucleosomes-help.md +++ b/docs/ft-add-nucleosomes-help.md @@ -8,20 +8,13 @@ Arguments: [OUT] Output bam file with nucleosome calls [default: -] Options: - -n, --nucleosome-length - Minium nucleosome length [default: 75] - -c, --combined-nucleosome-length - Minium nucleosome length when combining over a single m6A [default: 100] - -m, --min-distance-added - Minium distance needed to add to an already existing nuc by crossing an m6a [default: 25] - -d, --distance-from-end - Minimum distance from the end of a fiber to call a nucleosome or MSP [default: 45] - --min-ml-score - Minium score in the ML tag to use in 
predicting nucleosomes [default: 125] - -h, --help - Print help - -V, --version - Print version + -n, --nucleosome-length Minium nucleosome length [default: 75] + -c, --combined-nucleosome-length Minium nucleosome length when combining over a single m6A [default: 100] + -m, --min-distance-added Minium distance needed to add to an already existing nuc by crossing an m6a [default: 25] + -d, --distance-from-end Minimum distance from the end of a fiber to call a nucleosome or MSP [default: 45] + --min-ml-score Minium score in the ML tag to use in predicting nucleosomes [default: 125] + -h, --help Print help + -V, --version Print version Global-Options: -t, --threads Threads [default: 8] diff --git a/docs/ft-center-help.md b/docs/ft-center-help.md index 553d4312..b4ba377d 100644 --- a/docs/ft-center-help.md +++ b/docs/ft-center-help.md @@ -1,31 +1,53 @@ ``` -This command centers fiberseq data around given reference positions. This is useful for making aggregate m6A and CpG -observations, as well as visualization of SVs. See https://fiberseq.github.io/fibertools-rs/docs/center.html for a -description of the output +This command centers fiberseq data around given reference positions. This is useful for making aggregate m6A and CpG observations, as well as visualization of SVs. + +See https://fiberseq.github.io/fibertools-rs/docs/center.html for a description of the output. Usage: ft center [OPTIONS] Arguments: - Aligned Fiber-seq bam file - Bed file on which to center fiberseq reads. Data is adjusted to the start position of the bed file and corrected for - strand if the strand is indicated in the 6th column of the bed file. The 4th column will also be checked for the - strand but only after the 6th is. If you include strand information in the 4th (or 6th) column it will orient data - accordingly and use the end position of bed record instead of the start if on the minus strand. 
This means that - profiles of motifs in both the forward and minus orientation will align to the same central position + + Aligned Fiber-seq bam file + + + Bed file on which to center fiberseq reads. Data is adjusted to the start position of the bed file and corrected for strand if the strand is indicated in the 6th column of the bed file. The 4th column will also be + checked for the strand but only after the 6th is. If you include strand information in the 4th (or 6th) column it will orient data accordingly and use the end position of bed record instead of the start if on the + minus strand. This means that profiles of motifs in both the forward and minus orientation will align to the same central position Options: - -m, --min-ml-score Minium score in the ML tag to include in the output [default: 125] - -d, --dist Set a maximum distance from the start of the motif to keep a feature - -w, --wide Provide data in wide format, one row per read - -r, --reference Return relative reference position instead of relative molecular position - -s, --simplify Replace the sequence output column with just "N" - -h, --help Print help - -V, --version Print version + -m, --min-ml-score + Minium score in the ML tag to include in the output + + [default: 125] + + -d, --dist + Set a maximum distance from the start of the motif to keep a feature + + -w, --wide + Provide data in wide format, one row per read + + -r, --reference + Return relative reference position instead of relative molecular position + + -s, --simplify + Replace the sequence output column with just "N" + + -h, --help + Print help (see a summary with '-h') + + -V, --version + Print version Global-Options: - -t, --threads Threads [default: 8] + -t, --threads + Threads + + [default: 8] Debug-Options: - -v, --verbose... Logging level [-v: Info, -vv: Debug, -vvv: Trace] - --quiet Turn off all logging + -v, --verbose... 
+ Logging level [-v: Info, -vv: Debug, -vvv: Trace] + + --quiet + Turn off all logging ``` diff --git a/docs/ft-extract-help.md b/docs/ft-extract-help.md index 1de08b75..3f62d122 100644 --- a/docs/ft-extract-help.md +++ b/docs/ft-extract-help.md @@ -1,32 +1,66 @@ ``` -Extract fiberseq data into plain text files. See https://fiberseq.github.io/fibertools-rs/docs/extract.html for a description -of the outputs +Extract fiberseq data into plain text files. + +See https://fiberseq.github.io/fibertools-rs/docs/extract.html for a description of the outputs. Usage: ft extract [OPTIONS] [BAM] Arguments: - [BAM] Input fiberseq bam file. If no path is provided extract will read bam data from stdin [default: -] + [BAM] + Input fiberseq bam file. If no path is provided extract will read bam data from stdin + + [default: -] Options: - -r, --reference Report in reference sequence coordinates - --molecular Report positions in the molecular sequence coordinates - -m, --min-ml-score Minium score in the ML tag to include in the output [default: 125] - --m6a Output path for m6a bed12 - -c, --cpg Output path for 5mC (CpG, primrose) bed12 - --msp Output path for methylation sensitive patch (msp) bed12 - -n, --nuc Output path for nucleosome bed12 - -a, --all Output path for a tabular format including "all" fiberseq information in the bam - -h, --help Print help - -V, --version Print version + -r, --reference + Report in reference sequence coordinates + + --molecular + Report positions in the molecular sequence coordinates + + -m, --min-ml-score + Minium score in the ML tag to include in the output + + [default: 125] + + --m6a + Output path for m6a bed12 + + -c, --cpg + Output path for 5mC (CpG, primrose) bed12 + + --msp + Output path for methylation sensitive patch (msp) bed12 + + -n, --nuc + Output path for nucleosome bed12 + + -a, --all + Output path for a tabular format including "all" fiberseq information in the bam + + -h, --help + Print help (see a summary with '-h') + + -V, 
--version + Print version All-Format-Options: - -q, --quality Include per base quality scores in "fiber_qual" - -s, --simplify Simplify output by remove fiber sequence + -q, --quality + Include per base quality scores in "fiber_qual" + + -s, --simplify + Simplify output by remove fiber sequence Global-Options: - -t, --threads Threads [default: 8] + -t, --threads + Threads + + [default: 8] Debug-Options: - -v, --verbose... Logging level [-v: Info, -vv: Debug, -vvv: Trace] - --quiet Turn off all logging + -v, --verbose... + Logging level [-v: Info, -vv: Debug, -vvv: Trace] + + --quiet + Turn off all logging ``` diff --git a/docs/ft-fire-help.md b/docs/ft-fire-help.md index 3273f46b..c887cc17 100644 --- a/docs/ft-fire-help.md +++ b/docs/ft-fire-help.md @@ -8,35 +8,20 @@ Arguments: [OUT] Output file (bam by default, table if --feats_to_text is used, and bed9 + if --extract is used) [default: -] Options: - -e, --extract - Output just FIRE elements in bed9 format - -s, --skip-no-m6a - Don't write reads with no m6A calls to the output bam - --min-msp - Skip reads without at least `N` MSP calls [default: 0] - --min-ave-msp-size - Skip reads without an average MSP size greater than `N` [default: 0] - -w, --width-bin - Width of bin for feature collection [default: 40] - -b, --bin-num - Number of bins to collect [default: 9] - --best-window-size - Calculate stats for the highest X bp window within each MSP Should be a fair amount higher than the expected linker - length [default: 100] - -u, --use-5mc - Use 5mC data in FIREs - -m, --min-msp-length-for-positive-fire-call - Minium length of msp to call a FIRE [default: 85] - --model - optional path to a model json file - --fdr-table - Optional path to a FDR table - -f, --feats-to-text - Output FIREs features for training in a table format - -h, --help - Print help - -V, --version - Print version + -e, --extract Output just FIRE elements in bed9 format + -s, --skip-no-m6a Don't write reads with no m6A calls to the output bam + 
--min-msp Skip reads without at least `N` MSP calls [default: 0] + --min-ave-msp-size Skip reads without an average MSP size greater than `N` [default: 0] + -w, --width-bin Width of bin for feature collection [default: 40] + -b, --bin-num Number of bins to collect [default: 9] + --best-window-size Calculate stats for the highest X bp window within each MSP Should be a fair amount higher than the expected linker length [default: 100] + -u, --use-5mc Use 5mC data in FIREs + -m, --min-msp-length-for-positive-fire-call Minium length of msp to call a FIRE [default: 85] + --model optional path to a model json file + --fdr-table Optional path to a FDR table + -f, --feats-to-text Output FIREs features for training in a table format + -h, --help Print help + -V, --version Print version Global-Options: -t, --threads Threads [default: 8] diff --git a/docs/ft-footprint-help.md b/docs/ft-footprint-help.md new file mode 100644 index 00000000..02e609b0 --- /dev/null +++ b/docs/ft-footprint-help.md @@ -0,0 +1,22 @@ +``` +Infer footprints from fiberseq data + +Usage: ft footprint [OPTIONS] + +Arguments: + Indexed and aligned bam file with m6A and MSP calls + BED file with the regions to footprint. Should all contain the same motif with proper strand information, and ideally be ChIP-seq peaks + yaml describing the modules of the footprint + +Options: + -o, --out Output bam [default: -] + -h, --help Print help + -V, --version Print version + +Global-Options: + -t, --threads Threads [default: 8] + +Debug-Options: + -v, --verbose... Logging level [-v: Info, -vv: Debug, -vvv: Trace] + --quiet Turn off all logging +``` diff --git a/docs/ft-predict-m6a-help.md b/docs/ft-predict-m6a-help.md index d3235337..4169ae50 100644 --- a/docs/ft-predict-m6a-help.md +++ b/docs/ft-predict-m6a-help.md @@ -1,6 +1,5 @@ ``` -Predict m6A positions using HiFi kinetics data and encode the results in the MM and ML bam tags. 
Also adds nucleosome (nl, -ns) and MTase sensitive patches (al, as) +Predict m6A positions using HiFi kinetics data and encode the results in the MM and ML bam tags. Also adds nucleosome (nl, ns) and MTase sensitive patches (al, as) Usage: ft predict-m6a [OPTIONS] [BAM] [OUT] diff --git a/docs/ft-strip-basemods-help.md b/docs/ft-strip-basemods-help.md index 7a8a0432..f3a28eab 100644 --- a/docs/ft-strip-basemods-help.md +++ b/docs/ft-strip-basemods-help.md @@ -8,8 +8,7 @@ Arguments: [OUT] Output bam file [default: -] Options: - -b, --basemod base modification to strip out of the bam file [default: m6A] [possible values: m6A, 6mA, 5mC, - CpG] + -b, --basemod base modification to strip out of the bam file [default: m6A] [possible values: m6A, 6mA, 5mC, CpG] -h, --help Print help -V, --version Print version diff --git a/docs/make_help_docs.sh b/docs/make_help_docs.sh index ca4ad125..d109d6bd 100755 --- a/docs/make_help_docs.sh +++ b/docs/make_help_docs.sh @@ -4,7 +4,7 @@ echo $LIBTORCH export LD_LIBRARY_PATH=${LIBTORCH}/lib:$LD_LIBRARY_PATH export DYLD_LIBRARY_PATH=${LIBTORCH}/lib:$LD_LIBRARY_PATH -for subcommand in "" "extract" "center" "predict-m6a" "fire" "add-nucleosomes" "clear-kinetics" "strip-basemods" "track-decorators"; do +for subcommand in "" "extract" "center" "predict-m6a" "fire" "add-nucleosomes" "footprint" "clear-kinetics" "strip-basemods" "track-decorators"; do echo $subcommand out="docs/ft-${subcommand}-help.md" echo '```' >$out diff --git a/src/cli.rs b/src/cli.rs index e55b8d24..8b6d792a 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -78,46 +78,27 @@ pub enum Commands { AddNucleosomes(AddNucleosomeOptions), /// Add FIREs (Fiber-seq Inferred Regulatory Elements) to a bam file with m6a predictions Fire(FireOptions), - /// Add footprints to a bam file with m6a predictions - Footprint(FootprintOptions), /// Extract fiberseq data into plain text files. + /// /// See https://fiberseq.github.io/fibertools-rs/docs/extract.html for a description of the outputs. 
#[clap(visible_aliases = &["ex", "e"])] Extract(ExtractOptions), /// This command centers fiberseq data around given reference positions. This is useful for making aggregate m6A and CpG observations, as well as visualization of SVs. + /// /// See https://fiberseq.github.io/fibertools-rs/docs/center.html for a description of the output. #[clap(visible_aliases = &["c", "ct"])] Center(CenterOptions), + /// Infer footprints from fiberseq data + Footprint(FootprintOptions), /// Make decorated bed files for fiberseq data TrackDecorators(DecoratorOptions), /// Remove HiFi kinetics tags from the input bam file - ClearKinetics { - /// Bam HiFi file with kinetics - #[clap(default_value = "-")] - bam: String, - /// Output bam file without hifi kinetics - #[clap(default_value = "-")] - out: String, - }, + ClearKinetics(ClearKineticsOptions), /// Strip out select base modifications - StripBasemods { - /// Bam HiFi file with base mods - #[clap(default_value = "-")] - bam: String, - /// Output bam file - #[clap(default_value = "-")] - out: String, - #[clap(short, long, default_value = "m6A", value_parser(["m6A","6mA", "5mC","CpG"]))] - /// base modification to strip out of the bam file - basemod: String, - }, + StripBasemods(StripBasemodsOptions), /// Make command line completions #[clap(hide = true)] - Completions { - /// If provided, outputs the completion file for given shell - #[arg(value_enum)] - shell: Shell, - }, + Completions(CompletionOptions), /// Make a man page for fibertools-rs /// /// Writes file for `man` to stdout. 
@@ -377,3 +358,33 @@ pub struct ExtractOptions { #[clap(short, long, help_heading = "All-Format-Options")] pub simplify: bool, } + +#[derive(Args, Debug, PartialEq, Eq)] +pub struct ClearKineticsOptions { + /// Bam HiFi file with kinetics + #[clap(default_value = "-")] + pub bam: String, + /// Output bam file without hifi kinetics + #[clap(default_value = "-")] + pub out: String, +} + +#[derive(Args, Debug, PartialEq, Eq)] +pub struct StripBasemodsOptions { + /// Bam HiFi file with base mods + #[clap(default_value = "-")] + pub bam: String, + /// Output bam file + #[clap(default_value = "-")] + pub out: String, + #[clap(short, long, default_value = "m6A", value_parser(["m6A","6mA", "5mC","CpG"]))] + /// base modification to strip out of the bam file + pub basemod: String, +} + +#[derive(Args, Debug, PartialEq, Eq)] +pub struct CompletionOptions { + /// If provided, outputs the completion file for given shell + #[arg(value_enum)] + pub shell: Shell, +} diff --git a/src/main.rs b/src/main.rs index 51f91c21..51c2377e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -73,15 +73,19 @@ pub fn main() -> Result<(), Error> { let mut out = bam_writer(&predict_m6a_opts.out, &bam, args.threads); predict_m6a::read_bam_into_fiberdata(&mut bam, &mut out, predict_m6a_opts); } - Some(Commands::ClearKinetics { bam, out }) => { - let mut bam = bam_reader(bam, args.threads); - let mut out = bam_writer(out, &bam, args.threads); + Some(Commands::ClearKinetics(clear_kinetics_opts)) => { + let mut bam = bam_reader(&clear_kinetics_opts.bam, args.threads); + let mut out = bam_writer(&clear_kinetics_opts.out, &bam, args.threads); fibertools_rs::clear_kinetics(&mut bam, &mut out); } - Some(Commands::StripBasemods { bam, out, basemod }) => { - let mut bam = bam_reader(bam, args.threads); - let mut out = bam_writer(out, &bam, args.threads); - fibertools_rs::strip_basemods::strip_base_mods(&mut bam, &mut out, basemod); + Some(Commands::StripBasemods(strip_basemods_opts)) => { + let mut bam = 
bam_reader(&strip_basemods_opts.bam, args.threads); + let mut out = bam_writer(&strip_basemods_opts.out, &bam, args.threads); + fibertools_rs::strip_basemods::strip_base_mods( + &mut bam, + &mut out, + &strip_basemods_opts.basemod, + ); } Some(Commands::AddNucleosomes(nuc_opts)) => { add_nucleosomes_to_bam(nuc_opts, args.threads); @@ -95,9 +99,12 @@ pub fn main() -> Result<(), Error> { Some(Commands::TrackDecorators(decorator_opts)) => { fibertools_rs::decorator::get_decorators_from_bam(decorator_opts)?; } - Some(Commands::Completions { shell }) => { - log::info!("Generating completion file for {:?}...", shell); - cli::print_completions(*shell, &mut cli::make_cli_app()); + Some(Commands::Completions(completion_opts)) => { + log::info!( + "Generating completion file for {:?}...", + completion_opts.shell + ); + cli::print_completions(completion_opts.shell, &mut cli::make_cli_app()); } Some(Commands::Man {}) => { let man = clap_mangen::Man::new(cli::make_cli_app()); From a5ffe0b88e96daa0d5b8764f373df3ab42930ab1 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:31:13 -0700 Subject: [PATCH 02/13] feat: warn on reads with hardclips --- src/bio_io/mod.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/bio_io/mod.rs b/src/bio_io/mod.rs index 43516a3e..891f602d 100644 --- a/src/bio_io/mod.rs +++ b/src/bio_io/mod.rs @@ -221,6 +221,13 @@ impl<'a> Iterator for BamChunk<'a> { let mut cur_vec = vec![]; for r in self.bam.by_ref().take(self.chunk_size) { let r = r.unwrap(); + if r.cigar().leading_hardclips() > 0 || r.cigar().trailing_hardclips() > 0 { + log::warn!( + "Skipping read ({}) because it has been hard clipped. 
This read will be excluded from calculations and any output.", + String::from_utf8_lossy(r.qname()) + ); + continue; + } cur_vec.push(r); } From 44b4b340f1055d9631841989ddf3fa1973a775c9 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:37:50 -0700 Subject: [PATCH 03/13] chore: cleaning --- .github/ISSUE_TEMPLATE/enhancement.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/enhancement.md diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md new file mode 100644 index 00000000..d73a294b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement.md @@ -0,0 +1,12 @@ +--- +name: Enhancement request +about: "Use this template to file enhancement requests for fibertools-rs. " +title: "" +labels: enhancement +assignees: "" +--- + +### Thanks for using `fibertools-rs`! To help with enhancement requests I need all the following items: + +- A detailed description of the requested change to current behavior or development of a new command. +- A short description of your scientific use case for the enhancement. From b60575ff39506d2eaa05bbe82025b7a8b7c108c8 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:41:34 -0700 Subject: [PATCH 04/13] chore: cleaning --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5c096656..f5834d70 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ mamba install -c conda-forge -c bioconda fibertools-rs ``` -However, due to size constraints in `bioconda` this version does not support m6a prediction or GPU acceleration. If you would like to use m6A prediction and GPU acceleration, you will need to install using the directions in the [INSTALL.md](/INSTALL.md) file. +However, due to size constraints in `bioconda` this version does not contain the pytorch libraries or GPU acceleration for m6A predictions. 
m6A predictions will still work in the bioconda version but may be much slower. If you would like to use m6A prediction and GPU acceleration, you will need to install using the directions in the [INSTALL.md](/INSTALL.md) file. # Usage From 7c69eb9c92b56e9ab06cb9b1aee7df9ff85ebf4d Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:42:20 -0700 Subject: [PATCH 05/13] chore: cleaning --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index f5834d70..7d585992 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,9 @@ Extracts Fiber-seq data from a bam file into plain text. [Help page for extract] Center Fiber-seq reads (bam) around reference position(s). [Help page for center](/docs/center.md). ![Center](/assets/img/center.png) +## `ft footprint` +Footprint Fiber-seq reads (bam) around reference motifs(s). [Help page for footprint](/docs/footprint.md). + # Python API (`pyft`) The python API is still in development and not stable; however, you can find the current code progress in the [py-ft](/py-ft) folder. More information available at [readthedocs](https://py-ft.readthedocs.io/en/latest/). From 6e52fbc8392a33e3f4ae0cc0d13461ddac4a9833 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:42:48 -0700 Subject: [PATCH 06/13] chore: cleaning --- README.md | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/README.md b/README.md index 7d585992..8233e242 100644 --- a/README.md +++ b/README.md @@ -64,19 +64,5 @@ The python API is still in development and not stable; however, you can find the **Jha, A.**, **Bohaczuk, S. C.**, Mao, Y., Ranchalis, J., Mallory, B. J., Min, A. T., Hamm, M. O., Swanson, E., Finkbeiner, C., Li, T., Whittington, D., Stergachis, A. B., & **Vollger, M. R.** (2023). DNA-m6A calling and integrated long-read epigenetic and genetic analysis with **fibertools**. _bioRxiv_. 
https://doi.org/10.1101/2023.04.20.537673 -# TODO items - -- [ ] Use new iterator for `ft extract` and group writes to try and improve the speed -- [ ] long format extract command -- [ ] Improve progress bar for predict-m6a. - - [ ] Get size of bam, say how far we are through the bam in terms of MB/GB? -- [x] Add a python API (see py-ft for progress) - - [ ] add default data viz - - [ ] add conversion to pandas data frame or maybe anndata -- [x] GPU support - - [ ] see if I can simplify or statically link PyTorch to get it onto bioconda - - [ ] Detect GPU memory to set batch size dynamically. -- [ ] Add unaligned, secondary, supplemental reads to the test bam. - # Contributing If you would like to contribute to `fibertools-rs`, please see the [CONTRIBUTING.md](/CONTRIBUTING.md) file for more information. From fd4eacf8b21c74bad598c8b87c50b99868f728e2 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:43:31 -0700 Subject: [PATCH 07/13] chore: cleaning --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 8233e242..3c510955 100644 --- a/README.md +++ b/README.md @@ -46,12 +46,10 @@ Add nucleosomes to a bam that file already contains m6a predictions. Note, this ## `ft extract` Extracts Fiber-seq data from a bam file into plain text. [Help page for extract](/docs/extract.md). -![Extract](/assets/img/ft-extract-all.png) ## `ft center` Center Fiber-seq reads (bam) around reference position(s). [Help page for center](/docs/center.md). -![Center](/assets/img/center.png) ## `ft footprint` Footprint Fiber-seq reads (bam) around reference motifs(s). [Help page for footprint](/docs/footprint.md). 
From cb28ad47e836413474a2ba5c8bb5e7f1ae7d0c05 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Wed, 20 Mar 2024 20:45:15 -0700 Subject: [PATCH 08/13] chore: cleaning --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 3c510955..02eff98b 100644 --- a/README.md +++ b/README.md @@ -33,25 +33,25 @@ ft --help [Help page for fibertools](/docs/ft--help.md) -# Subcommands for `fibertools-rs` +# Highlighted subcommands for `fibertools-rs` -## `ft predict-m6a` +### `ft predict-m6a` Predict m6A positions using HiFi kinetics data and encode the results in the MM and ML bam tags. [Help page for predict-m6a](/docs/ft-predict-m6a-help.md). -## `ft add-nucleosomes` +### `ft add-nucleosomes` Add nucleosomes to a bam that file already contains m6a predictions. Note, this process is also run in the background during `predict-m6a`, so it is unnecessary to run independently unless you want to try new parameters for nucleosome calling. [Help page for add-nucleosomes](/docs/ft-add-nucleosomes-help.md). -## `ft extract` +### `ft extract` Extracts Fiber-seq data from a bam file into plain text. [Help page for extract](/docs/extract.md). -## `ft center` +### `ft center` Center Fiber-seq reads (bam) around reference position(s). [Help page for center](/docs/center.md). -## `ft footprint` +### `ft footprint` Footprint Fiber-seq reads (bam) around reference motifs(s). [Help page for footprint](/docs/footprint.md). 
# Python API (`pyft`) From 4d17531c3310a7e71619a288a6633b898187a857 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Thu, 21 Mar 2024 11:41:15 -0700 Subject: [PATCH 09/13] fix: file not found --- .cargo/config.toml | 2 +- Cargo.toml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 8acf31ed..1a94b2a9 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -8,7 +8,7 @@ rustflags = [ [target.x86_64-unknown-linux-gnu] rustflags = [ "-C", "link-arg=-Wl,--allow-shlib-undefined", - "-C", "link-arg=-fno-lto", + #"-C", "link-arg=-fno-lto", "-C", "target-cpu=native", ] diff --git a/Cargo.toml b/Cargo.toml index a1680833..95bf2e99 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,7 @@ burn = ["dep:burn"] # generated by 'cargo dist init' [profile.dist] inherits = "release" -split-debuginfo = "packed" +#split-debuginfo = "packed" # generated by 'cargo wizard' [profile.dev] @@ -82,9 +82,9 @@ codegen-units = 256 incremental = true [profile.release] -codegen-units = 1 +#codegen-units = 1 debug = false -lto = true -panic = "abort" +#lto = true +#panic = "abort" From 56c83f7f6f93b6598de9feee8054a0096a27a0c4 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Thu, 21 Mar 2024 11:52:13 -0700 Subject: [PATCH 10/13] fix: file not found --- .cargo/config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 1a94b2a9..fdd326f5 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -9,7 +9,7 @@ rustflags = [ rustflags = [ "-C", "link-arg=-Wl,--allow-shlib-undefined", #"-C", "link-arg=-fno-lto", - "-C", "target-cpu=native", + #"-C", "target-cpu=native", ] # this breaks my build on my mac. 
I don't know why From f41f6b827e6724717c7d0f0e414f427fc7003592 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Thu, 21 Mar 2024 11:55:54 -0700 Subject: [PATCH 11/13] fix: file not found --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 95bf2e99..6ad6f5ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -82,6 +82,7 @@ codegen-units = 256 incremental = true [profile.release] +strip="debuginfo" #codegen-units = 1 debug = false #lto = true From ebcfe1e35bc1dc5e304a0d6e2366e15b14ae9b14 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Thu, 21 Mar 2024 12:08:47 -0700 Subject: [PATCH 12/13] fix: file not found --- .cargo/config.toml | 4 ++-- Cargo.toml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index fdd326f5..8acf31ed 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -8,8 +8,8 @@ rustflags = [ [target.x86_64-unknown-linux-gnu] rustflags = [ "-C", "link-arg=-Wl,--allow-shlib-undefined", - #"-C", "link-arg=-fno-lto", - #"-C", "target-cpu=native", + "-C", "link-arg=-fno-lto", + "-C", "target-cpu=native", ] # this breaks my build on my mac. I don't know why diff --git a/Cargo.toml b/Cargo.toml index 6ad6f5ea..9dac8679 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,9 +83,9 @@ incremental = true [profile.release] strip="debuginfo" -#codegen-units = 1 +codegen-units = 1 debug = false -#lto = true -#panic = "abort" +lto = true +panic = "abort" From 03196208ce6ac2b6f3ee4b4fbbbacc5e0bb0cb57 Mon Sep 17 00:00:00 2001 From: Mitchell Robert Vollger Date: Thu, 21 Mar 2024 13:24:53 -0700 Subject: [PATCH 13/13] fix: embed stats and make record type bincode to try and avoid filenotfound --- build.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build.rs b/build.rs index 661c3369..45bd0a6a 100644 --- a/build.rs +++ b/build.rs @@ -15,6 +15,7 @@ fn main() { // Generate the model code and state file from the ONNX file. 
use burn_import::onnx::ModelGen; + use burn_import::onnx::RecordType; for x in &[ "src/m6a_burn/two_zero.onnx", "src/m6a_burn/two_two.onnx", @@ -24,7 +25,8 @@ fn main() { ModelGen::new() .input(x) // Path to the ONNX model .out_dir("m6a_burn/") // Directory for the generated Rust source file (under target/) - //.embed_states(true) + .record_type(RecordType::Bincode) + .embed_states(true) .run_from_script(); } }