From 20e15b0caba9137c7f388e5f0c0682e3a2eba774 Mon Sep 17 00:00:00 2001 From: Jay Hesselberth Date: Tue, 14 Jan 2025 09:55:51 -0700 Subject: [PATCH] Eliminate support of FAST5 files Closes #40 --- .test/make_test_data.sh | 16 ---------------- config/README.md | 2 +- config/config-base.yml | 3 --- workflow/rules/aatrnaseq.smk | 8 +------- workflow/rules/common.smk | 16 ++-------------- 5 files changed, 4 insertions(+), 41 deletions(-) diff --git a/.test/make_test_data.sh b/.test/make_test_data.sh index 630e16b..c68ae41 100644 --- a/.test/make_test_data.sh +++ b/.test/make_test_data.sh @@ -45,22 +45,6 @@ od=sample2/pod5_fail mkdir -p $od pod5 filter $ex/rbc/JMW_510_37C/JMW_510_37C.pod5 --ids ex2_read_ids_3.txt --force-overwrite -o $od/1.pod5 -od=sample1/fast5_pass -mkdir -p $od -pod5 convert to_fast5 -f -o $od sample1/pod5_pass/*.pod5 - -od=sample1/fast5_fail -mkdir -p $od -pod5 convert to_fast5 -f -o $od sample1/pod5_fail/*.pod5 - -od=sample2/fast5_pass -mkdir -p $od -pod5 convert to_fast5 -f -o $od sample2/pod5_pass/*.pod5 - -od=sample2/fast5_fail -mkdir -p $od -pod5 convert to_fast5 -f -o $od sample2/pod5_fail/*.pod5 - # make another "sample2" dataset to test merging multiple runs # don't duplicate sample2 reads to avoid throwing an error when merging # pod5s diff --git a/config/README.md b/config/README.md index b6426c8..bc233da 100644 --- a/config/README.md +++ b/config/README.md @@ -7,7 +7,7 @@ Provide a path to a TSV file which indicates the samples to process (e.g. `confi TSV file should have two columns containing the following information. - a unique id for the sample - - a path to the sequencing run folder which has `pod5_pass`, `pod5`, `pod5_fail`, `fast5_pass`, or `fast5_fail` subdirectories containing raw data. + - a path to the sequencing run folder with `pod5_pass` and `pod5_fail` subdirectories containing raw data. The pipeline will recursively search for pod5 or fast5 files to process within the specified directory and subdirectories diff --git a/config/config-base.yml b/config/config-base.yml index ccc2014..148c3b4 100644 --- a/config/config-base.yml +++ b/config/config-base.yml @@ -4,9 +4,6 @@ # either a path to a basecalling model to use with dorado or a model selection name to specify model to download and use base_calling_model: "resources/models/rna004_130bps_sup@v5.0.0" -# either FAST5 or POD5, if FAST5 then these files will be converted to pod5 before rebasecalling -input_format: "POD5" - # path to fasta file to use for bwa alignment. # a BWA index will be built if it does not exist for this fasta file fasta: "resources/ref/sacCer3-mature-tRNAs-dual-adapt-v2.fa" diff --git a/workflow/rules/aatrnaseq.smk b/workflow/rules/aatrnaseq.smk index d8b9511..b7bd37d 100644 --- a/workflow/rules/aatrnaseq.smk +++ b/workflow/rules/aatrnaseq.smk @@ -1,7 +1,7 @@ rule merge_pods: """ - merge all fast5/pod5s into a single pod5 + merge pod5s into a single pod5 """ input: get_raw_inputs, @@ -9,15 +9,9 @@ rule merge_pods: os.path.join(rbc_outdir, "{sample}", "{sample}.pod5"), log: os.path.join(outdir, "logs", "merge_pods", "{sample}"), - params: - is_fast5=config["input_format"], shell: """ - if [ "{params.is_fast5}" == "FAST5" ]; then - pod5 convert fast5 -f --output {output} {input} - else pod5 merge -f -o {output} {input} - fi """ diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index d9a2f68..5ce67bd 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -58,27 +58,15 @@ def report_metadata(): def find_raw_inputs(sample_dict): """ - parse through directories listed in samples.tsv and identify fast5 or pod5 files to process + parse through directories listed in samples.tsv and identify pod5 files to process store input files and uuid base file names in dictionary for each sample """ POD5_DIRS = ["pod5_pass", "pod5_fail", "pod5"] - FAST5_DIRS = ["fast5_pass", "fast5_fail"] - fmt = config["input_format"] - - data_subdirs = [] - if fmt == "POD5": - data_subdirs = POD5_DIRS - ext = ".pod5" - elif fmt == "FAST5": - data_subdirs = FAST5_DIRS - ext = ".fast5" - else: - sys.exit("input_format config option must be either FAST5, or POD5") for sample, info in sample_dict.items(): raw_fls = [] for path in info["path"]: - for subdir in data_subdirs: + for subdir in POD5_DIRS: data_path = os.path.join(path, subdir, "*" + ext) fls = glob.glob(data_path) raw_fls += fls