From 20e15b0caba9137c7f388e5f0c0682e3a2eba774 Mon Sep 17 00:00:00 2001
From: Jay Hesselberth <jay.hesselberth@gmail.com>
Date: Tue, 14 Jan 2025 09:55:51 -0700
Subject: [PATCH] Eliminate support of FAST5 files

Closes #40
---
 .test/make_test_data.sh      | 16 ----------------
 config/README.md             |  2 +-
 config/config-base.yml       |  3 ---
 workflow/rules/aatrnaseq.smk |  8 +-------
 workflow/rules/common.smk    | 16 ++--------------
 5 files changed, 4 insertions(+), 41 deletions(-)

diff --git a/.test/make_test_data.sh b/.test/make_test_data.sh
index 630e16b..c68ae41 100644
--- a/.test/make_test_data.sh
+++ b/.test/make_test_data.sh
@@ -45,22 +45,6 @@ od=sample2/pod5_fail
 mkdir -p $od
 pod5 filter $ex/rbc/JMW_510_37C/JMW_510_37C.pod5 --ids ex2_read_ids_3.txt --force-overwrite -o $od/1.pod5
 
-od=sample1/fast5_pass
-mkdir -p $od
-pod5 convert to_fast5 -f -o $od sample1/pod5_pass/*.pod5
-
-od=sample1/fast5_fail
-mkdir -p $od
-pod5 convert to_fast5 -f -o $od sample1/pod5_fail/*.pod5
-
-od=sample2/fast5_pass
-mkdir -p $od
-pod5 convert to_fast5 -f -o $od sample2/pod5_pass/*.pod5
-
-od=sample2/fast5_fail
-mkdir -p $od
-pod5 convert to_fast5 -f -o $od sample2/pod5_fail/*.pod5
-
 # make another "sample2" dataset to test merging multiple runs
 # don't duplicate sample2 reads to avoid throwing an error when merging
 # pod5s
diff --git a/config/README.md b/config/README.md
index b6426c8..bc233da 100644
--- a/config/README.md
+++ b/config/README.md
@@ -7,7 +7,7 @@ Provide a path to a TSV file which indicates the samples to process (e.g. `confi
 TSV file should have two columns containing the following information.
 
   - a unique id for the sample
-  - a path to the sequencing run folder which has `pod5_pass`, `pod5`, `pod5_fail`, `fast5_pass`, or `fast5_fail` subdirectories containing raw data.
+  - a path to the sequencing run folder with `pod5_pass`, `pod5_fail`, or `pod5` subdirectories containing raw data.
 
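-The pipeline will recursively search for pod5 or fast5 files to process within the specified directory and subdirectories 
+The pipeline will recursively search for pod5 files to process within the specified directory and subdirectories.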
 
diff --git a/config/config-base.yml b/config/config-base.yml
index ccc2014..148c3b4 100644
--- a/config/config-base.yml
+++ b/config/config-base.yml
@@ -4,9 +4,6 @@
 # either a path to a basecalling model to use with dorado or a model selection name to specify model to download and use
 base_calling_model: "resources/models/rna004_130bps_sup@v5.0.0"
 
-# either FAST5 or POD5, if FAST5 then these files will be converted to pod5 before rebasecalling
-input_format: "POD5"
-
 # path to fasta file to use for bwa alignment.
 # a BWA index will be built if it does not exist for this fasta file
 fasta: "resources/ref/sacCer3-mature-tRNAs-dual-adapt-v2.fa"
diff --git a/workflow/rules/aatrnaseq.smk b/workflow/rules/aatrnaseq.smk
index d8b9511..b7bd37d 100644
--- a/workflow/rules/aatrnaseq.smk
+++ b/workflow/rules/aatrnaseq.smk
@@ -1,7 +1,7 @@
 
 rule merge_pods:
     """
-  merge all fast5/pod5s into a single pod5
+  merge pod5s into a single pod5
   """
     input:
         get_raw_inputs,
@@ -9,15 +9,9 @@ rule merge_pods:
         os.path.join(rbc_outdir, "{sample}", "{sample}.pod5"),
     log:
         os.path.join(outdir, "logs", "merge_pods", "{sample}"),
-    params:
-        is_fast5=config["input_format"],
     shell:
         """
-    if [ "{params.is_fast5}" == "FAST5" ]; then
-      pod5 convert fast5 -f --output {output} {input}
-    else
       pod5 merge -f -o {output} {input}
-    fi
     """
 
 
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index d9a2f68..5ce67bd 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -58,27 +58,15 @@ def report_metadata():
 
 def find_raw_inputs(sample_dict):
     """
-    parse through directories listed in samples.tsv and identify fast5 or pod5 files to process
+    parse through directories listed in samples.tsv and identify pod5 files to process
     store input files and uuid base file names in dictionary for each sample
     """
     POD5_DIRS = ["pod5_pass", "pod5_fail", "pod5"]
-    FAST5_DIRS = ["fast5_pass", "fast5_fail"]
-    fmt = config["input_format"]
-
-    data_subdirs = []
-    if fmt == "POD5":
-        data_subdirs = POD5_DIRS
-        ext = ".pod5"
-    elif fmt == "FAST5":
-        data_subdirs = FAST5_DIRS
-        ext = ".fast5"
-    else:
-        sys.exit("input_format config option must be either FAST5, or POD5")
 
     for sample, info in sample_dict.items():
         raw_fls = []
         for path in info["path"]:
-            for subdir in data_subdirs:
-                data_path = os.path.join(path, subdir, "*" + ext)
+            for subdir in POD5_DIRS:
+                data_path = os.path.join(path, subdir, "*.pod5")
                 fls = glob.glob(data_path)
                 raw_fls += fls