Running SemiBin2 with abundance information from strobealign-aemb
psj1997 committed Feb 26, 2024
1 parent ea5c927 commit 97f6122
Showing 24 changed files with 4,950 additions and 132 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -217,6 +217,28 @@ After mapping samples (individually) to the combined FASTA file, you can get the
SemiBin2 multi_easy_bin -i concatenated.fa -b *.sorted.bam -o output
```

## Running with abundance information from strobealign-aemb (only use this when you have five or more samples)

1. Split the contig FASTA file
```bash
python script/generate_split.py -c contig.fa -o output
```
2. Map reads with [strobealign-aemb](https://github.com/ksahlin/strobealign) to generate the abundance information (a sketch of the expected output format follows this list)
```bash
strobealign --aemb output/split.fa read1_1.fq read1_2.fq -R 6 > sample1.txt
strobealign --aemb output/split.fa read2_1.fq read2_2.fq -R 6 > sample2.txt
strobealign --aemb output/split.fa read3_1.fq read3_2.fq -R 6 > sample3.txt
strobealign --aemb output/split.fa read4_1.fq read4_2.fq -R 6 > sample4.txt
strobealign --aemb output/split.fa read5_1.fq read5_2.fq -R 6 > sample5.txt
```

> **@luispedro** (Member) commented on the `sample1.txt` line, Mar 4, 2024: Instead of .txt can you use .tsv in all the examples?

3. Run SemiBin2 (just as you would when running SemiBin2 with BAM files)
```bash
SemiBin2 generate_sequence_features_single -i contig.fa -a *.txt -o output
SemiBin2 generate_sequence_features_multi -i contig.fa -a *.txt -s : -o output
SemiBin2 single_easy_bin -i contig.fa -a *.txt -o output
SemiBin2 multi_easy_bin -i contig.fa -a *.txt -s : -o output
```
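
Each per-sample file written by `strobealign --aemb` is a headerless, tab-separated table: the first column holds the split-contig name, the second its estimated abundance. As a quick check, here is a minimal sketch; the file name `sample1.txt` comes from the commands above, and the column layout is inferred from how SemiBin2 parses these files:

```python
import pandas as pd

# Load one per-sample abundance file (headerless TSV, as produced by
# strobealign --aemb in the commands above).
abun = pd.read_csv('sample1.txt', sep='\t', index_col=0, header=None)
abun.columns = ['abundance']

# Index entries are split-contig names such as '<contig>_1' and '<contig>_2',
# because the reads were mapped against output/split.fa.
print(abun.head())
```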

## Output

The output folder will contain:
48 changes: 48 additions & 0 deletions SemiBin/generate_coverage.py
@@ -172,3 +172,51 @@ def combine_cov(cov_dir : str, bam_list, is_combined : bool): # bam_list : list[
data_split_cov = None
return data_cov, data_split_cov

def generate_cov_from_abundances(abundances, output, contig_path, contig_threshold=1000, sep=None, contig_threshold_dict=None):
    import pandas as pd
    import numpy as np
    from .fasta import fasta_iter

    # Build one column of split-contig abundances per sample file
    abun_split_list = []
    for abun_file in abundances:
        abun_split_data = pd.read_csv(abun_file, sep='\t', index_col=0, header=None)
        abun_split_data.index.name = None
        abun_split_data.columns = [f'{abun_file}']

        # Keep only the split halves (<contig>_1, <contig>_2) of contigs that
        # pass the length threshold (per-sample thresholds in multi-sample mode)
        binned_contig = []
        for h, seq in fasta_iter(contig_path):
            if sep is None:
                cov_threshold = contig_threshold
            else:
                sample_name = h.split(sep)[0]
                cov_threshold = contig_threshold_dict[sample_name]

            if len(seq) >= cov_threshold:
                binned_contig.append(h + '_1')
                binned_contig.append(h + '_2')
        abun_split_data = abun_split_data.loc[binned_contig]
        # Add a small pseudo-count to avoid zero abundances
        abun_split_data = abun_split_data.apply(lambda x: x + 1e-5)
        if sep is None:
            # Normalise each sample by its mean abundance, rounded up to the nearest 100
            abun_scale = (abun_split_data.mean() / 100).apply(np.ceil) * 100
            abun_split_data = abun_split_data.div(abun_scale)
        abun_split_list.append(abun_split_data)

    abun_split = pd.concat(abun_split_list, axis=1)
    if sep is None:
        with atomic_write(os.path.join(output, 'cov_split.csv'), overwrite=True) as ofile:
            abun_split.to_csv(ofile)

        # Recover the original contig names by stripping the '_1'/'_2' suffix
        index_name = abun_split.index.tolist()
        data_index_name = []
        for i in range(len(index_name) // 2):
            data_index_name.append(index_name[2 * i][0:-2])

        # Whole-contig abundance is the mean of its two halves
        abun_split_values = abun_split.values
        abun = (abun_split_values[::2] + abun_split_values[1::2]) / 2
        columns = [f'{abun_file}' for abun_file in abundances]
        abun = pd.DataFrame(abun, index=data_index_name, columns=columns)
        with atomic_write(os.path.join(output, 'cov.csv'), overwrite=True) as ofile:
            abun.to_csv(ofile)
        return abun, abun_split
    else:
        return abun_split
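
For reference, a minimal sketch of calling the new helper directly; this is a hypothetical standalone use (within SemiBin2 it is driven by the CLI subcommands shown in the README), and the file names below follow that README example:

```python
# A minimal sketch, assuming SemiBin2 is installed and the abundance files
# from the README example are present in the working directory.
from SemiBin.generate_coverage import generate_cov_from_abundances

abundance_files = ['sample1.txt', 'sample2.txt', 'sample3.txt',
                   'sample4.txt', 'sample5.txt']

# Single-sample mode (sep=None): filter split contigs by length, add a
# pseudo-count, rescale each sample, write cov_split.csv and cov.csv into
# 'output' (the directory must already exist), and return both tables.
abun, abun_split = generate_cov_from_abundances(
    abundance_files,
    output='output',
    contig_path='contig.fa',
    contig_threshold=1000,
)
print(abun.shape, abun_split.shape)
```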