Creating repository, initial state

kjurdzinski · May 21, 2023 · c2809ba · c2809ba
commit c2809ba
Show file tree

Hide file tree

Showing 30 changed files with 1,230,876 additions and 0 deletions.
diff --git a/OTUs_16S/OTUs.txt b/OTUs_16S/OTUs.txt
diff --git a/OTUs_16S/OTUs_membership.txt b/OTUs_16S/OTUs_membership.txt
diff --git a/OTUs_16S/log.txt b/OTUs_16S/log.txt
diff --git a/SLURM_chimera_dbOTU.sh b/SLURM_chimera_dbOTU.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -l
+# SLURM batch job: runs workflow_all.sh for the 16S data inside the ASV-clustering conda env.
+#SBATCH -A naiss2023-5-110
+#SBATCH -p core
+#SBATCH -n 20
+#SBATCH --mail-type=END
+#SBATCH [email protected]
+#SBATCH -t 06:00:00
+#SBATCH -J chimera_dbOTU_clustering
+# Abort if the project directory is unavailable so the workflow never runs from another directory.
+cd /crex/proj/snic2020-6-126/projects/plankton_monitoring/P20310/ASV_reannotation || exit 1
+conda activate ASV-clustering
+bash workflow_all.sh 16S
diff --git a/Snakefile b/Snakefile
@@ -0,0 +1,27 @@
+import pandas as pd
+from snakemake.utils import min_version
+min_version("5.3.0")
+# configfile: "config.yml"
+
+rule sort_ASVs:
+    """
+    Sort ASVs by abundance.
+
+    Runs scripts/sort_ASVs_by_abundance.py on the ASV sequences, passing the
+    count table with -c and the sorted output path with -o.
+
+    The chimeras/ and temporary/ directories are created first if missing.
+    """
+    input:
+        # Items must be comma-separated: adjacent string literals would be
+        # concatenated into one (non-existent) path.
+        seqs="data/asv_seqs.fasta",
+        counts="data/asv_counts.tsv"
+    output:
+        "temporary/asv_seqs_sorted.fasta"
+    shell:
+        """
+        mkdir -p chimeras temporary
+        python3 scripts/sort_ASVs_by_abundance.py {input.seqs} \
+            -c {input.counts} -o {output}
+        """
diff --git a/chimera_ASVs_workflow.sh b/chimera_ASVs_workflow.sh
@@ -0,0 +1,17 @@
+# Chimera detection for ASVs: sort the sequences by abundance, then run
+# vsearch --uchime_denovo on the sorted file.
+#
+# All directories written to below are created up front. logs/ is included
+# because the final redirection fails if that directory does not exist.
+mkdir -p chimeras temporary logs
+
+python3 scripts/sort_ASVs_by_abundance.py data/asv_seqs.fasta \
+     -c data/asv_counts.tsv -o temporary/asv_seqs_sorted.fasta
+# --uchimealns must be given only once; alignments go to chimeras/uchimealns.txt.
+vsearch --threads 6 --dn 1.4 --mindiffs 3 \
+                --mindiv 0.8 --minh 0.28 --abskew 2.0 \
+                --chimeras chimeras/chimeras.fasta --borderline chimeras/borderline.fasta \
+                --nonchimeras chimeras/nonchimeras.fasta \
+                --uchimeout chimeras/uchimeout.txt --uchimealns chimeras/uchimealns.txt \
+                --uchime_denovo temporary/asv_seqs_sorted.fasta  \
+                >logs/log_uchime.txt 2>&1
diff --git a/chimeras_16S/borderline.fasta b/chimeras_16S/borderline.fasta
diff --git a/chimeras_16S/chimeras.fasta b/chimeras_16S/chimeras.fasta
diff --git a/chimeras_16S/nonchimeras.fasta b/chimeras_16S/nonchimeras.fasta
diff --git a/chimeras_16S/nonchimeras_clean.fasta b/chimeras_16S/nonchimeras_clean.fasta
diff --git a/chimeras_16S/uchimealns.txt b/chimeras_16S/uchimealns.txt
diff --git a/chimeras_16S/uchimeout.txt b/chimeras_16S/uchimeout.txt
diff --git a/clean.sh b/clean.sh
@@ -0,0 +1,3 @@
+# Usage: clean.sh <marker>  -- removes temporary_<marker>/, chimeras_<marker>/ and OTUs_<marker>/
+marker="${1:?usage: clean.sh <marker, e.g. 16S>}"  # refuse to run rm without an argument
+rm -r -- "temporary_${marker}/" "chimeras_${marker}/" "OTUs_${marker}/"
diff --git a/config.yml b/config.yml
diff --git a/config_old.yml b/config_old.yml
@@ -0,0 +1,87 @@
+# Update with the name of a subdirectory under data/ that contains
+# asv_seqs.fasta, asv_counts.tsv and asv_taxa.tsv
+rundir: "16S"
+# Specify a name of the run. Using different run_name you can run clustering
+# with different parameters using the same vsearch output.
+run_name: "default"
+# At what rank should the data be split prior to aligning and clustering
+split_rank: "Family"
+# Method to use for calculating representative sequences for clusters
+# select between "median", "mean" and "sum"
+rep_method: "median"
+
+chimera:
+  run_name: "chimera1"
+  # Should chimera detection and removal be performed on the input data?
+  remove_chimeras: True
+  # Select method for chimera filtering:
+  #'batchwise' = run chimera detection on dataset as a whole.
+  #'samplewise' = split input fasta into one file per sample and run chimera
+  #               detection on each sample individually.
+  method: "batchwise"
+  # Select algorithm to use, you can choose from 'uchime_denovo', 'uchime2_denovo'
+  # and 'uchime3_denovo'.
+  algorithm: "uchime_denovo"
+  # In batchwise method, require that a sequence marked as chimeric is present
+  # with its parents in at least <min_samples_shared> samples
+  min_samples_shared: 1
+  # In batchwise method, require that a sequence marked as chimeric is present
+  # with its parents in at least <min_frac_samples_shared> fraction of samples
+  min_frac_samples_shared: 0.5
+  # In samplewise method, require that a sequence is marked as chimeric in at least
+  # <min_chimeric_samples> samples in order for it to be removed from analysis. If this
+  # value is set to 0, ASVs have to be marked as chimeric in all samples
+  min_chimeric_samples: 0
+  # Chimera detection settings
+  dn: 1.4
+  mindiffs: 3
+  mindiv: 0.8
+  minh: 0.28
+
+# Specify what clustering software to use. Choose from 'swarm', 'opticlust',
+# 'dbotu3' and 'lulu'.
+software:
+  # - "swarm"
+  # - "opticlust"
+  # - "lulu"
+  - "dbotu3"
+evaluation_rank: "Species"
+
+# Tool-specific settings
+vsearch:
+  threads: 10
+  id: 0.84
+  iddef: "1"
+  query_cov: 0.9
+
+opticlust:
+  # For opticlust, choose whether pairwise alignments should be generated with
+  # 'vsearch' or 'mothur'
+  aligner: "vsearch"
+  delta: 0.0001
+  cutoff: 0.03
+  initialize: "singleton"
+  precision: 1000
+  threads: 10
+
+swarm:
+  differences: 1
+  no-otu-breaking: False
+  fastidious: True
+  boundary: 3
+  threads: 10
+  match-reward: 5
+  mismatch-penalty: 4
+  gap-opening-penalty: 12
+  gap-extension-penalty: 4
+
+dbotu3:
+  dist: 0.1
+  abund: 10.0
+  pval: 0.0005
+
+lulu:
+  minimum_ratio_type: "min"
+  minimum_ratio: 1
+  minimum_match: 84
+  minimum_relative_cooccurence: 0.95
diff --git a/data_16S/asv_counts.tsv b/data_16S/asv_counts.tsv