-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathassess_cleaned_read_quality.sh
68 lines (53 loc) · 2.66 KB
/
assess_cleaned_read_quality.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash
# Description: Use this script to assess the quality of cleaned (trimmed) reads. Sequence files of the forward and reverse reads must be indicated with "R1" and "R2" in the file names, respectively.
#
# Outputs: This script generates a report containing the lengths of all reads in each sequence file.
#
# Software requirements:
#
# Usage: bash assess_cleaned_read_quality.sh [full path to the directory containing the demultiplexed/raw fastq/fastq.gz files] [name of directory containing cleaned reads]
#######################
# Specify directories #
#######################
rawReadsDir=$1
projDir=$(dirname $rawReadsDir)
resultsDir=${projDir}/processing_results
mkdir -p $resultsDir
cleanDirname=$2
cleanDir=${resultsDir}/${cleanDirname}
#####################################################################################################
# Create an array of the names that are used to identify the orientation of reads in a sequence file#
#####################################################################################################
orient=(R1 R2)
####################################################################################################################
# Create a single string of the patterns that must be removed from the file name to extract the name of the sample #
####################################################################################################################
fnameSubstr=${resultsDir}/filename_substrings_to_remove.txt
substr=($(cat ${fnameSubstr}))
############################################################
# Calculate lengths of the cleaned reads and write to a file
############################################################
for f in ${cleanDir}/*_cutadapt.fastq.gz
do
fname=$(basename $f)
for o in "${orient[@]}"
do
for str in "${substr[@]}"
do
pat=${str/_${o}_*fastq/_${o}_cutadapt.fastq}
fname2=${fname/.gz/}
sample=${fname2/${pat}/}
done
done
gzcat $f | awk '{ if (NR%4==2) print length($1) }' > ${resultsDir}/${sample}_read_lengths.txt
done
########################################################
# Plot a histogram of the lengths of the cleaned reads #
########################################################
source activate /anaconda2
Rscript ~/Google_Drive/2018_ASU_Research/lizards/scripts/plot_hist_read_lengths.R $resultsDir
##############################################
# Assess the quality of the cleaning step(s) #
##############################################
Rscript ~/Google_Drive/2018_ASU_Research/lizards/scripts/calc_stats_plots_cleaned_reads.R $projDir mapped_reads_count.txt $cleanDirname
source deactivate /anaconda2