-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelperFunctions.r
158 lines (130 loc) · 6.54 KB
/
helperFunctions.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# helper functions
#if (!require(dada2)) stop("Please install dada2 according to the directions at: https://benjjneb.github.io/dada2/dada-installation.html")
require(ShortRead)
require(Biostrings)
require(RCurl)
require(phyloseq)
# from the dada2 tutorial
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
allOrients <- function(primer) {
# Create all orientations of the input sequence
require(Biostrings)
dna <- DNAString(primer) # The Biostrings works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
# check for primers
checkPrimers <- function(fwd, rev, FWD.orients, REV.orients){
print(
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fwd),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = rev),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fwd),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = rev))
)
}
# tic/toc functions from Colin Averill:
#' Two clock functions.
#' Place tic() at the line in the code where you want to start timing.
#' Place toc() at the position in the code where you want to stop timing and report.
tic <- function() {assign("timer", Sys.time(), envir=.GlobalEnv)}
toc <- function() print(Sys.time()-timer)
#normalize otu table. function from Colin Averill:
pro.function <- function(otu){
for(i in 1:ncol(otu)){
otu[,i] <- otu[,i] / sum(otu[,i])
}
return(otu)
}
#' Set truncation length
#'
#' Decides on truncation length for trimmed reads based on quality score means. Default cutoff is a score of 30.
#' Warns you if the beginning (first 10) bases are low-quality, but returns the first low-quality base after the 10th base.
#' If no bases are below score, returns the last base.
#'
#' fl : input files (prints suggested length for each; if used in script, could take the first input, or the lowest)
#' qscore : default = 30.
#' n : default = 5e+05 don't know exactly why this matters, but it's part of how 'qa' is called within the dada2 scripts...
#'
#'
get_truncation_length <-function (fl, qscore = 30, n = 5e+05, quiet = TRUE){
trunc_lengths <- data.frame(file = character(0), early_lowqual = numeric(0),
trunc_length = numeric(0))
for (f in fl) {
srqa <- ShortRead::qa(f, n = n)
df <- srqa[["perCycle"]]$quality
means <- rowsum(df$Score * df$Count, df$Cycle)/rowsum(df$Count,
df$Cycle)
lowqual <- which(means < qscore)
lowqual_in_first_20 <- length(which(lowqual <= 20))
lowqual_after_20 <- lowqual[lowqual > 20][1]
trunc_lengths <- rbind(trunc_lengths, data.frame(file = f, early_lowqual = lowqual_in_first_20,
trunc_length = ifelse(is.na(lowqual_after_20), length(means), lowqual_after_20)))
if (quiet == FALSE) {
if(lowqual_in_first_20 > 0){
cat(paste0(basename(f),': ', lowqual_in_first_20, ' bases within first 20 bases are below your quality score. Consider trimming left.\n'))
}
if (is.na(lowqual_after_20)){
cat(paste0(basename(f), ': After first 20 bases, no bases with a mean under your quality score found. Truncate at end of read, base: ', length(means),'\n'))
} else if (!is.na(lowqual_after_20)){
cat(paste0(basename(f),': After first 20 bases, first mean below your quality score is base: ',lowqual_after_20,'\n'))
#return(lowqual_after_20)
} else "Something's wrong. Inspect function/reads."
} # end printing to console
} # end loop
return(trunc_lengths$trunc_length)
} # end function
checkPrimers.wide <- function(fwd, rev, FWD.orients, REV.orients){
df <- rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fwd),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = rev),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fwd),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = rev))
df.melted <- reshape2::melt(df)
df.melted$cat <- paste(df.melted$Var1, df.melted$Var2, sep="-")
df.qa <- as.data.frame(t(df.melted[,c(3)]))
names(df.qa) <- t(df.melted[,c(4)])
rownames(df.qa) <- gsub("_R1", "", basename(fwd))
return(df.qa)
}
## "tree" command by JennyBC
myfile <- RCurl::getURL("https://gist.githubusercontent.com/jennybc/2bf1dbe6eb1f261dfe60/raw/c53fba8a861f82f90d895458e941a1056c8118f5/twee.R", ssl.verifypeer=F)
eval(parse(text = myfile))
# Create data directory
create_data_directory <- function(path = ".", amplicon = c("ITS", "16S"), ...) {
# twee vignette here: https://gist.github.com/jennybc/2bf1dbe6eb1f261dfe60
if (length(amplicon) == 1) {
cmd <- paste0("mkdir -p data/{filt_seqs/",amplicon,",raw_seqs/",amplicon,",trimmed_seqs/",amplicon,",seq_tables/",amplicon,",output_files/",amplicon,"}")
} else {
cmd <- "mkdir -p data/{filt_seqs/{ITS,16S},raw_seqs/{ITS,16S},trimmed_seqs/{ITS,16S},seq_tables/{ITS,16S},output_files/{ITS,16S}}"
}
system(cmd)
twee("data/", level = 2)
}
rbind.named.dfs <- function(df.list){
# solution from https://stackoverflow.com/questions/15162197/combine-rbind-data-frames-and-create-column-with-name-of-original-data-frames
dfs <- df.list[sapply(df.list, function(x) !is.null(dim(x)))]
all.out <- cbind.data.frame(do.call(rbind,dfs),
seqRun = rep(names(dfs), vapply(dfs, nrow, numeric(1))))
return(all.out)
}
# create sample information data.frame from NEON sample names
parseNEONsampleIDs <- function(sampleID){
df <- data.frame(siteID = substr(sampleID, 1, 4), sampleID = sampleID, stringsAsFactors = F) %>%
mutate(sample = sapply(strsplit(sampleID, "-GEN|-gen"), "[[" , 1)) %>%
mutate(sampleID = sapply(strsplit(sampleID, "-gen.fastq"), "[[" , 1)) %>%
mutate(dates = sapply(strsplit(sample, "-"), function(x) x[grep("[2]\\d\\d\\d\\d\\d\\d\\d", x)])) %>%
mutate(dates = ifelse(dates == "21040514", "20140514", dates)) %>%
mutate(asDate = as.Date(as.character(dates), "%Y%m%d")) %>%
mutate(dateID = substr(as.character(dates), 1, 6)) %>%
mutate(plotID = substr(sample, 1, 8)) %>%
mutate(site_date = substr(sample, 1, 8)) %>%
mutate(horizon = ifelse(grepl("-M-", sample), "M", "O")) %>%
mutate(without_horizon = gsub("-[M|O]-", "-", sample)) %>%
mutate(plot_date = paste0(plotID, "-", dateID)) %>%
as.data.frame()
return(df)
}