-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAprilPresentation.R
62 lines (41 loc) · 3.27 KB
/
AprilPresentation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# Aprils Presentation
# First need to download Bioconductor, which is an open source software that uses R programming language to analyze high-throughput genomic data.
source("https://bioconductor.org/biocLite.R")
biocLite("SNPRelate")
library(SNPRelate)
a
library(gdsfmt)
# Read in the VCF (Variant Call Format) file, a common file structure for single nucleotide polymorphism, or "SNP", data
OASV2rmdupvcf <- ("OASV2-rmdup-Bio381-filtered.recode.vcf")
# Reformatting to GDS considering a 2-allele only model (standard)
snpgdsVCF2GDS(OASV2rmdupvcf, "OASV2gds", method="biallelic.only")
# Open the reformatted file and name it
OASV2gds <- snpgdsOpen("OASV2gds")
# Get a summary of the dataset
snpgdsSummary("OASV2gds")
# Note: in this case individuals = a pooled population
# Speed up computation by requesting multiple threads via the num.thread command
# Use autosome.only=FALSE to account for the fact that we are working with non-human organisms (so not 23 chromosomes)
pca <- snpgdsPCA(OASV2gds, autosome.only=FALSE, num.thread=4)
# Create a vector of population IDs for being able to identify via color coding in the plot
# NOTE: Need to make sure these IDs match the order as in the original vcf file
popcode <- c("D1_7.5", "D1_7.5", "D1_7.5", "D1_7.5", "D1_8.0", "D1_8.0", "D1_8.0", "D1_8.0", "D7_7.5", "D7_7.5", "D7_7.5", "D7_7.5", "D7_8.0", "D7_8.0", "D7_8.0", "D7_8.0")
# Getting strucutre of PCA, which provides info on the PCs
str(pca)
plot(pca$eigenvect[,1],pca$eigenvect[,2], main="PCA with quality filtered SNPs", cex.main=2, xlab="Principal Component axis 1", cex.lab=1.5, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("bottomleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))
# Notice that the legend is in the way of some of the points. Can correct this by moving it:
plot(pca$eigenvect[,1],pca$eigenvect[,2], main="PCA with quality filtered SNPs #2", cex.main=2, xlab="Principal Component axis 1", cex.lab=1.3, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("topleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))
# This prunes any SNPs with an r^2 > 0.2 between adjacent SNPs
snpset <- snpgdsLDpruning(OASV2gds, autosome.only=FALSE, ld.threshold=0.2)
snpset.id <- unlist(snpset)
# Now we run PCA again on the pruned data with no LD and plot (user-defined LD threshold):
pca_noLD <- snpgdsPCA(OASV2gds, snp.id=snpset.id, autosome.only=FALSE, num.thread=4)
plot(pca_noLD$eigenvect[,1],-1*(pca_noLD$eigenvect[,2]), main="PCA with no SNPs in LD", xlab="Principal Component axis 1", cex.lab=1.5, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("topleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))
# Saving PDF of plot
pdf(file="OASV2 PCA with quality filtered SNPs.pdf", height=5, width=8)
plot(pca$eigenvect[,1],pca$eigenvect[,2], main="PCA with quality filtered SNPs #2", cex.main=2, xlab="Principal Component axis 1", cex.lab=1.3, cex.axis=1.5, ylab="Principal Component axis 2", col=as.factor(popcode), pch=20, cex=3)
legend("topleft", cex=1.3, legend=levels(as.factor(popcode)), pch=20, col=1:nlevels(as.factor(popcode)))
dev.off()