Extracting read counts of milRNA gene loci

First, we needed to assing read counts to milRNA loci in Blumeria graminis f.sp. hordei and Hordeum vulgare. We used featureCounts for this, as before.

# Creating table of read counts for B. graminis f.sp. hordei
featureCounts -T 2 -p -M --fraction -t nc_RNA -g ID -a Bgh_milRNA_v1.gff -o counts.bgh-milRNA.txt RSB15248-1.bgh_aligned.bam RSB15248-2.bgh_aligned.bam RSB15248-3.bgh_aligned.bam RSB15248-4.bgh_aligned.bam RSB15248-5.bgh_aligned.bam RSB15248-6.bgh_aligned.bam RSB15248-7.bgh_aligned.bam RSB15248-8.bgh_aligned.bam RSB15248-9.bgh_aligned.bam RSB15248-10.bgh_aligned.bam RSB15248-11.bgh_aligned.bam RSB15248-12.bgh_aligned.bam RSB15248-13.bgh_aligned.bam RSB15248-14.bgh_aligned.bam RSB15248-15.bgh_aligned.bam RSB15248-16.bgh_aligned.bam RSB15248-17.bgh_aligned.bam RSB15248-18.bgh_aligned.bam
# Creating table of read counts for H. vulgare
featureCounts -T 8 -p -M --fraction -t nc_RNA -g ID -a Hvu_milRNA_v1.gff -o counts.hvu-milRNA.txt RSB15248-1_merged.hvu_aligned.bam RSB15248-2_merged.hvu_aligned.bam RSB15248-3_merged.hvu_aligned.bam RSB15248-4_merged.hvu_aligned.bam RSB15248-5_merged.hvu_aligned.bam RSB15248-6_merged.hvu_aligned.bam RSB15248-7_merged.hvu_aligned.bam RSB15248-8_merged.hvu_aligned.bam RSB15248-9_merged.hvu_aligned.bam RSB15248-10_merged.hvu_aligned.bam RSB15248-11_merged.hvu_aligned.bam RSB15248-12_merged.hvu_aligned.bam RSB15248-13_merged.hvu_aligned.bam RSB15248-14_merged.hvu_aligned.bam RSB15248-15_merged.hvu_aligned.bam RSB15248-16_merged.hvu_aligned.bam RSB15248-17_merged.hvu_aligned.bam RSB15248-18_merged.hvu_aligned.bam

Parsing read count table and filtering

We first created read count and TPM tables for downstream analysis in R. The codes are shown for Blumeria graminis f.sp. hordei, and were applied identically to Hordeum vulgare.

counts <- read.table("counts.bgh-milRNA.txt", header=TRUE, sep=" ")
names(counts) <- c("geneID","length",
                "RSB15248-1", "RSB15248-2", "RSB15248-3", "RSB15248-4", "RSB15248-5", "RSB15248-6",
                "RSB15248-7", "RSB15248-8", "RSB15248-9", "RSB15248-10", "RSB15248-11", "RSB15248-12",
                "RSB15248-13", "RSB15248-14", "RSB15248-15", "RSB15248-16", "RSB15248-17", "RSB15248-18")
# Function TPM calculation
tpm <- function(counts, lengths) {
  rate <- counts / lengths
  rate / sum(rate) * 1e6
}
# Parse data
genes <- data.frame(counts[1:2])
counts <- counts
countsTable <- as.matrix(counts[,3:20])
rownames(countsTable) <- counts[,1]
# Calculate TPM
tpm <- apply(countsTable, 2, function(x) tpm(x, genes$length))
rownames(tpm) <- counts[,1]
# Checking that codes ran correctly
colSums(tpm) #should be equal
colMeans(tpm) #should be equal

We also calculated average and median TPM values for each condition, i.e., Myc, Epi, Hau, P40, EVplus, and EVminus.

# Calculations of averages
Epi <- rowMeans(data.frame(tpm[,1],tpm[,2],tpm[,3]), na.rm = FALSE, dims = 1)
Hau <- rowMeans(data.frame(tpm[,4],tpm[,5],tpm[,6]), na.rm = FALSE, dims = 1)
P40 <- rowMeans(data.frame(tpm[,7],tpm[,8],tpm[,9]), na.rm = FALSE, dims = 1)
EVplus <- rowMeans(data.frame(tpm[,10],tpm[,11],tpm[,15]), na.rm = FALSE, dims = 1)
EVminus <- rowMeans(data.frame(tpm[,12],tpm[,13],tpm[,14]), na.rm = FALSE, dims = 1)
Myc <- rowMeans(data.frame(tpm[,16],tpm[,17],tpm[,18]), na.rm = FALSE, dims = 1)
# Combine
tpm.avg <- data.frame(Myc,Epi,Hau,P40,EVplus,EVminus)
# Calculations of medians
Epi <- apply(data.frame(tpm[,1],tpm[,2],tpm[,3]), 1, median)
Hau <- apply(data.frame(tpm[,4],tpm[,5],tpm[,6]), 1, median)
P40 <- apply(data.frame(tpm[,7],tpm[,8],tpm[,9]), 1, median)
EVplus <- apply(data.frame(tpm[,10],tpm[,11],tpm[,15]), 1, median)
EVminus <- apply(data.frame(tpm[,12],tpm[,13],tpm[,14]), 1, median)
Myc <- apply(data.frame(tpm[,16],tpm[,17],tpm[,18]), 1, median)
# Combine
tpm.median <- data.frame(Myc,Epi,Hau,P40,EVplus,EVminus)
head(tpm.median)

Filtering can be done using a TPM cut-off value of TPM < 1.

# Removing non-expressors using median values
maxv <- apply(tpm.median[,1:6], 1, max)
minv <- apply(tpm.median[,1:6], 1, min)

med.maxv <- data.frame(tpm.median[1:6], minv, maxv)
cond <- med.maxv$maxv < 1
tpm.filter1 <- med.maxv[!cond,]
tpm.filter1 <- data.frame(rownames(tpm.filter1), tpm.filter1)
names(tpm.filter1) <- c("geneID","Myc","Epi","Hau","P40","EVplus","EVminus", "minv", "maxv")

Clustering by Nonmetric dimensional scaling (NMDS)

Non-metric Multi-dimensional Scaling (NMDS) condenses information from multidimensional data into a 2-dimensional representation or ordination. In this ordination, the closer two points are, the more similar the corresponding samples are with respect to the variables that went into calculating the NMDS plot. NMDS attempts to represent the pairwise dissimilarity between objects in a low-dimensional space. Codes here are shown for Blumeria graminis f.sp. hordei and were run the same way for Hordeum vulgare.

Loading required packages.

require(vegan)
require(ggplot2)

Importing data for analysis.

# sample info
info <- read.table("sample_info.tsv", header = T, row.names = 1, sep = "\t")
groups <- info$Group
# import data
counts <- read.table("counts.bgh-milRNA.txt", header=TRUE, sep=" ")
names(counts) <- c("geneID","length",
                "RSB15248-1", "RSB15248-2", "RSB15248-3", "RSB15248-4", "RSB15248-5", "RSB15248-6",
                "RSB15248-7", "RSB15248-8", "RSB15248-9", "RSB15248-10", "RSB15248-11", "RSB15248-12",
                "RSB15248-13", "RSB15248-14", "RSB15248-15", "RSB15248-16", "RSB15248-17", "RSB15248-18")
# transpose and convert to matrix
tcounts <- t(counts)
tcounts <- as.matrix(tcounts)

Running NMDS test. A stress value < 0.05 indicates that the model is a good representation of the data

# NMDS and plotting with base R.
nmds <- metaMDS(tcounts, distance = "bray", k = 2, try = 20, trymax = 20)
## Stress:     0.03085808

Plotting the data with ggplot.

# before plotting, Group column was added to change the colors of each sample accordingly.
data.scores = as.data.frame(scores(nmds))
data.scores$Group = info$Group
colors <- c("EPI" = "#99d8c9", "HAU" = "#9ecae1", "P40" = "#df65b0", "EVN" = "#969696", "EVY" = "#fc8d59", "MYC" = "#4292c6")
# manually set color according to python code
plot = ggplot(data.scores, aes(x = NMDS1, y = NMDS2,)) +
  geom_point(aes(color = Group)) +
  scale_color_manual(values = colors) +
  theme_classic()

Importantly, the distinctions are only observations. For finding significance of dissimilarities between samples, ANOSIM was calculated. In addition to the p value, the R value represents the ratio of the between-group variations to the within-group variations, and a high R means the between-group variation is very high relative to the within-group variation.

# convert tcounts matrix to data frame.
tcounts_new = as.data.frame(tcounts, col.names = T)
# group info from info file was added.
tcounts_new$Group = info$Group
# ANOSIM test
ano = anosim(tcounts, tcounts_new$Group, distance = "bray", permutations = 9999)
## ANOSIM statistic R: 0.6 Significance: 1e-04

Clustering by principal component analysis (PCA)

We ran principal component analysis (PCA) using python.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug  3 08:47:27 2021

@author: mansisingh
"""
import pandas as pd
import matplotlib
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt 

pio.renderers.default = "png"

df = pd.read_csv("counts.bgh-milRNA.txt.csv",)

genes = df['Genes'].values
newdf = df.drop(columns=['Genes'])
indexlist = newdf.columns.tolist()
numbers = newdf.values

data = pd.DataFrame(data=numbers,columns=indexlist, index=genes)

#########################
#
# Perform PCA on the data
#
#########################
# First center and scale the data
#scaled_data = preprocessing.scale(data.T)
scaled_data = scaler = preprocessing.StandardScaler().fit_transform(data.T) #same approach but this is unsupervised ML code.
 
pca = PCA() # create a PCA object
pca.fit(scaled_data) # do the math
pca_data = pca.transform(scaled_data) # get PCA coordinates for scaled_data
 
#########################
#
# Draw a scree plot and a PCA plot
#
#########################
 
#The following code constructs the Scree plot
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
#plt.figure(figsize=(10,5))
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label=labels, width=0.8, align='center')
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
#plt.xticks(fontsize=10)
plt.title('Scree Plot')
plt.savefig('ScreeBgh_raw_ME.png')
plt.show()

#the following code makes a fancy looking plot using PC1 and PC2
pca_df = pd.DataFrame(pca_data, index=indexlist, columns=labels)

#https://colorbrewer2.org/#type=diverging&scheme=RdYlBu&n=6
#https://matplotlib.org/stable/gallery/color/named_colors.html

colors = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928', '#8dd3c7','#ffffb3','#bebada','#fb8072','#80b1d3','#fdb462'] #change colors here #give same colors by repeating the same number three times.
 
plt.scatter(pca_df.PC1, pca_df.PC2, c=colors)
plt.grid()
plt.title('Principal Component Analysis')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
patches = [] 
i = 0
for sample in pca_df.index:
    patches.append(matplotlib.patches.Patch(color=colors[i],label=sample))
    i += 1
plt.legend(handles=patches, loc=(1.02,0.001))
plt.savefig('PCABgh_raw_ME.png', bbox_inches='tight')
plt.show()
 
#########################
#
# Determine which genes had the biggest influence on PC1
#
#########################
 
## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
## first, get the loading scores
loading_scores = pd.Series(pca.components_[0], index=genes)
## now sort the loading scores based on their magnitude
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
 
# get the names of the top 10 genes
top_10_genes = sorted_loading_scores[0:10].index.values
 
## print the gene names and their scores (and +/- sign)
print(loading_scores[top_10_genes])

Hierarchical clustering by PCC

As further supportive evidence of clustering patterns, we used Pearson correlation-based clustering (PCC) and generated distance trees of the samples.

# Calculate distances amoung samples and eliminates samples that far away with other samples
### t(tpm) transposes columns and rows 
### dist will calculate the distance between each row 
tpm <- t(tpm)
# Calculating the distances
sampleTree = hclust(dist(tpm), method = "average")
# Plot the sample tree
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5, 
     cex.axis = 1.5, cex.main = 2)

For determining sample outliers:

# Plot a line to show the cut
### h: the y-values for horizontal lines.
abline(h = 15000, col = "red"); 
# Determine cluster under the line
### cutHeight: height at which branches are to be cut.
### minSize: minimum number of object on a branch to be considered a cluster.
clust = cutreeStatic(sampleTree, cutHeight = 15000, minSize = 10)
table(clust)
View(clust) # to view the clusters

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

05.NMDS, PCA, and PCC clustering.md

05.NMDS, PCA, and PCC clustering.md

Extracting read counts of milRNA gene loci

Parsing read count table and filtering

Clustering by Nonmetric dimensional scaling (NMDS)

Clustering by principal component analysis (PCA)

Hierarchical clustering by PCC

Files

05.NMDS, PCA, and PCC clustering.md

Latest commit

History

05.NMDS, PCA, and PCC clustering.md

File metadata and controls

Extracting read counts of milRNA gene loci

Parsing read count table and filtering

Clustering by Nonmetric dimensional scaling (NMDS)

Clustering by principal component analysis (PCA)

Hierarchical clustering by PCC