-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlncRNAs_Annotation
162 lines (123 loc) · 6.14 KB
/
lncRNAs_Annotation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
library(tidyverse)
library(GenomicRanges)
library(biomaRt)
ensembl <- rtracklayer::import("Homo_sapiens.GRCh38.109.gtf")
seqlevels(ensembl) <- paste0("chr",seqlevels(ensembl))
lncbook <- rtracklayer::import("lncRNA_LncBookv2.0_GRCh38.gtf")
var <- read_csv("variation_LncBook2.0.csv")
cons <- read_csv("conservation_LncBook2.0.csv")
meth <- read_csv("methylation_LncBook2.0.csv")
mirna <- read_csv("lncrna_mirna_miRandaAndTargetScanAndRNAhybrid_LncBook2.0.csv",col_names = F)
lncbook_gencode <- rtracklayer::import("LncBookv2.0_GENCODEv34_GRCh38.gtf")
lncbook_gene <- lncbook[lncbook$type=="gene"]
lncbook_gencode_gene <- lncbook_gencode[lncbook_gencode$type=="gene" & lncbook_gencode$gene_type=="lncRNA"]
ensembl_pc <- ensembl[ensembl$gene_biotype=="protein_coding"]
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
convertNamesAndAnnotate <- function(x,exper="Doxo")
{
k <- suppressWarnings(lncbook_gencode_gene[subjectHits(GenomicRanges::findOverlaps(ensembl[ensembl$gene_id==x & ensembl$type=="gene"],lncbook_gencode_gene,type="equal"))])
if(length(k)==0)
{
k <- suppressWarnings(lncbook_gencode_gene[subjectHits(GenomicRanges::findOverlaps(ensembl[ensembl$gene_id==x & ensembl$type=="gene"],lncbook_gencode_gene))])
}
if(length(unique(k$gene_id))>1)
{
k <- suppressWarnings(lncbook_gencode_gene[nearest(ensembl[ensembl$gene_id==x & ensembl$type=="gene"],lncbook_gencode_gene)])
}
if(length(k)==0)
{
pc_near <- ensembl_pc[nearest(ensembl[ensembl$gene_id==x & ensembl$type=="gene"],ensembl_pc)]$gene_name
return(c(exper,x,NA, NA, NA,NA,NA,pc_near))
}else{
y <- unique(k$gene_id)
class <- k$gene_classification
pc_near <- ensembl_pc[nearest(ensembl[ensembl$gene_id==x & ensembl$type=="gene"],ensembl_pc)]$gene_name
this_cons <- cons %>% filter(`Gene ID`==y & Assembly=="mm39") %>% pull(Conservation)
this_mirna <- paste(unique(mirna %>% filter(X1==y) %>% pull(X11)),collapse=",")
this_GWAS <- paste(unique(var %>% filter(`Gene ID`==y) %>% pull(`GWAS Trait`)),collapse=",")
return(c(exper,x,y,class,this_cons,this_mirna,this_GWAS,pc_near))
}
}
make_annotations <- function(data,exper_name,as_parallel=F){
sig <- data %>% filter(abs(logFC)>1 & Biotype=="lncRNA" & FDR<0.05)
if(nrow(sig)>0)
{
sig_anno <- sig %>% dplyr::select(Ensembl_ID,Gene_symbol,logFC,FDR)
x_vec <- sig$Ensembl_ID
if(as_parallel){
anno_list <- foreach(i=1:length(x_vec)) %dopar% {
convertNamesAndAnnotate(x_vec[i],exper=exper_name)
}
}else{
anno_list <- list()
for(i in 1:length(x_vec)){
anno_list[[paste(i)]] <- (convertNamesAndAnnotate(x_vec[i],exper=exper_name))
}
}
res <- as_tibble(do.call(rbind,anno_list))
colnames(res) <- c("Comparison","Ensembl_ID", "LncBook ID", "Type","Conservation","miRNA","GWAS","nearest PC")
sig_anno <- sig_anno %>% left_join(res,by="Ensembl_ID")
return(sig_anno)
}else{
return(NA)
}
}
to_anno <- list.files("data/",pattern="-CPM.txt",full.names = T)
fnames <- list.files("data/",pattern="-CPM.txt",full.names = F)
comparisons <- gsub("GSE.+\\-(.+_vs_.+)\\-CPM.txt","\\1",fnames)
for(i in 1:length(to_anno)){
data <- read_delim(to_anno[i])
lncAnno <- make_annotations(data, comparisons[i],as_parallel=F)
if(!is.na(lncAnno))
{
tsave_name <- gsub("(-CPM)","-ANNOTATED_lncRNAs",to_anno[i])
write_tsv(lncAnno,file=paste0(tsave_name))
}
}
#### Add most correlated gene
to_anno <- list("GSE135842-1-Control_P130_siCTRL-vs-DOXO_P130_siCTRL",
"GSE135842-2-Control_P130_siP107-vs-DOXO_P130_siP107",
"GSE135842-3-Control_RB+P130_siCTRL-vs-DOXO_RB-P130_siCTRL",
"GSE135842-4-Control_RB+P130_siP107-vs-DOXO_RB-P130_siP107")
## un-comment them when needed
# to_anno <- list("GSE198396-1-Normal-vs-PDL1_pos",
# "GSE198396-2-Normal-vs-PDL1_neg")
# to_anno <- read_tsv("GSE154101-Normal-vs-Doxo")
# to_anno <- list("GSE136631-1-FCBIC02-Par_ctrl-vs-Par_Doxo",
# "GSE136631-2-SUM149-Par_ctrl-vs-Par_Doxo",
# "GSE136631-3-FCBIC02-Par_ctrl-vs-DoxoR_ctrl",
# "GSE136631-4-SUM149-Par_ctrl-vs-DoxoR_ctrl")
m_anno <- read_tsv(paste0(to_anno[1], "-ANNOTATED_lncRNAs.txt")) # change number based on file that is being run
CPM <- read_delim("....-All_Samples.txt") # read in correct file with CPM values for all samples in the study (e.g., GSE154101-All_Samples.txt)
m <- CPM %>% filter(CPM$Ensembl_ID %in% m_anno$Ensembl_ID)
m <- m[, -c(2,3)] # remove columns with gene symbol and biotype
CPM <- tibble::column_to_rownames(GSE135842_CPM, "Ensembl_ID")
CPM <- CPM[, -c(1:2)] # remove columns with ensembl ID and gene symbol
results <- data.frame(matrix(NA, nrow=nrow(m), ncol=2))
colnames(results) <- c("Ensembl_ID", "Top Corr Gene")
for (i in 1:nrow(m)) {
# Calculate the correlations between m and the big table
correlations <- cor(t(m[i, -1]), t(CPM))
# Get top correlated gene
corr <- as.data.frame(t(correlations))
corr <- tibble::rownames_to_column(corr, "Ensembl_ID")
corr <- corr[order(abs(corr$V1),decreasing=T),]
top_gene <- corr$Ensembl_ID[2]
#o <- head(sort(abs(corr$V1), decreasing=T), 2)
#top_gene <- corr %>% filter(corr$V1 %in% o)
results[i, 1] <- m[i, ]$Ensembl_ID
results[i, 2] <- top_gene
}
m_anno <- merge(m_anno, results, by="Ensembl_ID")
mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
gene_symbols <- getBM(filters= "ensembl_gene_id", attributes= c("ensembl_gene_id", "hgnc_symbol"),values=m_anno$`Top Corr Gene`,mart= mart)
for (i in 1:nrow(gene_symbols)) {
if (gene_symbols$hgnc_symbol[i] == "") {
gene_symbols$hgnc_symbol[i] = gene_symbols$ensembl_gene_id[i]
}
}
colnames(gene_symbols) <- c("Top Corr Gene", "Top correlated gene")
m_anno <- merge(m_anno, gene_symbols, by="Top Corr Gene")
m_anno <- m_anno[, c("Ensembl_ID", "Gene_symbol", "logFC", "FDR", "Comparison", "LncBook ID", "Type", "Conservation", "miRNA", "GWAS", "nearest PC", "Top correlated gene")]
tsave_name <- paste0(to_anno[1],"-CORRELATED_lncRNAs.txt") # change number based on file that is being run
write_tsv(m_anno, file=tsave_name)