-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathbuild_reactome_network.R
107 lines (72 loc) · 3.04 KB
/
build_reactome_network.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
## reactome graph building
## information retrived: 30 Jan 2019
## Ize Buphamalai
## Modified June 2021
library("igraph")
library(tidyverse)
library(parallel)
pacman::p_load(pbapply)
#### read pathway gene set from Reactome --------
reactome_gmt = scan("../data/raw_data/ReactomePathways.gmt", what = "", sep = "\n")
gmt_to_list = function(x){
res = list()
contents = strsplit(x, "\t")[[1]]
res$name = contents[1]
res$url = contents[2]
res$genes = contents[3:length(contents)]
return(res)
}
#### convert the association into list -----
reactome_pathways = lapply(reactome_gmt, gmt_to_list)
reactome = lapply(reactome_pathways, function(x) x[["genes"]])
names(reactome) = lapply(reactome_pathways, function(x) x[["name"]])
#remove pathways with only one gene (too specific)
reactome = reactome[!sapply(reactome, length)==1]
# remove pathways with over 300 genes (uninformative)
reactome = reactome[!sapply(reactome, length)>300]
######### build reactome network
d1 <- stack(reactome)
# take valid gene symbols
source("../functions/fn_source.R")
entrez_names <- IDconvert(unique(d1$values), from = "SYMBOL", to = "ENTREZID")
d1 <- d1 %>% filter(values %in% names(entrez_names)[!is.na(entrez_names)])
# create edge lists from d1
g_gene_pathway <- graph_from_data_frame(d1, directed = F)
adj_gene_pathway <- get.adjacency(g_gene_pathway, type = "both")
adj_gene_gene <- adj_gene_pathway %*% t(adj_gene_pathway)
gene_element <- colnames(adj_gene_gene) %in% unique(d1$values)
adj_gene_gene <- adj_gene_gene[gene_element, gene_element]
diag(adj_gene_gene) <- NA
gene_gene_df <-get.data.frame(graph_from_adjacency_matrix(adj_gene_gene, weighted = T, diag = F))
# store gene and pathway as id
gene_id <- unique(d1$values)
pathway_id <- unique(d1$ind)
# convert data into numeric
#d1$values <- factor(d1$values, levels = gene_id, labels = 1:length(gene_id) )
d1$ind <- factor(d1$ind, levels = pathway_id, labels = 1:length(pathway_id) )
reactome_by_element <- split(as.integer(d1$ind), d1$values)
shared_pathway <- pbapply::pbapply(gene_gene_df, 1, function(x) intersect(reactome_by_element[[x[1]]], reactome_by_element[[x[2]]]))#,
# cl = cl)
gene_gene_df$shared_pathways <- shared_pathway
# convert into df
reactome_member_df <- tibble(pathway = names(reactome), member = reactome) %>%
unnest(member)
reactome_count_df <- reactome_member_df%>%
count(member) %>%
arrange(-n)
#### save reactome pathway counts -------
write_tsv(reactome_count_df, "../cache/reactome_pathway_counts.tsv")
write_tsv(reactome_member_df, "../cache/reactome_pathway_association.tsv")
# test_multiple scenarios of reactome graph build
g_reactome <- list()
for(i in 1:10){
g_reactome[[i]] <- graph_from_data_frame(reactome_copathway_df[reactome_copathway_df$weight > i,], directed = FALSE)
}
{par(mfrow = c(5,2))
for(i in 1:10){
fit_power_law(g_reactome[[i]], mode = "ccdf")
}
}
# select a cutoff of five
write_tsv(reactome_copathway_df[reactome_copathway_df$weight > 5,], "../data/network_edgelists/reactome_copathway.tsv")
#################