-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimpute.r
67 lines (49 loc) · 2.12 KB
/
impute.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
## IMPUTE THE GENE EXPRESSION DATA
## ENVIRONMENT
# SECURITY: AWS credentials must not be hard-coded in this script. Supply
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY from outside the source tree
# (shell environment, ~/.Renviron, ...) before running it.
# NOTE(review): any key pair that was ever committed in this file should be
# treated as leaked and rotated.
# Only the non-secret region setting stays in the script.
Sys.setenv("AWS_DEFAULT_REGION" = "us-east-2")
## IMPORTS
require(impute)
require(aws.s3)
## READ DATA
# FROM AWS
# The object is tab-separated. read.delim() parses it with "." as the decimal
# mark; read.csv2() defaults to dec = ",", which keeps dot-decimal expression
# values from being recognised as numbers at read time (and, with
# stringsAsFactors = TRUE on older R, turns them into factors).
# NOTE(review): assumes the values use "." decimals, which is also what the
# as.numeric() conversion further down requires - confirm against the file.
df_main_fpkm <- s3read_using(
  FUN = read.delim,
  bucket = "lachke-lab-data/work/0.geno-ai/data/rna-seq/E-MTAB-6798",
  object = "E-MTAB-6798-query-results.tpms.tsv",
  sep = "\t",
  stringsAsFactors = FALSE
)
# FROM LOCAL
# df_main_fpkm = read.delim2("/home/atul/0.work/0.geno_ai/data/rna-seq/e-mtab-6798/E-MTAB-6798-query-results.fpkms.tsv",
# header = TRUE, sep = "\t")
# Interactive sanity checks: dimensions, a corner of the table, column names.
nrow(df_main_fpkm)
ncol(df_main_fpkm)
# df_main_fpkm[df_main_fpkm == ""] <- 0
df_main_fpkm[1:10, 1:8]
colnames(df_main_fpkm)
## Extract expression matrix
# Columns 3:96 are converted to a numeric matrix, one matrix column per table
# column (NOTE(review): presumably columns 1-2 are Gene.ID / Gene.Name and the
# rest are samples - confirm against colnames(df_main_fpkm)).
# The conversion is done column by column on the data frame itself instead of
# through sapply(), whose return shape depends on the input: for a one-row
# table it yields a plain vector that as.matrix() would turn into a single
# column.
mt_main_fpkm <- local({
  expr_df <- df_main_fpkm[, 3:96, drop = FALSE]
  expr_df[] <- lapply(expr_df, function(column) {
    # Factors must go through character first; as.numeric() on a factor
    # returns the internal level codes, not the values.
    if (is.factor(column)) {
      column <- as.character(column)
    }
    as.numeric(column)
  })
  # rownames.force = FALSE keeps the result free of row names, as before.
  as.matrix(expr_df, rownames.force = FALSE)
})
nrow(mt_main_fpkm)
ncol(mt_main_fpkm)
mt_main_fpkm[1:5, 1:5, drop = FALSE]
## Impute; output is a list
## https://www.bioconductor.org/packages/release/bioc/manuals/impute/man/impute.pdf
# Fill the missing entries of the expression matrix with impute.knn().
# k, rowmax, colmax, maxp and the fixed rng.seed are passed through unchanged;
# see the manual linked above for their exact meaning.
mt_impute_lst <- impute.knn(
  mt_main_fpkm,
  k = 10,
  rowmax = 0.5,
  colmax = 0.8,
  maxp = 1500,
  rng.seed = 362436069
)
names(mt_impute_lst) ## Check list elements
# The imputed matrix is taken from the "data" element of the result list.
mt_impute_fpkm <- mt_impute_lst[["data"]]
nrow(mt_impute_fpkm)
ncol(mt_impute_fpkm)
mt_impute_fpkm[1:5, 1:8, drop = FALSE]
## Make DF and write
# Turn the imputed matrix back into a data frame and preview one corner.
df_impute_fpkm <- as.data.frame(mt_impute_fpkm)
df_impute_fpkm[1:5, 1:5]
# Single-bracket indexing keeps each annotation column as a one-column
# data frame, so cbind() below combines three data frames.
gene.id <- df_main_fpkm["Gene.ID"]
gene.name <- df_main_fpkm["Gene.Name"]
df_impute_final <- cbind(gene.id, gene.name, df_impute_fpkm)
# Pin the names of the two leading annotation columns.
colnames(df_impute_final)[1:2] <- c("Gene.ID", "Gene.Name")
## Write Dataframe
# write.table(df_impute_final, "/home/atul/0.work/0.geno_ai/data/rna-seq/e-mtab-6798/E-MTAB-6798-query-results.fpkms.impute.tsv",
# sep = "\t", row.names = FALSE)
# row.names = FALSE mirrors the local variant above. With write.table()'s
# default (row.names = TRUE) every data row gets an extra leading row-name
# field that the header line does not have, so the columns of the resulting
# TSV no longer line up with their names.
s3write_using(
  df_impute_final,
  FUN = write.table,
  bucket = "lachke-lab-data/work/0.geno-ai/data/rna-seq/E-MTAB-6798",
  object = "E-MTAB-6798-query-results.tpms.impute.tsv",
  sep = "\t",
  row.names = FALSE
)