-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmodule.access.r
104 lines (91 loc) · 3.75 KB
/
module.access.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Download SNP6 birdseed genotype files and assemble them into one matrix.
#
# Args:
#   urls:     character vector of birdseed genotype file URLs, one per sample.
#   names:    character vector of sample names used as column names; must be
#             the same length as `urls`.
#   username: TCGA username passed to getURL().
#   password: TCGA password passed to getURL().
#
# Returns:
#   A matrix with one row per entry of the `Composite.Element.REF` column of
#   the first file and one column per sample. Calls whose `Confidence` is
#   above 0.1 are set to NA. A sample whose file does not have the same
#   number of rows as the first file is left as an all-NA column and a
#   notice is printed.
GetGenotype <- function(urls, names, username, password) {
  # load package (same convention as the other functions in this file)
  require(RCurl)
  stopifnot(length(urls) > 0, length(urls) == length(names))

  # Download and parse one file. `text =` lets read.delim() open and close
  # its own connection, so no text connection is left dangling per sample.
  fetch <- function(url) {
    read.delim(
      text = getURL(url, username = username, password = password),
      skip = 1
    )
  }

  # The first file defines the row layout of the result
  first <- fetch(urls[1])
  geno <- matrix(NA, nrow(first), length(urls))
  rownames(geno) <- first$Composite.Element.REF
  colnames(geno) <- names

  failedIndex <- NULL
  for (i in seq_along(urls)) {
    # Reuse the already downloaded first file instead of fetching it twice
    data <- if (i == 1) first else fetch(urls[i])
    call <- data$Call
    # Quality cutoff: discard calls with confidence score above 0.1
    call[which(data$Confidence > 0.1)] <- NA
    if (length(call) == nrow(geno)) {
      geno[, i] <- call
    } else {
      failedIndex <- c(failedIndex, i)
    }
  }

  if (length(failedIndex) > 0) {
    cat("Genotype data of some aliquots not retrieved. Please try again! \n")
  }
  return(geno)
}
# Interactively ask for a TCGA username and password and check them against
# the protected TCGA file server.
#
# The user is prompted up to three times. A pair is accepted when the
# directory listing returned by the server contains "Index of".
#
# Returns:
#   A one-row data frame with character columns `username` and `password`.
#   Both are empty strings when no valid pair was entered.
GetTCGACredential <- function() {
  # load package
  require(RCurl)
  secureEntry <- "https://tcga-data-secure.nci.nih.gov/tcgafiles/tcga4yeo/tumor/"
  valid <- FALSE
  attempts <- 0
  # Try three times until succeed
  while (!valid && attempts < 3) {
    # Ask for username and password
    username <- readline("What is your TCGA username?")
    password <- readline("What is your TCGA password?")
    # Test username and password. They are handed over as curl options
    # rather than pasted into the URL, so characters such as "@", ":" or "/"
    # in a password cannot corrupt the URL (same call style as the other
    # functions in this file).
    listing <- getURL(secureEntry, username = username, password = password)
    valid <- isTRUE(grepl("Index of", listing))
    attempts <- attempts + 1
  }
  # Feedback and return
  if (valid) {
    cat("Valid username and password\n")
  } else {
    cat("Username/password invalid: you may not access TCGA protected data\n")
    username <- ""
    password <- ""
  }
  cred <- data.frame(
    username = username,
    password = password,
    stringsAsFactors = FALSE
  )
  return(cred)
}
# Read a tab-delimited TCGA table from a URL or a local file.
#
# Args:
#   url:      a URL (anything starting with "http") or a local file path.
#   username: TCGA username, passed to getURL() for URLs.
#   password: TCGA password, passed to getURL() for URLs.
#
# Returns:
#   A data frame with character columns left as strings. Up to two leading
#   rows whose first field contains "bcr_" or "CDE_ID" are removed.
#   NULL is returned when downloading a URL fails.
GetTCGATable <- function(url, username="", password="") {
  # load package
  require(RCurl)
  # read table file
  if (grepl("^http", url)) {
    s <- try(getURL(url, username = username, password = password), silent = TRUE)
    if (inherits(s, "try-error")) {
      return(NULL)
    }
    # `text =` makes read.delim() open and close its own connection
    tbl <- read.delim(text = s, quote = "\"", as.is = TRUE)
  } else {
    tbl <- read.delim(url, quote = "\"", as.is = TRUE)
  }
  # remove the first one or two lines starting with "CDE_ID" or "bcr_"
  for (pass in 1:2) {
    if (nrow(tbl) > 0 && ncol(tbl) > 0 && grepl("bcr_|CDE_ID", tbl[1, 1])) {
      # drop = FALSE keeps a single-column table a data frame, so the second
      # pass (and the caller) still sees a table rather than a bare vector
      tbl <- tbl[-1, , drop = FALSE]
    }
  }
  # return the data frame
  tbl
}
# Turn an SNP6 SDRF table into a table of file names and download URLs.
#
# Args:
#   sdrf:    SDRF table (data frame) containing the columns listed in
#            `required` below.
#   disease: disease code inserted into the URL path.
#
# Returns:
#   A data frame with columns uuid, aliquot, file.cel, url.cel,
#   file.birdseed, url.birdseed, file.seg and url.seg. CEL and birdseed URLs
#   are built on the protected server, segmentation URLs on the public one.
#   A warning is raised when an expected SDRF column is absent, because the
#   affected output columns are then dropped or hold incomplete URLs.
ProcessSNP6Sdrf <- function(sdrf, disease) {
  # init entry point
  public.link <- paste0("https://tcga-data.nci.nih.gov/tcgafiles/ftp_auth/distro_ftpusers/anonymous/tumor/", disease, "/cgcc/broad.mit.edu/genome_wide_snp_6/snp/")
  protected.link <- paste0("https://tcga-data-secure.nci.nih.gov/tcgafiles/tcga4yeo/tumor/", disease, "/cgcc/broad.mit.edu/genome_wide_snp_6/snp/")

  required <- c(
    "Extract.Name",
    "Comment..TCGA.Barcode.",
    "Array.Data.File",
    "Comment..TCGA.Archive.Name.",
    "Derived.Array.Data.Matrix.File.1",
    "Comment..TCGA.Archive.Name..2",
    "Derived.Array.Data.File.3",
    "Comment..TCGA.Archive.Name..9"
  )
  # exact = FALSE keeps the same (partial-matching) lookup that `$` performs
  pick <- function(column) sdrf[[column, exact = FALSE]]
  absent <- required[vapply(required, function(column) is.null(pick(column)), logical(1))]
  if (length(absent) > 0) {
    warning(
      "SDRF table is missing expected column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # extract columns of CEL, Birdseed and Hg19.nocnv
  uuid <- pick("Extract.Name")
  aliquot <- pick("Comment..TCGA.Barcode.")
  file.cel <- pick("Array.Data.File")
  url.cel <- paste0(protected.link, pick("Comment..TCGA.Archive.Name."), "/", file.cel)
  file.birdseed <- pick("Derived.Array.Data.Matrix.File.1")
  url.birdseed <- paste0(protected.link, pick("Comment..TCGA.Archive.Name..2"), "/", file.birdseed)
  file.seg <- pick("Derived.Array.Data.File.3")
  url.seg <- paste0(public.link, pick("Comment..TCGA.Archive.Name..9"), "/", file.seg)

  # output (returned visibly; column names come from the variable names)
  cbind.data.frame(uuid, aliquot, file.cel, url.cel, file.birdseed, url.birdseed, file.seg, url.seg, stringsAsFactors = FALSE)
}