bug fixed

YinLiLin · Sep 28, 2020 · dc84dc7 · dc84dc7
1 parent 148028e
commit dc84dc7
Show file tree

Hide file tree

Showing 15 changed files with 106 additions and 114 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -15,9 +15,9 @@ export(attach.big.matrix)
 export(deepcopy)
 export(is.big.matrix)
 export(new)
-export(read_plink)
+export(read_binary)
 export(read_vcf)
-export(write_plink)
+export(write_binary)
 import(methods)
 import(stats)
 import(utils)

diff --git a/R/SumTool.r b/R/SumTool.r
@@ -36,29 +36,25 @@ function(x)
 #' @param maxLine number, set the number of lines to handle at a time, bigger lines require more memory.
 #' @param impute logical, whether to impute missing values in genotype.
 #' @param additive logical, whether to code the genotype into 0, 1, 2; otherwise, the genotype will be coded into 11, 12, 22, which can be used for summary data imputation.
-#' @param backingpath the path to the directory containing the file backing cache.
-#' @param descriptorfile the name of the file to hold the backingfile description, for subsequent use with ‘attach.big.matrix’; if ‘NULL’, the ‘backingfile’ is used as the root part of the descriptor file name.  The descriptor file is placed in the same directory as the backing files.
-#' @param backingfile the root name for the file(s) for the cache.
+#' @param out character, path and prefix of output file
 #' @param verbose logical, whether to print the log information
 #' @param threads number, the number of used threads for parallel process
 
 #' @examples
 #' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
-#' data <- read_plink(bfile=ref_bfile_path, backingpath=tempdir(), threads=1, verbose=FALSE)
+#' data <- read_binary(bfile=ref_bfile_path, out=tempfile(), threads=1, verbose=FALSE)
 #' geno <- data$geno
 #' map <- data$map
 
 #' @export
 
-read_plink <- 
+read_binary <- 
 function(
 	bfile = "", 
 	maxLine = 1000,
 	impute = TRUE,
 	additive = TRUE, 
-	backingpath = getwd(),
-	descriptorfile = NULL,
-	backingfile = NULL,
+	out = NULL,
 	verbose = TRUE,
 	threads = 1
 ){
@@ -72,27 +68,32 @@ function(
 	if(verbose)	cat("Number of individuals: ", n , "\n", sep="")
 	if(verbose)	cat("Reading...\n")
 
-	if(is.null(backingfile) & is.null(backingfile)){
-		backingfile <- paste0(bfile,".bin")
-		descriptorfile <- paste0(bfile,".desc")
+	if(is.null(out)){
+		backingfile <- paste0(basename(bfile),".bin")
+		descriptorfile <- paste0(basename(bfile),".desc")
+		backingpath <- "."
+	}else{
+		backingfile <- paste0(basename(out),".bin")
+		descriptorfile <- paste0(basename(out),".desc")
+		backingpath <- dirname(out)
 	}
-	if(!is.null(backingfile))	backingfile <- basename(backingfile)
-	if(!is.null(descriptorfile))	descriptorfile <- basename(descriptorfile)
-	if(!is.null(backingfile) & is.null(descriptorfile))
-		stop("descriptorfile shoud be assigned.")
-	if(is.null(backingfile) & !is.null(descriptorfile))
-		stop("backingfile shoud be assigned.")
-
-	if(!is.null(descriptorfile)){
-		map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
-		sep_index <- which(map_file == ".")
-		if(length(sep_index)){
-			map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
-		}else{
-			map_file <- paste0(map_file, collapse="")
-		}
-		if(!is.null(backingpath))	map_file <- paste0(backingpath, "/", map_file)
+	# if(!is.null(backingfile))	backingfile <- basename(backingfile)
+	# if(!is.null(descriptorfile))	descriptorfile <- basename(descriptorfile)
+	# if(!is.null(backingfile) & is.null(descriptorfile))
+	# 	stop("descriptorfile shoud be assigned.")
+	# if(is.null(backingfile) & !is.null(descriptorfile))
+	# 	stop("backingfile shoud be assigned.")
+
+	# if(!is.null(descriptorfile)){
+	map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
+	sep_index <- which(map_file == ".")
+	if(length(sep_index)){
+		map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
+	}else{
+		map_file <- paste0(map_file, collapse="")
 	}
+	if(!is.null(backingpath))	map_file <- paste0(backingpath, "/", map_file)
+	# }
 	map <- as.data.frame(rMap_c(paste0(bfile, ".bim"), out = map_file), stringsAsFactors=FALSE)
 	map$Pos <- as.numeric(map$Pos)
 	pheno <- read.table(paste0(bfile, ".fam"), header=FALSE)[, -c(1:5), drop=FALSE]
@@ -120,15 +121,13 @@ function(
 #' @param maxLine number, set the number of lines to handle at a time, bigger lines require more memory.
 #' @param impute logical, whether to impute missing values in genotype.
 #' @param additive logical, whether to code the genotype into 0, 1, 2; otherwise, the genotype will be coded into 11, 12, 21, 22, which can be used for summary data imputation.
-#' @param backingpath the path to the directory containing the file backing cache.
-#' @param descriptorfile the name of the file to hold the backingfile description, for subsequent use with ‘attach.big.matrix’; if ‘NULL’, the ‘backingfile’ is used as the root part of the descriptor file name.  The descriptor file is placed in the same directory as the backing files.
-#' @param backingfile the root name for the file(s) for the cache.
+#' @param out character, path and prefix of output file
 #' @param verbose logical, whether to print the log information
 #' @param threads number, the number of used threads for parallel process
 
 #' @examples
 #' ref_vfile_path <- system.file("extdata", "ref_geno.vcf", package = "SumTool")
-#' data <- read_vcf(vfile=ref_vfile_path, backingpath=tempdir(), threads=1, verbose=FALSE)
+#' data <- read_vcf(vfile=ref_vfile_path, out=tempfile(), threads=1, verbose=FALSE)
 #' geno <- data$geno
 #' map <- data$map
 
@@ -140,9 +139,7 @@ function(
 	maxLine = 1000,
 	impute = TRUE,
 	additive = TRUE, 
-	backingpath = getwd(),
-	descriptorfile = NULL,
-	backingfile = NULL,
+	out = NULL, 
 	verbose = TRUE,
 	threads = 1
 ){
@@ -157,27 +154,34 @@ function(
 	if(verbose)	cat("Number of individuals: ", n , "\n", sep="")
 	if(verbose)	cat("Reading...\n")
 
-	if(is.null(backingfile) & is.null(backingfile)){
-		backingfile <- paste0(sub('....$','', vfile),".bin")
-		descriptorfile <- paste0(sub('....$','', vfile),".desc")
+	if(is.null(out)){
+		prefix <- sub('....$','', basename(vfile))
+		backingfile <- paste0(prefix,".bin")
+		descriptorfile <- paste0(prefix,".desc")
+		backingpath <- "."
+	}else{
+		backingfile <- paste0(basename(out),".bin")
+		descriptorfile <- paste0(basename(out),".desc")
+		backingpath <- dirname(out)
 	}
-	if(!is.null(backingfile))	backingfile <- basename(backingfile)
-	if(!is.null(descriptorfile))	descriptorfile <- basename(descriptorfile)
-	if(!is.null(backingfile) & is.null(descriptorfile))
-		stop("descriptorfile shoud be assigned.")
-	if(is.null(backingfile) & !is.null(descriptorfile))
-		stop("backingfile shoud be assigned.")
-
-	if(!is.null(descriptorfile)){
-		map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
-		sep_index <- which(map_file == ".")
-		if(length(sep_index)){
-			map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
-		}else{
-			map_file <- paste0(map_file, collapse="")
-		}
-		if(!is.null(backingpath))	map_file <- paste0(backingpath, "/", map_file)
+
+	# if(!is.null(backingfile))	backingfile <- basename(backingfile)
+	# if(!is.null(descriptorfile))	descriptorfile <- basename(descriptorfile)
+	# if(!is.null(backingfile) & is.null(descriptorfile))
+	# 	stop("descriptorfile shoud be assigned.")
+	# if(is.null(backingfile) & !is.null(descriptorfile))
+	# 	stop("backingfile shoud be assigned.")
+
+	# if(!is.null(descriptorfile)){
+	map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
+	sep_index <- which(map_file == ".")
+	if(length(sep_index)){
+		map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
+	}else{
+		map_file <- paste0(map_file, collapse="")
 	}
+	if(!is.null(backingpath))	map_file <- paste0(backingpath, "/", map_file)
+	# }
 	geno <- bigmemory::big.matrix(
 		nrow = n,
 		ncol = m,
@@ -209,16 +213,16 @@ function(
 #' @examples
 #' # reading data
 #' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
-#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
+#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
 #' geno <- data$geno
 #' map <- data$map
 #'
 #' # writing data
-#' # write_plink(geno=geno, map=map, out="./test", threads=1, verbose=FALSE)
+#' # write_binary(geno=geno, map=map, out="./test", threads=1, verbose=FALSE)
 
 #' @export
 
-write_plink <- 
+write_binary <- 
 function(
 	geno = NULL, 
 	map = NULL,
@@ -272,7 +276,7 @@ function(
 #' typed_z_path <- system.file("extdata", "typed.zscore", package = "SumTool")
 #'
 #' # reading data
-#' data <- read_bfile(bfile=ref_file_path, additive=FALSE, backingpath=tempdir(), 
+#' data <- read_binary(bfile=ref_file_path, additive=FALSE, out=tempfile(), 
 #' 	threads=1, verbose=FALSE)
 #' ref.geno <- data$geno
 #' ref.map <- data$map
@@ -284,7 +288,7 @@ function(
 #' #--------------SImpute-LD---------------#
 #'
 #' gwas_file_path <- system.file("extdata", "gwas_geno", package = "SumTool")
-#' gwas <- read_bfile(bfile=gwas_file_path, additive=FALSE, backingpath=tempdir(), 
+#' gwas <- read_binary(bfile=gwas_file_path, additive=FALSE, out=tempfile(), 
 #' 	threads=1, verbose=FALSE)
 #' typed.geno <- gwas$geno
 #' typed.map <- gwas$map
@@ -492,7 +496,7 @@ SImputeZ <- function(ref.geno = NULL, ref.map = NULL, typed.geno = NULL, typed =
 #' typed_b_path <- system.file("extdata", "typed.beta", package = "SumTool")
 #'
 #' # reading data
-#' data <- read_bfile(bfile=ref_file_path, additive=FALSE, backingpath=tempdir(), 
+#' data <- read_binary(bfile=ref_file_path, additive=FALSE, out=tempfile(), 
 #' 	threads=1, verbose=FALSE)
 #' ref.geno <- data$geno
 #' ref.map <- data$map
@@ -504,7 +508,7 @@ SImputeZ <- function(ref.geno = NULL, ref.map = NULL, typed.geno = NULL, typed =
 #' #--------------SImpute-LD---------------#
 #'
 #' gwas_file_path <- system.file("extdata", "gwas_geno", package = "SumTool")
-#' gwas <- read_bfile(bfile=gwas_file_path,  additive=FALSE, backingpath=tempdir(), 
+#' gwas <- read_binary(bfile=gwas_file_path,  additive=FALSE, out=tempfile(), 
 #' 	threads=1, verbose=FALSE)
 #' typed.geno <- gwas$geno
 #' typed.map <- gwas$map
@@ -729,7 +733,7 @@ SImputeB <- function(ref.geno = NULL, ref.map = NULL, typed.geno = NULL, typed =
 
 #' @examples
 #' gwas_bfile_path <- system.file("extdata", "gwas_geno", package = "SumTool")
-#' data <- read_plink(bfile=gwas_bfile_path, threads=1, verbose=FALSE, backingpath=tempdir())
+#' data <- read_binary(bfile=gwas_bfile_path, threads=1, verbose=FALSE, out=tempfile())
 #' geno <- data$geno
 #' ld <- LDcal(geno=geno, threads=1, verbose=FALSE)
 
@@ -793,7 +797,7 @@ LDcal <- function(geno = NULL, index = NULL, threads = 1, lambda = 0, chisq = 0,
 
 #' @examples
 #' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
-#' data <- read_plink(bfile=ref_bfile_path, backingpath=tempdir(), threads=1)
+#' data <- read_binary(bfile=ref_bfile_path, out=tempfile(), threads=1)
 #' geno <- data$geno
 #' map <- data$map
 #' y <- data$pheno[,1]
@@ -862,7 +866,7 @@ LMreg <- function(y, geno = NULL, map = NULL, X = NULL, threads = 1, verbose = T
 #' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
 #' 
 #' # load data
-#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
+#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
 #' geno <- data$geno
 #' map <- data$map
 #' ldscore <- LDscore(geno = geno, map = map, w = 100000, b=12500, threads = 1, verbose=FALSE)
@@ -975,7 +979,7 @@ LDscore <- function(geno = NULL, map = NULL, w = 1000000, b = 500000, r2 = TRUE,
 #' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
 #' 
 #' # load data
-#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
+#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
 #' geno <- data$geno
 #' map <- data$map
 #' snp <- LDprune(geno = geno, map = map, w = 100000, b=50000, threads = 1, verbose=FALSE)
@@ -1089,7 +1093,7 @@ LDprune <- function(geno = NULL, map = NULL, w = 1000000, b = 500000, r2.cutoff
 #' p_path <- system.file("extdata", "P.txt", package = "SumTool")
 
 #' # load data
-#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
+#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
 #' geno <- data$geno
 #' map <- data$map
 #' pdata <- read.table(p_path, header = TRUE)
@@ -1211,7 +1215,7 @@ LDclump <- function(geno = NULL, map = NULL, p = NULL, w = 1000000, r2.cutoff =
 #' 
 #' # load data
 #' sumstat <- read.table(sumstat_path, header=TRUE)
-#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
+#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
 #' geno <- data$geno
 #' map <- data$map
 #' h2 <- 0.5
@@ -1436,7 +1440,7 @@ LDreg <- function(sumstat = NULL, ldscore = NULL, wld = NULL, M = NULL, maxz2 =
 	if(is.matrix(sumstat) || is.data.frame(sumstat)){
 
 		#heritability estimation
-		est <- h2_est(sumstat = sumstat, ldscore = ldscore, wld = wld, M = M, axz2 = maxz2, maf = maf, nblock = nblock, verbose = verbose)
+		est <- h2_est(sumstat = sumstat, ldscore = ldscore, wld = wld, M = M, maxz2 = maxz2, maf = maf, nblock = nblock, verbose = verbose)
 	}else if(is.list(sumstat)){
 		if(length(sumstat) == 1){
 

diff --git a/README.md b/README.md
@@ -39,11 +39,11 @@ After installed successfully, type ```library(SumTool)``` to use, then type ```l
 
 Genotype Converting
 -----
-With the developing of advaced sequence technologies, the unprecedentedly increased markers across genome make a big challenge in relevant genetic analysis. Loading the genotype into memory directly is highly limited by the computation resources, which becomes the bottleneck of the most of softwares or pipelines. Here we provided a function ```read_plink``` developed by aid of bigmemory package to construct memory-mapped files ('big.matrix') on disk from plink binary files. Instead of reading all genotype data into RAM, it greatly reduce memory cost without significantly increase of computation time. Moreover, this data transformation process is only required at the first time, and no matter how big the data is, it can be connected with RAM within seconds at the next time.
+With the developing of advaced sequence technologies, the unprecedentedly increased markers across genome make a big challenge in relevant genetic analysis. Loading the genotype into memory directly is highly limited by the computation resources, which becomes the bottleneck of the most of softwares or pipelines. Here we provided two functions ```read_binary, read_vcf``` developed by aid of bigmemory package to construct memory-mapped files ('big.matrix') on disk from plink binary and VCF files. Instead of reading all genotype data into RAM, it greatly reduce memory cost without significantly increase of computation time. Moreover, this data transformation process is only required at the first time, and no matter how big the data is, it can be connected with RAM within seconds at the next time.
 ```r
 # plink binary file
 ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
-data <- read_plink(bfile=ref_bfile_path, additive=TRUE, threads=1)
+data <- read_binary(bfile=ref_bfile_path, additive=TRUE, threads=1)
 # bfile: the prefix of binary files
 
 # or load VCF file
@@ -54,9 +54,9 @@ data <- read_vcf(vfile=ref_vfile_path, additive=TRUE, threads=1)
 ref.geno <- data$geno
 ref.map <- data$map
 ```
-By default, the memory-mapped files are directed into work directory, users could redirect to new path as following:
+Missing genotype will be replaced by the major genotype of each allele. By default, the memory-mapped files are directed into work directory, users could redirect to new path as following:
 ```r
-data <- read_plink(bfile=ref_bfile_path, backingpath="./", descriptorfile="test.desc", backingfile="test.bin", threads=1)
+data <- read_binary(bfile=ref_bfile_path, out="./test", threads=1)
 
 # directly use for the next time
 ref.geno <- attach.big.matrix("./test.desc")
@@ -108,7 +108,7 @@ Linear Model
 -----
 ```r
 ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
-data <- read_plink(bfile=ref_bfile_path, threads=1)
+data <- read_binary(bfile=ref_bfile_path, threads=1)
 geno <- data$geno
 map <- data$map
 y <- data$pheno[,1]
@@ -117,7 +117,7 @@ gwas <- LMreg(y=y, geno=geno, map=map, threads=1, verbose=FALSE)
 
 Impute Zscore
 -----
-For Zscore imputation, at least 6 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_plink) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.
+For Zscore imputation, at least 6 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_binary) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.
 ```r
 # get the path of attached example data on SumTool
 ref_file_path <- system.file("extdata", "ref_geno.vcf", package = "SumTool")
@@ -143,7 +143,7 @@ xx <- SImputeZ(ref.geno=ref.geno, ref.map=ref.map, typed=typed_z, w=1000000, thr
 If the individual genotype of summary statistics is available, the imputation accuracy can be improved by using the LD matrix derived from individual genotype rather than reference panel for typed SNPs. 
 ```r
 gwas_bfile_path <- system.file("extdata", "gwas_geno", package = "SumTool")
-gwas <- read_plink(bfile=gwas_bfile_path, additive=FALSE, threads=1)
+gwas <- read_binary(bfile=gwas_bfile_path, additive=FALSE, threads=1)
 typed.geno <- gwas$geno
 typed.map <- gwas$map
 # NOTE: the order of SNPs in 'typed.geno' should be consistent with the order in 'typed_z'.
@@ -153,7 +153,7 @@ xx <- SImputeZ(ref.geno=ref.geno, ref.map=ref.map, typed=typed_z, typed.geno=typ
 
 Impute Marginal Effect
 -----
-For BETA and SE imputation, limited 8 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_plink) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.<br>
+For BETA and SE imputation, limited 8 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_binary) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.<br>
 ***NOTE***: It is not supported to impute multiple traits at a time.
 ```r
 # get the path of attached example data on SumTool
@@ -321,7 +321,7 @@ head(sumstat)
 4  rs3131969   1 754182  A  G -0.04744 0.5306   100
 5  rs1048488   1 760912  C  T -0.07650 0.5416   100
 6 rs12562034   1 768448  G  A -0.12360 0.5500   100
-data <- read_plink(bfile=ref_bfile_path, threads=1)
+data <- read_binary(bfile=ref_bfile_path, threads=1)
 geno <- data$geno
 map <- data$map
 h2 <- 0.5

diff --git a/inst/extdata/gwas_geno.vcf → inst/extdata/ref_geno.vcf b/inst/extdata/gwas_geno.vcf → inst/extdata/ref_geno.vcf
diff --git a/man/LDcal.Rd b/man/LDcal.Rd
diff --git a/man/LDclump.Rd b/man/LDclump.Rd