Skip to content

Commit

Permalink
bug fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
YinLiLin committed Sep 28, 2020
1 parent 148028e commit dc84dc7
Show file tree
Hide file tree
Showing 15 changed files with 106 additions and 114 deletions.
4 changes: 2 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ export(attach.big.matrix)
export(deepcopy)
export(is.big.matrix)
export(new)
export(read_plink)
export(read_binary)
export(read_vcf)
export(write_plink)
export(write_binary)
import(methods)
import(stats)
import(utils)
Expand Down
138 changes: 71 additions & 67 deletions R/SumTool.r
Original file line number Diff line number Diff line change
Expand Up @@ -36,29 +36,25 @@ function(x)
#' @param maxLine number, set the number of lines to handle at a time, bigger lines require more memory.
#' @param impute logical, whether to impute missing values in genotype.
#' @param additive logical, whether to code the genotype into 0, 1, 2; otherwise, the genotype will be coded into 11, 12, 22, which can be used for summary data imputation.
#' @param backingpath the path to the directory containing the file backing cache.
#' @param descriptorfile the name of the file to hold the backingfile description, for subsequent use with ‘attach.big.matrix’; if ‘NULL’, the ‘backingfile’ is used as the root part of the descriptor file name. The descriptor file is placed in the same directory as the backing files.
#' @param backingfile the root name for the file(s) for the cache.
#' @param out character, path and prefix of output file
#' @param verbose logical, whether to print the log information
#' @param threads number, the number of used threads for parallel process

#' @examples
#' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
#' data <- read_plink(bfile=ref_bfile_path, backingpath=tempdir(), threads=1, verbose=FALSE)
#' data <- read_binary(bfile=ref_bfile_path, out=tempfile(), threads=1, verbose=FALSE)
#' geno <- data$geno
#' map <- data$map

#' @export

read_plink <-
read_binary <-
function(
bfile = "",
maxLine = 1000,
impute = TRUE,
additive = TRUE,
backingpath = getwd(),
descriptorfile = NULL,
backingfile = NULL,
out = NULL,
verbose = TRUE,
threads = 1
){
Expand All @@ -72,27 +68,32 @@ function(
if(verbose) cat("Number of individuals: ", n , "\n", sep="")
if(verbose) cat("Reading...\n")

if(is.null(backingfile) & is.null(backingfile)){
backingfile <- paste0(bfile,".bin")
descriptorfile <- paste0(bfile,".desc")
if(is.null(out)){
backingfile <- paste0(basename(bfile),".bin")
descriptorfile <- paste0(basename(bfile),".desc")
backingpath <- "."
}else{
backingfile <- paste0(basename(out),".bin")
descriptorfile <- paste0(basename(out),".desc")
backingpath <- dirname(out)
}
if(!is.null(backingfile)) backingfile <- basename(backingfile)
if(!is.null(descriptorfile)) descriptorfile <- basename(descriptorfile)
if(!is.null(backingfile) & is.null(descriptorfile))
stop("descriptorfile shoud be assigned.")
if(is.null(backingfile) & !is.null(descriptorfile))
stop("backingfile shoud be assigned.")

if(!is.null(descriptorfile)){
map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
sep_index <- which(map_file == ".")
if(length(sep_index)){
map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
}else{
map_file <- paste0(map_file, collapse="")
}
if(!is.null(backingpath)) map_file <- paste0(backingpath, "/", map_file)
# if(!is.null(backingfile)) backingfile <- basename(backingfile)
# if(!is.null(descriptorfile)) descriptorfile <- basename(descriptorfile)
# if(!is.null(backingfile) & is.null(descriptorfile))
# stop("descriptorfile shoud be assigned.")
# if(is.null(backingfile) & !is.null(descriptorfile))
# stop("backingfile shoud be assigned.")

# if(!is.null(descriptorfile)){
map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
sep_index <- which(map_file == ".")
if(length(sep_index)){
map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
}else{
map_file <- paste0(map_file, collapse="")
}
if(!is.null(backingpath)) map_file <- paste0(backingpath, "/", map_file)
# }
map <- as.data.frame(rMap_c(paste0(bfile, ".bim"), out = map_file), stringsAsFactors=FALSE)
map$Pos <- as.numeric(map$Pos)
pheno <- read.table(paste0(bfile, ".fam"), header=FALSE)[, -c(1:5), drop=FALSE]
Expand Down Expand Up @@ -120,15 +121,13 @@ function(
#' @param maxLine number, set the number of lines to handle at a time, bigger lines require more memory.
#' @param impute logical, whether to impute missing values in genotype.
#' @param additive logical, whether to code the genotype into 0, 1, 2; otherwise, the genotype will be coded into 11, 12, 21, 22, which can be used for summary data imputation.
#' @param backingpath the path to the directory containing the file backing cache.
#' @param descriptorfile the name of the file to hold the backingfile description, for subsequent use with ‘attach.big.matrix’; if ‘NULL’, the ‘backingfile’ is used as the root part of the descriptor file name. The descriptor file is placed in the same directory as the backing files.
#' @param backingfile the root name for the file(s) for the cache.
#' @param out character, path and prefix of output file
#' @param verbose logical, whether to print the log information
#' @param threads number, the number of used threads for parallel process

#' @examples
#' ref_vfile_path <- system.file("extdata", "ref_geno.vcf", package = "SumTool")
#' data <- read_vcf(vfile=ref_vfile_path, backingpath=tempdir(), threads=1, verbose=FALSE)
#' data <- read_vcf(vfile=ref_vfile_path, out=tempfile(), threads=1, verbose=FALSE)
#' geno <- data$geno
#' map <- data$map

Expand All @@ -140,9 +139,7 @@ function(
maxLine = 1000,
impute = TRUE,
additive = TRUE,
backingpath = getwd(),
descriptorfile = NULL,
backingfile = NULL,
out = NULL,
verbose = TRUE,
threads = 1
){
Expand All @@ -157,27 +154,34 @@ function(
if(verbose) cat("Number of individuals: ", n , "\n", sep="")
if(verbose) cat("Reading...\n")

if(is.null(backingfile) & is.null(backingfile)){
backingfile <- paste0(sub('....$','', vfile),".bin")
descriptorfile <- paste0(sub('....$','', vfile),".desc")
if(is.null(out)){
prefix <- sub('....$','', basename(vfile))
backingfile <- paste0(prefix,".bin")
descriptorfile <- paste0(prefix,".desc")
backingpath <- "."
}else{
backingfile <- paste0(basename(out),".bin")
descriptorfile <- paste0(basename(out),".desc")
backingpath <- dirname(out)
}
if(!is.null(backingfile)) backingfile <- basename(backingfile)
if(!is.null(descriptorfile)) descriptorfile <- basename(descriptorfile)
if(!is.null(backingfile) & is.null(descriptorfile))
stop("descriptorfile shoud be assigned.")
if(is.null(backingfile) & !is.null(descriptorfile))
stop("backingfile shoud be assigned.")

if(!is.null(descriptorfile)){
map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
sep_index <- which(map_file == ".")
if(length(sep_index)){
map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
}else{
map_file <- paste0(map_file, collapse="")
}
if(!is.null(backingpath)) map_file <- paste0(backingpath, "/", map_file)

# if(!is.null(backingfile)) backingfile <- basename(backingfile)
# if(!is.null(descriptorfile)) descriptorfile <- basename(descriptorfile)
# if(!is.null(backingfile) & is.null(descriptorfile))
# stop("descriptorfile shoud be assigned.")
# if(is.null(backingfile) & !is.null(descriptorfile))
# stop("backingfile shoud be assigned.")

# if(!is.null(descriptorfile)){
map_file <- unlist(strsplit(descriptorfile, "", fixed = TRUE))
sep_index <- which(map_file == ".")
if(length(sep_index)){
map_file <- paste0(map_file[1 : (sep_index[length(sep_index)] - 1)], collapse="")
}else{
map_file <- paste0(map_file, collapse="")
}
if(!is.null(backingpath)) map_file <- paste0(backingpath, "/", map_file)
# }
geno <- bigmemory::big.matrix(
nrow = n,
ncol = m,
Expand Down Expand Up @@ -209,16 +213,16 @@ function(
#' @examples
#' # reading data
#' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
#' geno <- data$geno
#' map <- data$map
#'
#' # writing data
#' # write_plink(geno=geno, map=map, out="./test", threads=1, verbose=FALSE)
#' # write_binary(geno=geno, map=map, out="./test", threads=1, verbose=FALSE)

#' @export

write_plink <-
write_binary <-
function(
geno = NULL,
map = NULL,
Expand Down Expand Up @@ -272,7 +276,7 @@ function(
#' typed_z_path <- system.file("extdata", "typed.zscore", package = "SumTool")
#'
#' # reading data
#' data <- read_bfile(bfile=ref_file_path, additive=FALSE, backingpath=tempdir(),
#' data <- read_binary(bfile=ref_file_path, additive=FALSE, out=tempfile(),
#' threads=1, verbose=FALSE)
#' ref.geno <- data$geno
#' ref.map <- data$map
Expand All @@ -284,7 +288,7 @@ function(
#' #--------------SImpute-LD---------------#
#'
#' gwas_file_path <- system.file("extdata", "gwas_geno", package = "SumTool")
#' gwas <- read_bfile(bfile=gwas_file_path, additive=FALSE, backingpath=tempdir(),
#' gwas <- read_binary(bfile=gwas_file_path, additive=FALSE, out=tempfile(),
#' threads=1, verbose=FALSE)
#' typed.geno <- gwas$geno
#' typed.map <- gwas$map
Expand Down Expand Up @@ -492,7 +496,7 @@ SImputeZ <- function(ref.geno = NULL, ref.map = NULL, typed.geno = NULL, typed =
#' typed_b_path <- system.file("extdata", "typed.beta", package = "SumTool")
#'
#' # reading data
#' data <- read_bfile(bfile=ref_file_path, additive=FALSE, backingpath=tempdir(),
#' data <- read_binary(bfile=ref_file_path, additive=FALSE, out=tempfile(),
#' threads=1, verbose=FALSE)
#' ref.geno <- data$geno
#' ref.map <- data$map
Expand All @@ -504,7 +508,7 @@ SImputeZ <- function(ref.geno = NULL, ref.map = NULL, typed.geno = NULL, typed =
#' #--------------SImpute-LD---------------#
#'
#' gwas_file_path <- system.file("extdata", "gwas_geno", package = "SumTool")
#' gwas <- read_bfile(bfile=gwas_file_path, additive=FALSE, backingpath=tempdir(),
#' gwas <- read_binary(bfile=gwas_file_path, additive=FALSE, out=tempfile(),
#' threads=1, verbose=FALSE)
#' typed.geno <- gwas$geno
#' typed.map <- gwas$map
Expand Down Expand Up @@ -729,7 +733,7 @@ SImputeB <- function(ref.geno = NULL, ref.map = NULL, typed.geno = NULL, typed =

#' @examples
#' gwas_bfile_path <- system.file("extdata", "gwas_geno", package = "SumTool")
#' data <- read_plink(bfile=gwas_bfile_path, threads=1, verbose=FALSE, backingpath=tempdir())
#' data <- read_binary(bfile=gwas_bfile_path, threads=1, verbose=FALSE, out=tempfile())
#' geno <- data$geno
#' ld <- LDcal(geno=geno, threads=1, verbose=FALSE)

Expand Down Expand Up @@ -793,7 +797,7 @@ LDcal <- function(geno = NULL, index = NULL, threads = 1, lambda = 0, chisq = 0,

#' @examples
#' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
#' data <- read_plink(bfile=ref_bfile_path, backingpath=tempdir(), threads=1)
#' data <- read_binary(bfile=ref_bfile_path, out=tempfile(), threads=1)
#' geno <- data$geno
#' map <- data$map
#' y <- data$pheno[,1]
Expand Down Expand Up @@ -862,7 +866,7 @@ LMreg <- function(y, geno = NULL, map = NULL, X = NULL, threads = 1, verbose = T
#' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
#'
#' # load data
#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
#' geno <- data$geno
#' map <- data$map
#' ldscore <- LDscore(geno = geno, map = map, w = 100000, b=12500, threads = 1, verbose=FALSE)
Expand Down Expand Up @@ -975,7 +979,7 @@ LDscore <- function(geno = NULL, map = NULL, w = 1000000, b = 500000, r2 = TRUE,
#' ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
#'
#' # load data
#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
#' geno <- data$geno
#' map <- data$map
#' snp <- LDprune(geno = geno, map = map, w = 100000, b=50000, threads = 1, verbose=FALSE)
Expand Down Expand Up @@ -1089,7 +1093,7 @@ LDprune <- function(geno = NULL, map = NULL, w = 1000000, b = 500000, r2.cutoff
#' p_path <- system.file("extdata", "P.txt", package = "SumTool")

#' # load data
#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
#' geno <- data$geno
#' map <- data$map
#' pdata <- read.table(p_path, header = TRUE)
Expand Down Expand Up @@ -1211,7 +1215,7 @@ LDclump <- function(geno = NULL, map = NULL, p = NULL, w = 1000000, r2.cutoff =
#'
#' # load data
#' sumstat <- read.table(sumstat_path, header=TRUE)
#' data <- read_plink(bfile=ref_bfile_path, threads=1, backingpath=tempdir(), verbose=FALSE)
#' data <- read_binary(bfile=ref_bfile_path, threads=1, out=tempfile(), verbose=FALSE)
#' geno <- data$geno
#' map <- data$map
#' h2 <- 0.5
Expand Down Expand Up @@ -1436,7 +1440,7 @@ LDreg <- function(sumstat = NULL, ldscore = NULL, wld = NULL, M = NULL, maxz2 =
if(is.matrix(sumstat) || is.data.frame(sumstat)){

#heritability estimation
est <- h2_est(sumstat = sumstat, ldscore = ldscore, wld = wld, M = M, axz2 = maxz2, maf = maf, nblock = nblock, verbose = verbose)
est <- h2_est(sumstat = sumstat, ldscore = ldscore, wld = wld, M = M, maxz2 = maxz2, maf = maf, nblock = nblock, verbose = verbose)
}else if(is.list(sumstat)){
if(length(sumstat) == 1){

Expand Down
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ After installed successfully, type ```library(SumTool)``` to use, then type ```l

Genotype Converting
-----
With the developing of advaced sequence technologies, the unprecedentedly increased markers across genome make a big challenge in relevant genetic analysis. Loading the genotype into memory directly is highly limited by the computation resources, which becomes the bottleneck of the most of softwares or pipelines. Here we provided a function ```read_plink``` developed by aid of bigmemory package to construct memory-mapped files ('big.matrix') on disk from plink binary files. Instead of reading all genotype data into RAM, it greatly reduce memory cost without significantly increase of computation time. Moreover, this data transformation process is only required at the first time, and no matter how big the data is, it can be connected with RAM within seconds at the next time.
With the developing of advaced sequence technologies, the unprecedentedly increased markers across genome make a big challenge in relevant genetic analysis. Loading the genotype into memory directly is highly limited by the computation resources, which becomes the bottleneck of the most of softwares or pipelines. Here we provided two functions ```read_binary, read_vcf``` developed by aid of bigmemory package to construct memory-mapped files ('big.matrix') on disk from plink binary and VCF files. Instead of reading all genotype data into RAM, it greatly reduce memory cost without significantly increase of computation time. Moreover, this data transformation process is only required at the first time, and no matter how big the data is, it can be connected with RAM within seconds at the next time.
```r
# plink binary file
ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
data <- read_plink(bfile=ref_bfile_path, additive=TRUE, threads=1)
data <- read_binary(bfile=ref_bfile_path, additive=TRUE, threads=1)
# bfile: the prefix of binary files

# or load VCF file
Expand All @@ -54,9 +54,9 @@ data <- read_vcf(vfile=ref_vfile_path, additive=TRUE, threads=1)
ref.geno <- data$geno
ref.map <- data$map
```
By default, the memory-mapped files are directed into work directory, users could redirect to new path as following:
Missing genotype will be replaced by the major genotype of each allele. By default, the memory-mapped files are directed into work directory, users could redirect to new path as following:
```r
data <- read_plink(bfile=ref_bfile_path, backingpath="./", descriptorfile="test.desc", backingfile="test.bin", threads=1)
data <- read_binary(bfile=ref_bfile_path, out="./test", threads=1)

# directly use for the next time
ref.geno <- attach.big.matrix("./test.desc")
Expand Down Expand Up @@ -108,7 +108,7 @@ Linear Model
-----
```r
ref_bfile_path <- system.file("extdata", "ref_geno", package = "SumTool")
data <- read_plink(bfile=ref_bfile_path, threads=1)
data <- read_binary(bfile=ref_bfile_path, threads=1)
geno <- data$geno
map <- data$map
y <- data$pheno[,1]
Expand All @@ -117,7 +117,7 @@ gwas <- LMreg(y=y, geno=geno, map=map, threads=1, verbose=FALSE)

Impute Zscore
-----
For Zscore imputation, at least 6 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_plink) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.
For Zscore imputation, at least 6 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_binary) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.
```r
# get the path of attached example data on SumTool
ref_file_path <- system.file("extdata", "ref_geno.vcf", package = "SumTool")
Expand All @@ -143,7 +143,7 @@ xx <- SImputeZ(ref.geno=ref.geno, ref.map=ref.map, typed=typed_z, w=1000000, thr
If the individual genotype of summary statistics is available, the imputation accuracy can be improved by using the LD matrix derived from individual genotype rather than reference panel for typed SNPs.
```r
gwas_bfile_path <- system.file("extdata", "gwas_geno", package = "SumTool")
gwas <- read_plink(bfile=gwas_bfile_path, additive=FALSE, threads=1)
gwas <- read_binary(bfile=gwas_bfile_path, additive=FALSE, threads=1)
typed.geno <- gwas$geno
typed.map <- gwas$map
# NOTE: the order of SNPs in 'typed.geno' should be consistent with the order in 'typed_z'.
Expand All @@ -153,7 +153,7 @@ xx <- SImputeZ(ref.geno=ref.geno, ref.map=ref.map, typed=typed_z, typed.geno=typ

Impute Marginal Effect
-----
For BETA and SE imputation, limited 8 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_plink) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.<br>
For BETA and SE imputation, limited 8 columns should be provided in same order with the example for typed SNPs. No need to separate genome into chromosomes. The reference panel can be loaded from both plink binary file (read_binary) or VCF file (read_vcf), we recommend using phased VCF file. Please note that "(..., [additive=FALSE]())" should be added when reading either plink file or VCF file.<br>
***NOTE***: It is not supported to impute multiple traits at a time.
```r
# get the path of attached example data on SumTool
Expand Down Expand Up @@ -321,7 +321,7 @@ head(sumstat)
4 rs3131969 1 754182 A G -0.04744 0.5306 100
5 rs1048488 1 760912 C T -0.07650 0.5416 100
6 rs12562034 1 768448 G A -0.12360 0.5500 100
data <- read_plink(bfile=ref_bfile_path, threads=1)
data <- read_binary(bfile=ref_bfile_path, threads=1)
geno <- data$geno
map <- data$map
h2 <- 0.5
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion man/LDcal.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/LDclump.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit dc84dc7

Please sign in to comment.