Skip to content

Commit

Permalink
add new functions
Browse files Browse the repository at this point in the history
  • Loading branch information
YinLiLin committed Sep 30, 2020
1 parent dc84dc7 commit a0b02f1
Show file tree
Hide file tree
Showing 7 changed files with 306 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export(FileNcol)
export(FileNrow)
export(LDcal)
export(LDclump)
export(LDcor)
export(LDprune)
export(LDreg)
export(LDscore)
Expand Down
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ SImpute_LD_norm_c <- function(pBigMat, index = NULL, chisq = 0L, lambda = 0, hap
.Call(`_SumTool_SImpute_LD_norm_c`, pBigMat, index, chisq, lambda, haps, threads, verbose)
}

LDcor_c <- function(pBigMat1, pBigMat2, index1, index2, threads = 0L) {
.Call(`_SumTool_LDcor_c`, pBigMat1, pBigMat2, index1, index2, threads)
}

LDprune_c <- function(pBigMat, index, r2_cutoff = 0.2, threads = 0L, verbose = TRUE) {
.Call(`_SumTool_LDprune_c`, pBigMat, index, r2_cutoff, threads, verbose)
}
Expand Down
120 changes: 120 additions & 0 deletions R/SumTool.r
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,126 @@ LDcal <- function(geno = NULL, index = NULL, threads = 1, lambda = 0, chisq = 0,
return(bigmat)
}

#' LD correlation
#'
#' To compare the LD structure between two populations
#'
#' @param geno1 big.matrix (n1 * m1), genotype
#' @param map1 data.frame, the genomic information of SNPs. The columns should be in the order of c("SNP", "Chr", "Pos", "A1", "A2")
#' @param geno2 big.matrix (n2 * m2), genotype
#' @param map2 data.frame, the genomic information of SNPs. The columns should be in the order of c("SNP", "Chr", "Pos", "A1", "A2")
#' @param w int, size of windows in bp. Default is 1e6
#' @param threads number, the number of used threads for parallel process
#' @param verbose logical, whether to print the log information

#' @examples
#' ref_file_path <- system.file("extdata", "ref_geno", package = "SumTool")
#' gwas_file_path <- system.file("extdata", "gwas_geno", package = "SumTool")
#' data1 <- read_binary(bfile=ref_file_path, threads=1, verbose=FALSE, out=tempfile())
#' ref.geno <- data1$geno
#' ref.map <- data1$map
#' data2 <- read_binary(bfile=gwas_file_path, threads=1, verbose=FALSE, out=tempfile())
#' gwas.geno <- data2$geno
#' gwas.map <- data2$map
#' ldcor <- LDcor(geno1=ref.geno, map1=ref.map, geno2=gwas.geno, map2=gwas.map, threads=10, verbose=TRUE)

#' @export

LDcor <- function(geno1 = NULL, map1 = NULL, geno2 = NULL, map2 = NULL, w = 1000000, threads = 1, verbose = TRUE)
{
if(verbose) version.info()
t1 <- as.numeric(Sys.time())

if(verbose) cat("Analysis started:", as.character(Sys.time()), "\n")
if(is.null(geno1)) stop("Please provide geno1!")
if(!is.big.matrix(geno1)) stop("geno1 should be in 'big.matrix' format!")
if(is.null(map1)) stop("Please provide map1!")
if(is.null(geno2)) stop("Please provide geno2!")
if(!is.big.matrix(geno2)) stop("geno2 should be in 'big.matrix' format!")
if(is.null(map2)) stop("Please provide map2!")
if(nrow(map1) != ncol(geno1)) stop("Number of SNPs not equals between geno1 and map1.")
if(nrow(map2) != ncol(geno2)) stop("Number of SNPs not equals between geno2 and map2.")
if(!all((geno1[, 1]) <= 2)) stop("genotype should be coded as 0 1 2 in geno1!")
if(!all((geno2[, 1]) <= 2)) stop("genotype should be coded as 0 1 2 in geno2!")
if(any(duplicated(map1[, 1]))) stop("duplicated SNP names exist in map1.")
if(any(duplicated(map2[, 1]))) stop("duplicated SNP names exist in map2.")
if(ncol(map1) != 5) stop("Only 5 columns limited for map1! (snp, chr, pos, a1, a2)")
if(ncol(map2) != 5) stop("Only 5 columns limited for map2! (snp, chr, pos, a1, a2)")
if(!is.numeric(map1[, 3])) stop("Physical position in map1 should be numeric defined!")
if(!is.numeric(map2[, 3])) stop("Physical position in map2 should be numeric defined!")
for(i in 1 : ncol(map1)){
if(is.factor(map1[, i])) map1[, i] <- as.character.factor(map1[, i])
if(is.factor(map2[, i])) map2[, i] <- as.character.factor(map2[, i])
}

mergedSNP <- intersect(map1[, 1], map2[, 1])
if(length(mergedSNP) == 0) stop("No shared SNPs between geno1 and geno2!")
row_logi <- NULL
geno1_index <- match(mergedSNP, map1[, 1])
geno2_index <- match(mergedSNP, map2[, 1])
for(i in 2:ncol(map1)){
row_logi <- cbind(row_logi, map1[geno1_index, i] == map2[geno2_index, i])
}
index <- apply(row_logi, 1, all); rm(row_logi); gc()
if(!all(index)){
stop(sum(!index), " SNPs sharing same names between geno1 and geno2 don't have equal map information!\nSee: ", paste(head(mergedSNP[!index]), collapse=","), "...\n")
}

if(verbose) cat("Number of individuals in geno1:", nrow(geno1), "\n")
if(verbose) cat("Number of SNPs in geno1:", ncol(geno1), "\n")
if(verbose) cat("Number of individuals in geno2:", nrow(geno2), "\n")
if(verbose) cat("Number of SNPs in geno2:", ncol(geno2), "\n")
if(verbose) cat("Number of shared SNPs between geno1 and geno2:", length(mergedSNP), "\n")

chr <- unique(map1[, 2][geno1_index])

res <- NULL
for(chri in chr){
chri_index <- map1[, 2][geno1_index] == chri
if(verbose) cat("Loop on chromosome", chri, "with", sum(chri_index), "SNPs\n")
# chri_pos_min <- min(sumstat[chri_index, 3])
chri_pos_min <- 1
chri_pos_max <- max(map1[geno1_index, 3])
loop <- TRUE
wind_min <- chri_pos_min
wind_max <- chri_pos_min + w
while(loop){
wind_min <- c(wind_min, wind_max[length(wind_max)] + 1)
wind_max <- c(wind_max, wind_max[length(wind_max)] + w)
if((wind_max[length(wind_max)]) >= chri_pos_max) loop <- FALSE
}
if(w >= chri_pos_max){
wind_min <- wind_min[1]
wind_max <- wind_max[1]
}
wind_max[length(wind_max)] <- chri_pos_max
if(verbose) cat("Total Number of windows: ", length(wind_min), "\n", sep="")
mcf <- function(i){
snp_index <- chri_index & map1[, 3][geno1_index] >= wind_min[i] & map1[, 3][geno1_index] <= wind_max[i]
snp <- map1[geno1_index, 1][snp_index]
if(length(snp) != 0){
index1 <- match(snp, map1[, 1])
index2 <- match(snp, map2[, 1])
if(verbose) cat("The ", i, "th window: Start[", min(map1[index1, 3], na.rm = TRUE),"] ~ End[", max(map1[index1, 3], na.rm = TRUE),"]\r", sep="")
resi <- LDcor_c(geno1@address, geno2@address, index1 = index1 - 1, index2 = index2 - 1, threads = threads)
}else{
resi <- NULL
}
return(resi)
}
chr_res <- lapply(1 : length(wind_min), mcf)
if(verbose) cat("\n");
res <- rbind(res, do.call(rbind, chr_res))
rm(chr_res); gc()
}
res <- data.frame(map1[geno1_index, ], res)
colnames(res) <- c("SNP", "Chr", "Pos", "A1", "A2", "r", "P-value")
t2 <- as.numeric(Sys.time())
if(verbose) cat("Analysis finished:", as.character(Sys.time()), "\n")
if(verbose) cat("Total Running time:", times(t2-t1), "\n")
return(res)
}

#' Generalized Linear Model
#'
#' To do association using Generalized Linear Model
Expand Down
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Features
- [Linkage disequilibrium matrix](#linkage-disequilibrium)
- [Linkage disequilibrium score](#ld-score)
- [Linkage disequilibrium pruning](#ld-pruning)/[clumping](#ld-clumping)
- [Linkage disequilibrium comparison](#ld-comparison)
- Association
- [Linear Model](#linear-model)
- Imputation
Expand Down Expand Up @@ -104,6 +105,21 @@ head(pdata)
snp <- LDclump(geno = ref.geno, map = ref.map, p = pdata, p.cutoff = 1, r2.cutoff = 0.25, w = 100000, threads = 1)
```

LD Comparison
-----
To compare the LD structure difference between two populations, the values of the column named "r" in output dataframe are more closer to 1, representing the SNPs match better for those two populations. We recommend deleting SNPs with r < 0.95 prior to downstream analysis.
```r
ref_file_path <- system.file("extdata", "ref_geno", package = "SumTool")
gwas_file_path <- system.file("extdata", "gwas_geno", package = "SumTool")
data1 <- read_binary(bfile=ref_file_path, threads=1, verbose=FALSE, out=tempfile())
ref.geno <- data1$geno
ref.map <- data1$map
data2 <- read_binary(bfile=gwas_file_path, threads=1, verbose=FALSE, out=tempfile())
gwas.geno <- data2$geno
gwas.map <- data2$map
ldcor <- LDcor(geno1=ref.geno, map1=ref.map, geno2=gwas.geno, map2=gwas.map, threads=10, verbose=TRUE)
```

Linear Model
-----
```r
Expand Down
45 changes: 45 additions & 0 deletions man/LDcor.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,21 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// LDcor_c
arma::mat LDcor_c(SEXP pBigMat1, SEXP pBigMat2, const IntegerVector index1, const IntegerVector index2, const int threads);
RcppExport SEXP _SumTool_LDcor_c(SEXP pBigMat1SEXP, SEXP pBigMat2SEXP, SEXP index1SEXP, SEXP index2SEXP, SEXP threadsSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< SEXP >::type pBigMat1(pBigMat1SEXP);
Rcpp::traits::input_parameter< SEXP >::type pBigMat2(pBigMat2SEXP);
Rcpp::traits::input_parameter< const IntegerVector >::type index1(index1SEXP);
Rcpp::traits::input_parameter< const IntegerVector >::type index2(index2SEXP);
Rcpp::traits::input_parameter< const int >::type threads(threadsSEXP);
rcpp_result_gen = Rcpp::wrap(LDcor_c(pBigMat1, pBigMat2, index1, index2, threads));
return rcpp_result_gen;
END_RCPP
}
// LDprune_c
arma::uvec LDprune_c(SEXP pBigMat, const IntegerVector index, const double r2_cutoff, const int threads, const bool verbose);
RcppExport SEXP _SumTool_LDprune_c(SEXP pBigMatSEXP, SEXP indexSEXP, SEXP r2_cutoffSEXP, SEXP threadsSEXP, SEXP verboseSEXP) {
Expand Down Expand Up @@ -401,6 +416,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_SumTool_rVCF_c", (DL_FUNC) &_SumTool_rVCF_c, 8},
{"_SumTool_SImpute_LD_bigm_c", (DL_FUNC) &_SumTool_SImpute_LD_bigm_c, 8},
{"_SumTool_SImpute_LD_norm_c", (DL_FUNC) &_SumTool_SImpute_LD_norm_c, 7},
{"_SumTool_LDcor_c", (DL_FUNC) &_SumTool_LDcor_c, 5},
{"_SumTool_LDprune_c", (DL_FUNC) &_SumTool_LDprune_c, 5},
{"_SumTool_LDclump_c", (DL_FUNC) &_SumTool_LDclump_c, 6},
{"_SumTool_ldreg_h2", (DL_FUNC) &_SumTool_ldreg_h2, 8},
Expand Down
104 changes: 104 additions & 0 deletions src/ldcor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#if !defined(ARMA_64BIT_WORD)
#define ARMA_64BIT_WORD 1
#endif

#include "stats.h"
#include "probar.h"
#include <string.h>

arma::rowvec cortest(arma::vec a, arma::vec b){
arma::rowvec res(2);
int n = a.n_elem;
double r = as_scalar(cor(a, b));
res[0] = r;
double t = r * sqrt(n - 2) / sqrt(1 - r * r);
double p = 2 * R::pt(abs(t), n - 2, false, false);
res[1] = p;
return res;
}

template <typename T>
arma::mat LDcor_c(XPtr<BigMatrix> pMat1, XPtr<BigMatrix> pMat2, const IntegerVector index1, const IntegerVector index2, const int threads=0){

omp_setup(threads);

MatrixAccessor<T> genomat1 = MatrixAccessor<T>(*pMat1);
MatrixAccessor<T> genomat2 = MatrixAccessor<T>(*pMat2);

int m = index1.size();
int ind1 = pMat1->nrow();
int ind2 = pMat2->nrow();
int i, j, k;
double p1 = 0.0, m1 = 0.0, s1 = 0.0, p2 = 0.0, m2 = 0.0, s2 = 0.0, p12 = 0.0, r = 0.0;
List Stat1 = BigStat(pMat1, index1, threads);
List Stat2 = BigStat(pMat2, index2, threads);
NumericVector mean_all1 = Stat1[0];
NumericVector sd_all1 = Stat1[1];
NumericVector sum_all1 = Stat1[2];
NumericVector mean_all2 = Stat2[0];
NumericVector sd_all2 = Stat2[1];
NumericVector sum_all2 = Stat2[2];

arma::mat ldm1(m, m);
arma::mat ldm2(m, m);
arma::mat res(m, 2);

#pragma omp parallel for private(j, p1, m1, s1, s2, i, k, p12, p2, m2, r)
for (j = 0; j < m; j++){
p1 = sd_all1[j];
m1 = mean_all1[j];
s1 = sum_all1[j];
for(i = j; i < m; i++){
p12 = 0;
p2 = sd_all1[i];
m2 = mean_all1[i];
s2 = sum_all1[i];
for(k = 0; k < ind1; k++){
p12 += (genomat1[index1[i]][k] * genomat1[index1[j]][k]);
}
p12 -= s1 * m2 + s2 * m1 - ind1 * m1 * m2;
r = p12 / (p1 * p2);
ldm1(i, j) = ldm1(j, i) = r;
}
p1 = sd_all2[j];
m1 = mean_all2[j];
s1 = sum_all2[j];
for(i = j; i < m; i++){
p12 = 0;
p2 = sd_all2[i];
m2 = mean_all2[i];
s2 = sum_all2[i];
for(k = 0; k < ind2; k++){
p12 += (genomat2[index2[i]][k] * genomat2[index2[j]][k]);
}
p12 -= s1 * m2 + s2 * m1 - ind2 * m1 * m2;
r = p12 / (p1 * p2);
ldm2(i, j) = ldm2(j, i) = r;
}
res(j, 0) = as_scalar(cor(ldm1.col(j), ldm2.col(j)));
double t = res(j, 0) * sqrt(m - 2) / sqrt(1 - res(j, 0) * res(j, 0));
res(j, 1) = 2 * R::pt(abs(t), m - 2, false, false);
}

return res;
}

// [[Rcpp::export]]
arma::mat LDcor_c(SEXP pBigMat1, SEXP pBigMat2, const IntegerVector index1, const IntegerVector index2, const int threads=0){

XPtr<BigMatrix> xpMat1(pBigMat1);
XPtr<BigMatrix> xpMat2(pBigMat2);

switch(xpMat1->matrix_type()){
case 1:
return LDcor_c<char>(xpMat1, xpMat2, index1, index2, threads);
case 2:
return LDcor_c<short>(xpMat1, xpMat2, index1, index2, threads);
case 4:
return LDcor_c<int>(xpMat1, xpMat2, index1, index2, threads);
case 8:
return LDcor_c<double>(xpMat1, xpMat2, index1, index2, threads);
default:
throw Rcpp::exception("unknown type detected for big.matrix object!");
}
}

0 comments on commit a0b02f1

Please sign in to comment.