The following example makes use of 5 genomes from the MVP.
# Setup for BigQuery access
require(bigrquery)
require(xtable)
require(RCurl)
require(dplyr)
project <- "gbsc-gcp-project-mvp" # put your projectID here
DisplayAndDispatchQuery <- function(queryUri, replacements=list()) {
if(grepl("^https.*", queryUri)) {
querySql <- getURL(queryUri, ssl.verifypeer=FALSE)
} else {
querySql <- readChar(queryUri, nchars=1e6)
}
for(replacement in names(replacements)) {
querySql <- gsub(replacement, replacements[[replacement]], querySql, fixed=TRUE)
}
cat(querySql)
query_exec(querySql, project)
}
table_replacement <- list("_THE_TABLE_"="gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_gvcfs",
"_THE_EXPANDED_TABLE_"="gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_vcfs")
limits = "WHERE
reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/private-variants-brca1.sql",
replacements=c(table_replacement, "_LIMITS_" = limits))
# Private variants within BRCA1.
SELECT
reference_name AS CHROM,
start AS POS,
GROUP_CONCAT(CASE WHEN cnt = 1 THEN 'S'
WHEN cnt = 2 THEN 'D'
ELSE STRING(cnt) END) AS SINGLETON_DOUBLETON,
reference_bases AS REF,
alternate_bases AS ALT,
GROUP_CONCAT(call.call_set_name) AS INDV,
GROUP_CONCAT(genotype) AS genotype,
SUM(num_samples_with_variant) AS num_samples_with_variant
FROM (
SELECT
reference_name,
start,
reference_bases,
alternate_bases,
alt_num,
call.call_set_name,
GROUP_CONCAT(STRING(call.genotype)) WITHIN call AS genotype,
SUM(call.genotype == alt_num) WITHIN call AS cnt,
COUNT(call.call_set_name) WITHIN RECORD AS num_samples_with_variant
FROM (
FLATTEN((
SELECT
reference_name,
start,
reference_bases,
alternate_bases,
POSITION(alternate_bases) AS alt_num,
call.call_set_name,
call.genotype,
FROM
FLATTEN([gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_gvcfs], call)
WHERE
reference_name = 'chr22'
AND call.QUAL >= 30
OMIT
call IF EVERY(call.genotype = -1)
),
alternate_bases)
)
OMIT
RECORD IF alternate_bases IS NULL
HAVING
cnt > 0
)
GROUP EACH BY
chrom,
pos,
ref,
alt
HAVING
num_samples_with_variant = 1
ORDER BY
POS,
INDV
Retrieving data: 3.6s
Retrieving data: 5.4s
Retrieving data: 7.1s
Number of rows returned by this query: 40418.
Displaying the first few rows of the dataframe of results:
CHROM | POS | SINGLETON_DOUBLETON | REF | ALT | INDV | genotype | num_samples_with_variant |
---|---|---|---|---|---|---|---|
chr22 | 16050035 | D | A | C | LP6005038-DNA_A03 | "1,1" | 1 |
chr22 | 16050251 | S | A | T | LP6005038-DNA_A03 | "0,1" | 1 |
chr22 | 16050611 | S | C | G | LP6005038-DNA_B02 | "0,1" | 1 |
chr22 | 16050821 | S | G | A | LP6005038-DNA_B02 | "0,1" | 1 |
chr22 | 16050839 | S | C | G | LP6005038-DNA_A02 | "0,1" | 1 |
chr22 | 16051967 | S | C | A | LP6005038-DNA_B02 | "0,1" | 1 |
limits = "WHERE
reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/homozygous-variants.sql",
replacements=c(table_replacement, "_LIMITS_"=limits))
# Individual Homozygosity
SELECT
INDV,
O_HOM,
ROUND(E_HOM, 2) as E_HOM,
N_SITES,
ROUND((O_HOM - E_HOM) / (N_SITES - E_HOM), 5) AS F
FROM (
SELECT
call.call_set_name AS INDV,
SUM(first_allele = second_allele) AS O_HOM,
SUM(1.0 - (2.0 * freq * (1.0 - freq) * (called_allele_count / (called_allele_count - 1.0)))) AS E_HOM,
COUNT(call.call_set_name) AS N_SITES,
FROM (
SELECT
reference_name,
start,
reference_bases,
GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alternate_bases,
call.call_set_name,
NTH(1,
call.genotype) WITHIN call AS first_allele,
NTH(2,
call.genotype) WITHIN call AS second_allele,
COUNT(alternate_bases) WITHIN RECORD AS num_alts,
SUM(call.genotype >= 0) WITHIN RECORD AS called_allele_count,
IF((SUM(1 = call.genotype) > 0),
SUM(call.genotype = 1)/SUM(call.genotype >= 0),
-1) WITHIN RECORD AS freq
FROM
FLATTEN([gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_vcfs], call)
WHERE
reference_name = 'chr22'
AND call.QUAL >= 30
OMIT
call IF SOME(call.genotype < 0)
OR (2 > COUNT(call.genotype))
HAVING
# Skip 1/2 genotypes _and non-SNP variants
num_alts = 1
AND reference_bases IN ('A','C','G','T')
AND alternate_bases IN ('A','C','G','T')
)
GROUP BY
INDV
)
ORDER BY
INDV
Number of rows returned by this query: 5.
INDV | O_HOM | E_HOM | N_SITES | F |
---|---|---|---|---|
LP6005038-DNA_A01 | 63849 | 444361.00 | 97599 | 1.10 |
LP6005038-DNA_A02 | 64435 | 444939.00 | 97574 | 1.10 |
LP6005038-DNA_A03 | 63852 | 445228.00 | 97681 | 1.10 |
LP6005038-DNA_B01 | 64104 | 449008.00 | 97710 | 1.10 |
LP6005038-DNA_B02 | 64855 | 442527.00 | 97617 | 1.09 |
limits = "WHERE
reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/hardy-weinberg-brca1-expanded.sql",
replacements=c(table_replacement, "_LIMITS_"=limits))
# The following query computes the Hardy-Weinberg equilibrium for BRCA1 variants.
SELECT
CHR,
POS,
ref,
alt,
OBS_HOM1,
OBS_HET,
OBS_HOM2,
E_HOM1,
E_HET,
E_HOM2,
# Chi Squared Calculation
# SUM(((Observed - Expected)^2) / Expected )
ROUND((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1)
+ (POW(OBS_HET - E_HET, 2) / E_HET)
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2), 6)
AS ChiSq,
# Determine if Chi Sq value is significant
IF((POW(OBS_HOM1 - E_HOM1, 2) / E_HOM1)
+ (POW(OBS_HET - E_HET, 2) / E_HET)
+ (POW(OBS_HOM2 - E_HOM2, 2) / E_HOM2)
> 5.991, "TRUE", "FALSE") AS PVALUE_SIG
FROM (
SELECT
CHR,
POS,
ref,
alt,
OBS_HOM1,
OBS_HET,
OBS_HOM2,
# Expected AA
# p^2
# ((COUNT(AA) + (COUNT(Aa)/2) /
# SAMPLE_COUNT) ^ 2) * SAMPLE_COUNT
ROUND(POW((OBS_HOM1 + (OBS_HET/2)) /
SAMPLE_COUNT, 2) * SAMPLE_COUNT, 2)
AS E_HOM1,
# Expected Aa
# 2pq
# 2 * (COUNT(AA) + (COUNT(Aa)/2) / SAMPLE_COUNT) *
# (COUNT(aa) + (COUNT(Aa)/2) / SAMPLE_COUNT)
# * SAMPLE_COUNT
ROUND(2 * ((OBS_HOM1 + (OBS_HET/2)) / SAMPLE_COUNT) *
((OBS_HOM2 + (OBS_HET/2)) / SAMPLE_COUNT)
* SAMPLE_COUNT, 2)
AS E_HET,
# Expected aa
# q^2
# (COUNT(aa) + (COUNT(Aa)/2) /
# SAMPLE_COUNT) ^ 2 * SAMPLE_COUNT
ROUND(POW((OBS_HOM2 + (OBS_HET/2)) /
SAMPLE_COUNT, 2) * SAMPLE_COUNT, 2)
AS E_HOM2,
FROM (
SELECT
reference_name AS CHR,
start AS POS,
reference_bases AS ref,
alternate_bases AS alt,
HOM_REF AS OBS_HOM1,
HET AS OBS_HET,
HOM_ALT AS OBS_HOM2,
HOM_REF + HET + HOM_ALT AS SAMPLE_COUNT,
FROM (
SELECT
reference_name,
start,
END,
reference_bases,
GROUP_CONCAT(alternate_bases) WITHIN RECORD AS alternate_bases,
COUNT(alternate_bases) WITHIN RECORD AS num_alts,
SUM(EVERY(0 = call.genotype)) WITHIN call AS HOM_REF,
SUM(EVERY(1 = call.genotype)) WITHIN call AS HOM_ALT,
SUM(SOME(0 = call.genotype)
AND SOME(1 = call.genotype)) WITHIN call AS HET,
FROM
FLATTEN([gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_vcfs], call)
WHERE
reference_name = 'chr22'
AND call.QUAL >= 30
HAVING
# Skip 1/2 genotypes
num_alts = 1
)))
ORDER BY
CHR,
POS,
ref
Running query: RUNNING 2.7s
Running query: RUNNING 3.4s
Running query: RUNNING 4.0s
Running query: RUNNING 4.7s
Running query: RUNNING 5.3s
Running query: RUNNING 5.9s
Running query: RUNNING 6.6s
Running query: RUNNING 7.3s
Running query: RUNNING 7.9s
Retrieving data: 3.2s
Retrieving data: 5.6s
Retrieving data: 7.7s
Retrieving data: 11.0s
Retrieving data: 13.4s
Retrieving data: 15.9s
Retrieving data: 18.1s
Retrieving data: 20.9s
Retrieving data: 23.7s
Number of rows returned by this query: 100000.
Displaying the first few results:
CHR | POS | ref | alt | OBS_HOM1 | OBS_HET | OBS_HOM2 | E_HOM1 | E_HET | E_HOM2 | ChiSq | PVALUE_SIG |
---|---|---|---|---|---|---|---|---|---|---|---|
chr22 | 16050035 | A | C | 1 | 0 | 0 | 1.00 | 0.00 | 0.00 | FALSE | |
chr22 | 16050035 | A | C | 1 | 0 | 0 | 1.00 | 0.00 | 0.00 | FALSE | |
chr22 | 16050035 | A | C | 1 | 0 | 0 | 1.00 | 0.00 | 0.00 | FALSE | |
chr22 | 16050035 | A | C | 1 | 0 | 0 | 1.00 | 0.00 | 0.00 | FALSE | |
chr22 | 16050035 | A | C | 0 | 0 | 1 | 0.00 | 0.00 | 1.00 | FALSE | |
chr22 | 16050158 | C | T | 1 | 0 | 0 | 1.00 | 0.00 | 0.00 | FALSE |
===============
limits = "AND reference_name = 'chr22'
AND call.QUAL >= 30"
result <- DisplayAndDispatchQuery("./sql/ti-tv-ratio.sql",
replacements=c(table_replacement,"_LIMITS_"=limits))
# Compute the Ti/Tv ratio of the 1,000 Genomes dataset.
SELECT
transitions,
transversions,
transitions/transversions AS titv,
COUNT
FROM (
SELECT
SUM(IF(mutation IN ('A->G',
'G->A',
'C->T',
'T->C'),
INTEGER(num_snps),
INTEGER(0))) AS transitions,
SUM(IF(mutation IN ('A->C',
'C->A',
'G->T',
'T->G',
'A->T',
'T->A',
'C->G',
'G->C'),
INTEGER(num_snps),
INTEGER(0))) AS transversions,
COUNT(mutation) AS COUNT
FROM (
SELECT
CONCAT(reference_bases,
CONCAT(STRING('->'),
alternate_bases)) AS mutation,
COUNT(alternate_bases) AS num_snps,
FROM
FLATTEN([gbsc-gcp-project-mvp:va_aaa_pilot_data.sample_gvcfs], call)
WHERE
LENGTH(alternate_bases) == 1
AND LENGTH(reference_bases) == 1
AND reference_name = 'chr22'
AND call.QUAL >= 30
GROUP BY
mutation,
ORDER BY
mutation))
The result:
transitions | transversions | titv | COUNT |
---|---|---|---|
174857 | 75357 | 2.32 | 12 |