forked from TyckoLab/CloudASM
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcpg_genotype.sh
150 lines (146 loc) · 4.24 KB
/
cpg_genotype.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/bin/bash
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_context_filtered \
--replace=true \
"
WITH
CONTEXT AS (
SELECT
chr,
pos,
meth,
cov,
read_id
FROM ${DATASET_ID}.${SAMPLE}_context
),
SNPS_FOR_CPG AS (
SELECT
chr_snp,
pos_snp
FROM ${DATASET_ID}.${SAMPLE}_snps_for_cpg
),
C_FROM_CONTEXT AS (
SELECT DISTINCT
chr,
pos AS pos_c
FROM CONTEXT
),
REMOVE_C AS (
SELECT chr, pos_c
FROM C_FROM_CONTEXT
EXCEPT DISTINCT
SELECT * FROM SNPS_FOR_CPG
),
CHANGE_TO_G AS (
SELECT
chr AS chr_clean,
pos_c + 1 AS pos_g
FROM REMOVE_C
),
REMOVE_C_AND_G AS (
SELECT * FROM CHANGE_TO_G
EXCEPT DISTINCT
SELECT * FROM SNPS_FOR_CPG
)
SELECT chr, pos, meth, cov, read_id
FROM CONTEXT INNER JOIN REMOVE_C_AND_G
ON chr = chr_clean AND pos = pos_g - 1
"
# Create a long table of all CpG x SNP x read_id combinations
# 90% of CpGs are dropped because their respective reads could not be linked to a REF or ALT of any SNP
# This table will be used in the asm_region calculation.
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_cpg_read_genotype \
--replace=true \
"
WITH
CONTEXT AS (
SELECT *
FROM ${DATASET_ID}.${SAMPLE}_context_filtered
),
GENOTYPE AS (
SELECT
snp_id,
pos AS snp_pos,
read_id AS geno_read_id,
allele
FROM ${DATASET_ID}.${SAMPLE}_vcf_reads_genotype
)
-- create a table with each combination of snp, CpG, read_id, REF, ALT
SELECT DISTINCT
chr,
pos,
meth,
cov,
snp_id,
snp_pos,
allele,
read_id
FROM CONTEXT
INNER JOIN GENOTYPE
ON read_id = geno_read_id
"
# This file will be used to compute single CPG ASM
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_cpg_genotype \
--replace=true \
"
WITH
----------------------------------------------------------
-- Find CpGs that are at least covered 5x on both REF and ALT for a SNP
----------------------------------------------------------
CLEAN AS (
SELECT * FROM ${DATASET_ID}.${SAMPLE}_cpg_read_genotype
),
REF_ALLELES AS (
SELECT
chr,
pos,
snp_id,
snp_pos,
SUM(meth) AS ref_meth,
SUM(cov) AS ref_cov
FROM CLEAN
WHERE allele = 'REF'
GROUP BY chr, pos, snp_id, snp_pos
),
ALT_ALLELES AS (
SELECT
chr AS chr_alt,
pos AS pos_alt,
snp_id AS snp_id_alt,
SUM(meth) AS alt_meth,
SUM(cov) AS alt_cov
FROM CLEAN
WHERE allele = 'ALT'
GROUP BY chr_alt, pos_alt, snp_id_alt
),
REF_AND_ALT AS (
SELECT * FROM REF_ALLELES
INNER JOIN ALT_ALLELES
ON chr = chr_alt AND
pos = pos_alt AND
snp_id = snp_id_alt
)
SELECT
chr,
pos,
snp_id,
snp_pos,
ref_cov,
ref_meth,
alt_cov,
alt_meth
FROM REF_AND_ALT
-- we require that each allele is covered 5x
WHERE ref_cov >=${CPG_COV} AND alt_cov >= ${CPG_COV}
"
# Save file in bucket to use it in a Python script to compute a Fisher exact test
bq extract \
--field_delimiter "," \
--print_header=true \
${DATASET_ID}.${SAMPLE}_cpg_genotype \
gs://$OUTPUT_B/$SAMPLE/asm/${SAMPLE}_cpg_genotype.csv