forked from RobertKSuter/CloudASM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathappend_context.sh
155 lines (141 loc) · 4.2 KB
/
append_context.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/bin/bash
# This process will filter out about 30% of all CpG
#because they do not have at least 20% methylation or 10x coverage
# Re-order the OB file and transform methylation status in 1 or 0
echo "Merge the OB and OT"
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_both_context_tmp \
--replace=true \
"WITH
OB AS (
SELECT
read_id,
chr,
-- Bismark gives the position of the C in the negative strand. need to
-- subtract 1 to have the position of the CG in positive strand.
pos-1 as pos,
IF(meth_call = 'Z',1,0) as meth,
1 as cov
FROM
${DATASET_ID}.${SAMPLE}_CpGOB
),
OT AS (
SELECT
read_id,
chr,
pos,
IF(meth_call = 'Z',1,0) as meth,
1 as cov
FROM
${DATASET_ID}.${SAMPLE}_CpGOT
)
SELECT * FROM OB
UNION ALL
SELECT * FROM OT"
echo "Sum methylation and coverage per CpG. Keep at least 10x cov."
# We impose a coverage of at least 10x per CpG.
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_merged_context_bed \
--replace=true \
"
WITH MERGED_STRANDS AS (
SELECT
chr,
pos AS pos_start,
pos AS pos_end,
SUM(cov) AS cov,
SUM(meth)/SUM(cov) AS meth_perc
FROM
${DATASET_ID}.${SAMPLE}_both_context_tmp
GROUP BY
chr, pos
)
SELECT
chr,
pos_start,
pos_end,
cov,
meth_perc
FROM
MERGED_STRANDS
WHERE
-- 2 x the minimum coverage required by allele (no allele distinction at this stage)
cov >= 2*${CPG_COV}
"
echo "Filter out from both_context_tmp the CpG sites that do not have at least 10x cov"
# This removes about 20% of all CpG flagged by Bismark.
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_context \
--replace=true \
"
WITH
MERGED_CONTEXT_FILT AS (
SELECT
chr AS chr_cpg,
pos_start AS pos_cpg
FROM
${DATASET_ID}.${SAMPLE}_merged_context_bed
)
SELECT
read_id,
chr,
pos,
meth,
cov
FROM
${DATASET_ID}.${SAMPLE}_both_context_tmp
INNER JOIN
MERGED_CONTEXT_FILT ON
chr_cpg = chr
AND pos_cpg = pos
"
echo "Methylation percentage bedgraph"
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_methyperc_bedgraph \
--replace=true \
" SELECT
chr,
pos_start,
pos_end,
meth_perc
FROM
${DATASET_ID}.${SAMPLE}_merged_context_bed
"
echo "CpG coverage bedgraph"
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph \
--replace=true \
" SELECT
chr,
pos_start,
pos_end,
cov
FROM
${DATASET_ID}.${SAMPLE}_merged_context_bed
"
################### Export bedgraph files to bucket
echo "Export bedgraph files of net methylation and coverage"
bq extract \
--field_delimiter "\t" \
--print_header=false \
${DATASET_ID}.${SAMPLE}_methyperc_bedgraph \
gs://$OUTPUT_B/$SAMPLE/bedgraph/${SAMPLE}_methyperc.bedgraph
bq extract \
--field_delimiter "\t" \
--print_header=false \
${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph \
gs://$OUTPUT_B/$SAMPLE/bedgraph/${SAMPLE}_CpGcov.bedgraph
########################## Delete most BQ files
echo "Delete bedgraph files from BQ"
bq rm -f -t ${DATASET_ID}.${SAMPLE}_both_context_tmp
bq rm -f -t ${DATASET_ID}.${SAMPLE}_methyperc_bedgraph
bq rm -f -t ${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph
bq rm -f -t ${DATASET_ID}.${SAMPLE}_merged_context_bed
echo "Delete raw files"
bq rm -f -t ${DATASET_ID}.${SAMPLE}_CpGOB
bq rm -f -t ${DATASET_ID}.${SAMPLE}_CpGOT