forked from TyckoLab/CloudASM
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathread_genotype.sh
214 lines (208 loc) · 8.45 KB
/
read_genotype.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
#!/bin/bash
# CIGARs strings always start with a "M" (match). They are like 234M3D32M
# Then they may have an insertion or deletion and then a match
# First step: find nucleotide of the variant in the read and remove the combinations where the score of the snp is not high enough
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_vcf_reads_for_genotyping \
--replace=true \
"
WITH
-- transform the vcf_reads table by extracting the CIGAR string
-- select only rows where the SNP is actually in the read. The other reads will be tagged later with REF or ALT based on their respective R1/R2
VCF_IN_READS AS (
SELECT
snp_id,
ref,
alt,
chr,
pos,
CT_strand,
GA_strand,
read_start,
read_end,
read_id,
cigar,
-- we split the CIGAR in 2 arrays, one with letters, one with their corresponding numbers.
REGEXP_EXTRACT_ALL(cigar, r'[M|D|I]') AS a_cig_letters,
REGEXP_EXTRACT_ALL(cigar, r'[0-9]+') AS a_cig_num,
pos - read_start + 1 AS snp_pos_in_read,
seq_CT_strand,
seq_GA_strand,
seq,
score_before_recal
FROM ${DATASET_ID}.${SAMPLE}_vcf_reads
),
CIG_NUM AS (
SELECT
-- create a column for the number of letters in the CIGAR string (1, 3, 5, or 7)
ARRAY_LENGTH(a_cig_num) AS length_cig,
*
FROM VCF_IN_READS
),
CIG_NUM_REFINED AS (
SELECT
SAFE_CAST(a_cig_num[offset(0)] AS INT64) AS first_m,
IF(length_cig >= 3,
SAFE_CAST(a_cig_num[offset(0)] AS INT64)
+ SAFE_CAST(a_cig_num[offset(2)] AS INT64)
+ IF( a_cig_letters[offset(1)] = 'I', -1, 1)*SAFE_CAST(a_cig_num[offset(1)] AS INT64),
0)
AS second_m,
IF(length_cig >= 5,
SAFE_CAST(a_cig_num[offset(0)] AS INT64)
+ SAFE_CAST(a_cig_num[offset(2)] AS INT64)
+ SAFE_CAST(a_cig_num[offset(4)] AS INT64)
+ IF(a_cig_letters[offset(1)] = 'I', 1, -1) * SAFE_CAST(a_cig_num[offset(1)] AS INT64)
+ IF(a_cig_letters[offset(3)] = 'I', 1, -1) * SAFE_CAST(a_cig_num[offset(3)] AS INT64),
0)
AS third_m,
*
FROM CIG_NUM
),
-- first table: the SNP is located before the first deletion or insertion
FIRST_MATCH AS (
SELECT
SUBSTR(seq, snp_pos_in_read, 1) as snp_in_read,
-- We add 5 to take into account the tag OQ:Z:
SUBSTR(score_before_recal, 5 + snp_pos_in_read, 1) AS snp_score,
snp_id,
ref,
alt,
chr,
pos,
CT_strand,
GA_strand,
seq_CT_strand,
seq_GA_strand,
read_id
FROM
CIG_NUM_REFINED
WHERE
first_m >= snp_pos_in_read
),
-- second table: the SNP is located after the first deletion or insertion
SECOND_MATCH AS (
SELECT
SUBSTR(seq, snp_pos_in_read + IF( a_cig_letters[offset(1)] = 'I', 1, -1)*SAFE_CAST(a_cig_num[offset(1)] AS INT64), 1) as snp_in_read,
SUBSTR(score_before_recal, 5 + snp_pos_in_read + IF( a_cig_letters[offset(1)] = 'I', 1, -1)*SAFE_CAST(a_cig_num[offset(1)] AS INT64), 1) as snp_score,
snp_id,
ref,
alt,
chr,
pos,
CT_strand,
GA_strand,
seq_CT_strand,
seq_GA_strand,
read_id
FROM
CIG_NUM_REFINED
WHERE
first_m < snp_pos_in_read AND second_m >= snp_pos_in_read
),
-- third table: the SNP is located after the second deletion or insertion
THIRD_MATCH AS (
SELECT
SUBSTR(seq, snp_pos_in_read + IF( a_cig_letters[offset(1)] = 'I', 1, -1)*SAFE_CAST(a_cig_num[offset(1)] AS INT64) + IF( a_cig_letters[offset(3)] = 'I', 1, -1)*SAFE_CAST(a_cig_num[offset(3)] AS INT64), 1) as snp_in_read,
SUBSTR(score_before_recal, 5+ snp_pos_in_read + IF( a_cig_letters[offset(1)] = 'I', 1, -1)*SAFE_CAST(a_cig_num[offset(1)] AS INT64) + IF( a_cig_letters[offset(3)] = 'I', 1, -1)*SAFE_CAST(a_cig_num[offset(3)] AS INT64), 1) as snp_score,
snp_id,
ref,
alt,
chr,
pos,
CT_strand,
GA_strand,
seq_CT_strand,
seq_GA_strand,
read_id
FROM
CIG_NUM_REFINED
WHERE
first_m < snp_pos_in_read AND second_m < snp_pos_in_read AND third_m >= snp_pos_in_read
),
-- Because of insertions, the snp may actually not be in the read even though its position in within the boundaries of the snp
-- so we demand that a SNP was found
-- We also demand a minimum snp score
FOR_GENOTYPING AS (
SELECT * FROM THIRD_MATCH
WHERE BYTE_LENGTH(snp_in_read) = 1 AND SAFE_CAST(TO_CODE_POINTS(snp_score)[offset(0)] AS INT64) >= ${SNP_SCORE}
UNION ALL
SELECT * FROM SECOND_MATCH
WHERE BYTE_LENGTH(snp_in_read) = 1 AND SAFE_CAST(TO_CODE_POINTS(snp_score)[offset(0)] AS INT64) >= ${SNP_SCORE}
UNION ALL
SELECT * FROM FIRST_MATCH
WHERE BYTE_LENGTH(snp_in_read) = 1 AND SAFE_CAST(TO_CODE_POINTS(snp_score)[offset(0)] AS INT64) >= ${SNP_SCORE}
)
SELECT * FROM FOR_GENOTYPING
"
bq query \
--use_legacy_sql=false \
--destination_table ${DATASET_ID}.${SAMPLE}_vcf_reads_genotype \
--replace=true \
"
WITH FOR_GENOTYPING AS (
SELECT * FROM ${DATASET_ID}.${SAMPLE}_vcf_reads_for_genotyping
),
-- table when the SNP can be found on only the CT or GA strands. We require that the SNP found in the read is
-- equal to REF or ALT. In some rare cases, it is not.
ONLY_CT_OR_GA AS (
SELECT
snp_id,
chr,
pos,
read_id,
if (snp_in_read = ref, 'REF', if (snp_in_read = alt, 'ALT', 'bad_snp')) AS allele
FROM
FOR_GENOTYPING
WHERE
CT_strand = FALSE OR GA_STRAND = FALSE
),
BOTH_STRANDS AS (
-- table when the SNP can be found on both strands
SELECT
snp_id,
chr,
pos,
read_id,
if (snp_in_read = ref
OR (ref = 'C' AND seq_CT_strand = TRUE AND snp_in_read = 'T')
OR (ref = 'G' AND seq_GA_strand = TRUE AND snp_in_read = 'A'),
'REF',
if (snp_in_read = alt
OR (alt = 'C' AND seq_CT_strand = TRUE AND snp_in_read = 'T')
OR (alt = 'G' AND seq_GA_strand = TRUE AND snp_in_read = 'A')
, 'ALT','bad_snp')) AS allele
FROM FOR_GENOTYPING
WHERE
CT_strand = TRUE AND GA_STRAND = TRUE
),
-- remove the SNPs that could be not be identified for sure.
GOOD_SNPS AS (
SELECT * FROM BOTH_STRANDS WHERE allele != 'bad_snp'
UNION ALL
SELECT * FROM ONLY_CT_OR_GA WHERE allele != 'bad_snp'
),
-- Need to remove the (snp_id, read_id) combination where the snp_id has been found to be both ref and alt (that can happen with paired reads both overlapping the snp position)
GROUP_BY_READID AS (
SELECT snp_id, read_id, COUNT(DISTINCT allele) AS nb_alleles
FROM GOOD_SNPS
GROUP BY read_id, snp_id
),
SNP_READ_TO_KEEP AS (
SELECT snp_id AS snp_id_keep, read_id AS read_id_keep
FROM GROUP_BY_READID
WHERE nb_alleles = 1
)
-- Need to rejoin to fully-blown list because it was aggregated by read_id
SELECT
snp_id,
pos,
allele,
read_id
FROM GOOD_SNPS
INNER JOIN SNP_READ_TO_KEEP
ON snp_id = snp_id_keep AND read_id = read_id_keep
"
# Delete intermediary files
bq rm -f -t ${DATASET_ID}.${SAMPLE}_vcf_reads_ready_for_genotyping