Skip to content

Commit

Permalink
Removed project_id variable from most scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
elpdumont committed Nov 13, 2019
1 parent ade55bb commit e470db2
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 30 deletions.
18 changes: 9 additions & 9 deletions append_context.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
echo "Merge the OB and OT"
bq query \
--use_legacy_sql=false \
--destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_both_context_tmp \
--destination_table ${DATASET_ID}.${SAMPLE}_both_context_tmp \
--replace=true \
"WITH
OB AS (
Expand Down Expand Up @@ -40,7 +40,7 @@ echo "Sum methylation and coverage per CpG. Keep at least 10x cov."
# We impose a coverage of at least 10x per CpG.
bq query \
--use_legacy_sql=false \
--destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_merged_context_bed \
--destination_table ${DATASET_ID}.${SAMPLE}_merged_context_bed \
--replace=true \
"
WITH MERGED_STRANDS AS (
Expand Down Expand Up @@ -72,7 +72,7 @@ echo "Filter out from both_context_tmp the CpG sites that do not have at least 1
# This removes about 20% of all CpG flagged by Bismark.
bq query \
--use_legacy_sql=false \
--destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_context \
--destination_table ${DATASET_ID}.${SAMPLE}_context \
--replace=true \
"
WITH
Expand Down Expand Up @@ -100,7 +100,7 @@ bq query \
echo "Methylation percentage bedgraph"
bq query \
--use_legacy_sql=false \
--destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_methyperc_bedgraph \
--destination_table ${DATASET_ID}.${SAMPLE}_methyperc_bedgraph \
--replace=true \
" SELECT
chr,
Expand All @@ -114,7 +114,7 @@ bq query \
echo "CpG coverage bedgraph"
bq query \
--use_legacy_sql=false \
--destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph \
--destination_table ${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph \
--replace=true \
" SELECT
chr,
Expand Down Expand Up @@ -145,10 +145,10 @@ bq extract \
########################## Delete most BQ files

echo "Delete bedgraph files from BQ"
bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_both_context_tmp
bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_methyperc_bedgraph
bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph
bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_merged_context_bed
bq rm -f -t ${DATASET_ID}.${SAMPLE}_both_context_tmp
bq rm -f -t ${DATASET_ID}.${SAMPLE}_methyperc_bedgraph
bq rm -f -t ${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph
bq rm -f -t ${DATASET_ID}.${SAMPLE}_merged_context_bed

echo "Delete raw files"
bq rm -f -t ${DATASET_ID}.${SAMPLE}_CpGOB
Expand Down
2 changes: 1 addition & 1 deletion clean_sam.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Clean the SAM
bq query \
--use_legacy_sql=false \
--destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_recal_sam \
--destination_table ${DATASET_ID}.${SAMPLE}_recal_sam \
--replace=true \
"SELECT
read_id,
Expand Down
5 changes: 4 additions & 1 deletion clean_vcf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Remove the rows where the snp_id is not in the form of "rs"
bq query \
--use_legacy_sql=false \
--destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_vcf \
--destination_table ${DATASET_ID}.${SAMPLE}_vcf \
--replace=true \
"WITH
-- We create a file with a 500bp window around the SNP and calculate the cov of the SNP
Expand Down Expand Up @@ -46,3 +46,6 @@ bq query \
FROM
variants
"

# Delete VCF file that was uploaded
# NOTE(review): the table removed here, ${DATASET_ID}.${SAMPLE}_vcf, has the same
# name as the --destination_table written by the bq query earlier in this script,
# so this drops the freshly cleaned table. If the intent is to drop the raw
# uploaded table instead, it presumably has a different name -- TODO confirm.
bq rm -f -t ${DATASET_ID}.${SAMPLE}_vcf
12 changes: 5 additions & 7 deletions master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,6 @@ dsub \
--zones $ZONE_ID \
--image $DOCKER_GCP \
--logging $LOG \
--env PROJECT_ID="${PROJECT_ID}" \
--env DATASET_ID="${DATASET_ID}" \
--env CPG_COV="${CPG_COV}" \
--env OUTPUT_B="${OUTPUT_B}" \
Expand Down Expand Up @@ -559,8 +558,7 @@ dsub \
--name 'bam-to-sam' \
--wait


## Second export SAM to BigQuery and delete it from the bucket (takes too much space)
######## Export all SAM to BigQuery

# Prepare TSV file
echo -e "--env SAMPLE\t--env SAM" > sam_to_bq.tsv
Expand All @@ -569,13 +567,12 @@ while read SAMPLE ; do
for CHR in `seq 1 22` X Y ; do
echo -e "$SAMPLE\tgs://$OUTPUT_B/${SAMPLE}/sam/${SAMPLE}_chr${CHR}_recal.sam" >> sam_to_bq.tsv
done

# Delete existing SAM on big query
bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_recal_sam_uploaded

done < sample_id.txt

# We append all chromosomes in the same file.
# Takes 2 minutes
dsub \
--provider google-v2 \
--project $PROJECT_ID \
Expand All @@ -596,20 +593,21 @@ dsub \
--wait

# Clean the SAM on BigQuery
# Takes 1 minute
dsub \
--provider google-v2 \
--project $PROJECT_ID \
--zones $ZONE_ID \
--image ${DOCKER_GCP} \
--logging $LOG \
--env DATASET_ID="${DATASET_ID}" \
--env PROJECT_ID="${PROJECT_ID}" \
--script ${SCRIPTS}/clean_sam.sh \
--tasks all_samples.tsv \
--wait

# Delete the SAM files from the bucket (they take a lot of space)
# and the raw SAM files from BigQuery
# Takes 2 minutes
dsub \
--provider google-v2 \
--project $PROJECT_ID \
Expand Down Expand Up @@ -651,6 +649,7 @@ dsub \


# We append all chromosomes files in the same file.
# Takes about 4 minutes
dsub \
--provider google-v2 \
--project $PROJECT_ID \
Expand Down Expand Up @@ -684,7 +683,6 @@ dsub \
--image ${DOCKER_GCP} \
--logging $LOG \
--env DATASET_ID="${DATASET_ID}" \
--env PROJECT_ID="${PROJECT_ID}" \
--script ${SCRIPTS}/clean_vcf.sh \
--tasks all_samples.tsv \
--wait
Expand Down
20 changes: 8 additions & 12 deletions reads_overlap_snp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ bq query \
AND pos >= ${INF}
AND pos <= ${SUP}
),
-- Find the lower and upper bounds of a pair of read id (R1 and R2)
unique_read_id AS (
SELECT
read_id AS read_id_unique,
Expand All @@ -55,21 +56,15 @@ bq query \
-- Find variants on DNA fragments where R1 or R2 has an overlap
variants_and_read_id AS (
SELECT snp_id, pos, ref, alt, read_id_unique, CT_strand, GA_strand
FROM variants
INNER JOIN unique_read_id
FROM unique_read_id
INNER JOIN variants
ON
seq_start <= pos
AND seq_end >= pos
AND (seq_CT_strand = CT_strand OR seq_GA_strand = GA_strand)
),
variants_and_all_reads AS (
SELECT * FROM variants_and_read_id
INNER JOIN sequences
ON read_id_unique = read_id
)
-- Table of all variants x reads where the variant is in the read or its paired read.
SELECT
snp_id,
SELECT snp_id,
ref,
alt,
pos,
Expand All @@ -84,6 +79,7 @@ bq query \
cigar,
read_id,
seq,
score_before_recal
FROM variants_and_all_reads
"
score_before_recal
FROM sequences
INNER JOIN variants_and_read_id
ON read_id_unique = read_id'

0 comments on commit e470db2

Please sign in to comment.