Removed project_id variable from most scripts

TyckoLab · Nov 13, 2019 · e470db2 · e470db2
1 parent ade55bb
commit e470db2
Show file tree

Hide file tree

Showing 5 changed files with 27 additions and 30 deletions.
diff --git a/append_context.sh b/append_context.sh
@@ -7,7 +7,7 @@
 echo "Merge the OB and OT"
 bq query \
     --use_legacy_sql=false \
-    --destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_both_context_tmp \
+    --destination_table ${DATASET_ID}.${SAMPLE}_both_context_tmp \
     --replace=true \
     "WITH 
         OB AS (
@@ -40,7 +40,7 @@ echo "Sum methylation and coverage per CpG. Keep at least 10x cov."
 # We impose a coverage of at least 10x per CpG.
 bq query \
     --use_legacy_sql=false \
-    --destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_merged_context_bed \
+    --destination_table ${DATASET_ID}.${SAMPLE}_merged_context_bed \
     --replace=true \
     "  
     WITH MERGED_STRANDS AS (
@@ -72,7 +72,7 @@ echo "Filter out from both_context_tmp the CpG sites that do not have at least 1
 # This removes about 20% of all CpG flagged by Bismark.
 bq query \
     --use_legacy_sql=false \
-    --destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_context \
+    --destination_table ${DATASET_ID}.${SAMPLE}_context \
     --replace=true \
     " 
     WITH 
@@ -100,7 +100,7 @@ bq query \
 echo "Methylation percentage bedgraph"
 bq query \
     --use_legacy_sql=false \
-    --destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_methyperc_bedgraph \
+    --destination_table ${DATASET_ID}.${SAMPLE}_methyperc_bedgraph \
     --replace=true \
     " SELECT 
         chr, 
@@ -114,7 +114,7 @@ bq query \
 echo "CpG coverage bedgraph"
 bq query \
     --use_legacy_sql=false \
-    --destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph \
+    --destination_table ${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph \
     --replace=true \
     " SELECT 
         chr, 
@@ -145,10 +145,10 @@ bq extract \
 ########################## Delete most BQ files
 
 echo "Delete bedgraph files from BQ"
-bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_both_context_tmp
-bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_methyperc_bedgraph
-bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph
-bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_merged_context_bed
+bq rm -f -t ${DATASET_ID}.${SAMPLE}_both_context_tmp
+bq rm -f -t ${DATASET_ID}.${SAMPLE}_methyperc_bedgraph
+bq rm -f -t ${DATASET_ID}.${SAMPLE}_CpGcov_bedgraph
+bq rm -f -t ${DATASET_ID}.${SAMPLE}_merged_context_bed
 
 echo "Delete raw files"
 bq rm -f -t ${DATASET_ID}.${SAMPLE}_CpGOB

diff --git a/clean_sam.sh b/clean_sam.sh
@@ -3,7 +3,7 @@
 # Clean the SAM
 bq query \
     --use_legacy_sql=false \
-    --destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_recal_sam \
+    --destination_table ${DATASET_ID}.${SAMPLE}_recal_sam \
     --replace=true \
         "SELECT
             read_id,

diff --git a/clean_vcf.sh b/clean_vcf.sh
@@ -5,7 +5,7 @@
 # Remove the rows where the snp_id is not in the form of "rs"
 bq query \
     --use_legacy_sql=false \
-    --destination_table ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_vcf \
+    --destination_table ${DATASET_ID}.${SAMPLE}_vcf \
     --replace=true \
     "WITH
       -- We create a file with a 500bp window around the SNP and calculate the cov of the SNP
@@ -46,3 +46,6 @@ bq query \
   FROM
      variants
   "
+
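+# Delete VCF file that was uploaded. NOTE(review): this removes ${SAMPLE}_vcf, the same table the query above writes to -- confirm the uploaded table's name.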
+bq rm -f -t ${DATASET_ID}.${SAMPLE}_vcf
diff --git a/master.sh b/master.sh
@@ -521,7 +521,6 @@ dsub \
   --zones $ZONE_ID \
   --image $DOCKER_GCP \
   --logging $LOG \
-  --env PROJECT_ID="${PROJECT_ID}" \
   --env DATASET_ID="${DATASET_ID}" \
   --env CPG_COV="${CPG_COV}" \
   --env OUTPUT_B="${OUTPUT_B}" \
@@ -559,8 +558,7 @@ dsub \
   --name 'bam-to-sam' \
   --wait
 
-
-## Second export SAM to BigQuery and delete it from the bucket (takes too much space)
+######## Export all SAM to BigQuery
 
 # Prepare TSV file
 echo -e "--env SAMPLE\t--env SAM" > sam_to_bq.tsv
@@ -569,13 +567,12 @@ while read SAMPLE ; do
   for CHR in `seq 1 22` X Y ; do 
     echo -e "$SAMPLE\tgs://$OUTPUT_B/${SAMPLE}/sam/${SAMPLE}_chr${CHR}_recal.sam" >> sam_to_bq.tsv
   done
-
   # Delete any existing uploaded SAM table from BigQuery
   bq rm -f -t ${PROJECT_ID}:${DATASET_ID}.${SAMPLE}_recal_sam_uploaded
-
 done < sample_id.txt
 
 # We append all chromosomes in the same file.
+# Takes 2 minutes
 dsub \
   --provider google-v2 \
   --project $PROJECT_ID \
@@ -596,20 +593,21 @@ dsub \
   --wait
 
 # Clean the SAM on BigQuery
+# Takes 1 minute
 dsub \
   --provider google-v2 \
   --project $PROJECT_ID \
   --zones $ZONE_ID \
   --image ${DOCKER_GCP} \
   --logging $LOG \
   --env DATASET_ID="${DATASET_ID}" \
-  --env PROJECT_ID="${PROJECT_ID}" \
   --script ${SCRIPTS}/clean_sam.sh \
   --tasks all_samples.tsv \
   --wait
 
 # Delete the SAM files from the bucket (they take a lot of space)
 # and the raw SAM files from BigQuery
+# Takes 2 minutes
 dsub \
   --provider google-v2 \
   --project $PROJECT_ID \
@@ -651,6 +649,7 @@ dsub \
 
 
 # We append all chromosome files into the same file.
+# Takes about 4 minutes
 dsub \
   --provider google-v2 \
   --project $PROJECT_ID \
@@ -684,7 +683,6 @@ dsub \
   --image ${DOCKER_GCP} \
   --logging $LOG \
   --env DATASET_ID="${DATASET_ID}" \
-  --env PROJECT_ID="${PROJECT_ID}" \
   --script ${SCRIPTS}/clean_vcf.sh \
   --tasks all_samples.tsv \
   --wait

diff --git a/reads_overlap_snp.sh b/reads_overlap_snp.sh
@@ -41,6 +41,7 @@ bq query \
             AND pos >= ${INF}
             AND pos <= ${SUP}
       ),
+      -- Find the lower and upper bounds of a pair of read id (R1 and R2)
       unique_read_id AS (
         SELECT 
           read_id AS read_id_unique,
@@ -55,21 +56,15 @@ bq query \
       -- Find variants on DNA fragments where R1 or R2 has an overlap
       variants_and_read_id AS (
         SELECT snp_id, pos, ref, alt, read_id_unique, CT_strand, GA_strand 
-        FROM variants
-        INNER JOIN unique_read_id
+        FROM unique_read_id
+        INNER JOIN variants
         ON 
             seq_start <= pos
             AND seq_end >= pos
             AND (seq_CT_strand = CT_strand OR seq_GA_strand = GA_strand)
-      ),
-      variants_and_all_reads AS (
-      SELECT * FROM variants_and_read_id
-      INNER JOIN sequences 
-      ON read_id_unique = read_id
       )
       -- Table of all variants x reads where the variant is in the read or its paired read.
-      SELECT
-        snp_id,
+      SELECT snp_id,
         ref, 
         alt,
         pos,
@@ -84,6 +79,7 @@ bq query \
         cigar,
         read_id,
         seq,
-        score_before_recal
-      FROM variants_and_all_reads
-      "
+        score_before_recal 
+      FROM sequences
+      INNER JOIN variants_and_read_id
+      ON read_id_unique = read_id"