update readme to AWS #1

Open · wants to merge 45 commits into base: r1.6.1

45 commits
7ce9ea7
first update
yinshiyi Jun 21, 2024
c05c778
plan update
yinshiyi Sep 13, 2024
53d9a9e
add coverage script and make sure happy is run using coverage >2
yinshiyi Sep 17, 2024
11f4bc3
launch small instance
yinshiyi Sep 26, 2024
919c2c1
connect aws ec2
yinshiyi Oct 2, 2024
9f14838
oct 2 2024 progress
yinshiyi Oct 3, 2024
adf2e41
oct 3 2024 progress
yinshiyi Oct 3, 2024
b253057
oct 4 2024
yinshiyi Oct 4, 2024
a43f19f
finish making example
yinshiyi Oct 4, 2024
9c1bb65
add the transfer script
yinshiyi Oct 4, 2024
85bf27c
add shuffle path
yinshiyi Oct 7, 2024
b4f8941
add worker and update dependency
yinshiyi Oct 16, 2024
ddaf0a3
add upload script
yinshiyi Oct 16, 2024
f223138
add validation data
yinshiyi Oct 17, 2024
2c99126
add shuffle validation
yinshiyi Oct 17, 2024
db46bd8
add more cpu thread
yinshiyi Oct 17, 2024
07899f6
update shuffle code
yinshiyi Oct 17, 2024
2d9f0cf
testing shuffle
yinshiyi Oct 17, 2024
f30ad08
update file path
yinshiyi Oct 18, 2024
5dc1599
update dateflow to spark
yinshiyi Oct 23, 2024
05c0db2
add training command
yinshiyi Oct 23, 2024
e455e56
fix file path
yinshiyi Oct 23, 2024
311c6d4
add final eval
yinshiyi Oct 23, 2024
9e5bea8
add baseline and hap
yinshiyi Oct 24, 2024
3d91f9a
typo fix
yinshiyi Nov 7, 2024
cd6276d
update flags
yinshiyi Nov 7, 2024
84363f6
Update shuffle_tfrecords_beam.py
yinshiyi Nov 23, 2024
7efb829
add the real data to try
yinshiyi Dec 4, 2024
7a54cce
Update shuffle_tfrecords_beam.py
yinshiyi Dec 5, 2024
8a55673
Update shuffle_tfrecords_beam.py
yinshiyi Dec 5, 2024
85b58dd
Create beam_test.py
yinshiyi Dec 5, 2024
92a8c28
add test files
yinshiyi Dec 5, 2024
8fb4a2d
fix typo and update subversion to contain aws and boto3
yinshiyi Dec 5, 2024
f641344
create cluster programatically
yinshiyi Dec 5, 2024
25edaa7
beam spark local test file
yinshiyi Dec 5, 2024
0f9f527
emr cluster
yinshiyi Dec 6, 2024
fa757ac
docker file
yinshiyi Dec 6, 2024
12ed0b9
Create test2024.sh
yinshiyi Dec 10, 2024
6b3fa4b
Create beam_test_2024.py
yinshiyi Dec 10, 2024
8a8c99d
Update beam_test_2024.py
yinshiyi Dec 10, 2024
ca85bfe
Update beam_test_2024.py
yinshiyi Dec 10, 2024
abb57fe
Update beam_test_2024.py
yinshiyi Dec 10, 2024
7b719ea
update notes
yinshiyi Dec 13, 2024
b27a451
to do list
yinshiyi Dec 13, 2024
be7b6fd
manually input the yarn address, future to dynamically capture the ad…
yinshiyi Jan 18, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -10,3 +10,4 @@
bazel-*

**/.ipynb_checkpoints
shuffle*
23 changes: 23 additions & 0 deletions baseline.sh
@@ -0,0 +1,23 @@
# Assumes BIN_VERSION, REF, BAM_CHR20, OUTPUT_DIR, DATA_DIR, TRUTH_VCF and
# TRUTH_BED are already set in the environment (see eval.sh / run_hap.sh).
sudo docker run --gpus 1 \
-v /home/${USER}:/home/${USER} \
google/deepvariant:"${BIN_VERSION}-gpu" \
/opt/deepvariant/bin/run_deepvariant \
--model_type WGS \
--ref "${REF}" \
--reads "${BAM_CHR20}" \
--regions "chr20" \
--output_vcf "${OUTPUT_DIR}/baseline.vcf.gz" \
--num_shards=4

time sudo docker run -it \
-v "${DATA_DIR}:${DATA_DIR}" \
-v "${OUTPUT_DIR}:${OUTPUT_DIR}" \
jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
"${TRUTH_VCF}" \
"${OUTPUT_DIR}/baseline.vcf.gz" \
-f "${TRUTH_BED}" \
-r "${REF}" \
-o "${OUTPUT_DIR}/chr20-calling_general.happy.output" \
-l chr20 \
--engine=vcfeval \
--pass-only
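hap.py writes its output files under the `-o` prefix; a quick way to view the headline numbers (a sketch, assuming the prefix used above) is:

```bash
# The summary CSV contains per-type recall, precision, and F1.
column -t -s, "${OUTPUT_DIR}/chr20-calling_general.happy.output.summary.csv"
```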
38 changes: 38 additions & 0 deletions docs/add_alarm.sh
@@ -0,0 +1,38 @@
#!/bin/bash

# Variables
host="my-ec2-instance"
region="us-east-1"

# Step 1: Launch the EC2 instance
instance_id=$(aws ec2 run-instances \
--image-id ami-096ea6a12ea24a797 \
--count 1 \
--instance-type t4g.small \
  --security-group-ids sg-0b734813083db4ba2 \
--key-name gpu \
--block-device-mappings DeviceName=/dev/sda1,Ebs={VolumeSize=20} \
--tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='"${host}"'}]' \
--query "Instances[0].InstanceId" \
--output text \
--region $region \
--profile gpu)

echo "Launched EC2 instance with ID: $instance_id"

# Step 2: Create the CloudWatch alarm
aws cloudwatch put-metric-alarm \
--alarm-name "CPUUtilization-Low-${instance_id}" \
--metric-name CPUUtilization \
--namespace AWS/EC2 \
--statistic Average \
--period 3600 \
--threshold 1 \
--comparison-operator LessThanOrEqualToThreshold \
--dimensions "Name=InstanceId,Value=${instance_id}" \
--evaluation-periods 2 \
--alarm-actions arn:aws:sns:us-east-1:940583394710:idle-instance-alarm \
--region $region \
--profile gpu

echo "Alarm created for instance: $instance_id"
97 changes: 79 additions & 18 deletions docs/deepvariant-training-case-study.md
@@ -27,40 +27,99 @@ accuracy compared to the WGS model as a baseline:
This tutorial is meant as an example for training; all the other processing in
this tutorial was done serially, with no pipeline optimization.

## BAM processing

Since PicoV3 data is very low coverage, we only keep the BAM regions with coverage >2,
so that at least three reads can vote toward a majority call. We restrict to those
regions with a BED file, following the approach used in the PacBio examples.

First, set up DeepVariant on AWS.

Collect the BAM files that have ground-truth data available.
Run variant calling with the generalized (WGS) model first to see its performance on the regions with coverage >2,
then check whether a retrained model can improve on that baseline.

```bash
BAM_CHR1="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr1.bam"
BAM_CHR20="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr20.bam"
BAM_CHR21="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr21.bam"
merged_bam="merged.bam"
minimum_coverage=2
coverage_bed="pass_threshold.bed"
# TRUTH_BED restricts the comparison to the GIAB high-confidence regions
TRUTH_BED="${DATA_DIR}/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_chr.bed"
# https://bedtools.readthedocs.io/en/latest/content/tools/genomecov.html
samtools merge $merged_bam $BAM_CHR1 $BAM_CHR20 $BAM_CHR21
bedtools genomecov -ibam $merged_bam -bg | \
  awk -v OFS='\t' -v min_cov="$minimum_coverage" '$4 > min_cov {print $1, $2, $3}' | \
  bedtools intersect -a $TRUTH_BED -b - > $coverage_bed
```
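As a quick sanity check (not part of the original pipeline), count how many bases survive the coverage filter:

```bash
# Total bases in the truth regions with coverage above the threshold.
awk '{sum += $3 - $2} END {print sum, "bases pass the coverage filter"}' "$coverage_bed"
```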

## Request a machine

For this case study, we use a [GPU machine] with 16 vCPUs. You can request this
machine on Google Cloud using the following command:
machine on AWS using the following command:

```bash
# public.ecr.aws/aws-genomics/google/deepvariant:1.4.0
# https://cloud-images.ubuntu.com/locator/ec2/
# Replace the AMI ID with the correct Ubuntu 20.04 LTS AMI for your region and
# --key-name with your own key pair. p3 instances use Nvidia Tesla V100 GPUs,
# which is close to the Tesla P100 used in the original case study.
aws ec2 run-instances \
  --image-id ami-0c272455b0778ebeb \
  --count 1 \
  --instance-type p3.2xlarge \
  --key-name MyKeyPair \
  --block-device-mappings DeviceName=/dev/sda1,Ebs={VolumeSize=300} \
  --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='"${USER}-deepvariant-vm"'}]' \
  --region us-west-2 \
  --iam-instance-profile Name=gpu \
  --placement AvailabilityZone=us-west-2b
```
```bash
# Just get a small instance to try things out first. Replace the AMI ID and
# --key-name with the correct values for your region and account.
# NOTE: t4g instances are arm64 (Graviton), so an arm64 AMI is required.
aws ec2 run-instances \
  --image-id ami-0c272455b0778ebeb \
  --count 1 \
  --instance-type t4g.small \
  --key-name gpu \
  --block-device-mappings DeviceName=/dev/sda1,Ebs={VolumeSize=20} \
  --tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='"${USER}-deepvariant-vm"'}]' \
  --region us-west-2 \
  --iam-instance-profile Name=gpu \
  --placement AvailabilityZone=us-west-2b
```
```bash
# this actually works
host="${USER}-deepvariant-vm"
zone="us-west1-b"

gcloud compute instances create ${host} \
--scopes "compute-rw,storage-full,cloud-platform" \
--maintenance-policy "TERMINATE" \
--accelerator=type=nvidia-tesla-p100,count=1 \
--image-family "ubuntu-2004-lts" \
--image-project "ubuntu-os-cloud" \
--machine-type "n1-standard-16" \
--boot-disk-size "300" \
--zone "${zone}" \
--min-cpu-platform "Intel Skylake"
region="us-east-1"
chmod 400 ~/gpu.pem
# this image id is not right
aws ec2 run-instances \
--image-id ami-096ea6a12ea24a797 \
--count 1 \
--instance-type t4g.small \
  --security-group-ids sg-0b734813083db4ba2 \
--key-name gpu \
--block-device-mappings DeviceName=/dev/sda1,Ebs={VolumeSize=20} \
--tag-specifications 'ResourceType=instance,Tags=[{Key=Name,Value='"${host}"'}]' \
--region $region \
--profile gpu
```

After a minute or two, your VM should be ready and you can ssh into it using the
following command:

```bash
gcloud compute ssh ${host} --zone ${zone}
# Stop the instance when you are done to avoid idle charges:
aws ec2 stop-instances --instance-ids i-0e4f059f74edbb771 --profile gpu
# elastic IP
ssh -i "~/gpu.pem" [email protected]
ssh gpu
# ssh -i ~/gpu.pem ubuntu@${host}
```
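A sketch for capturing the instance's public DNS name dynamically instead of hard-coding it (assumes `instance_id` holds the ID returned by `run-instances` and the `gpu` profile used above):

```bash
# Look up the instance's public DNS name so it does not need to be hard-coded.
public_dns=$(aws ec2 describe-instances \
  --instance-ids "$instance_id" \
  --query "Reservations[0].Instances[0].PublicDnsName" \
  --output text \
  --profile gpu)
ssh -i ~/gpu.pem "ubuntu@${public_dns}"
```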

Once you have logged in, set the variables:

```bash
YOUR_PROJECT=REPLACE_WITH_YOUR_PROJECT
OUTPUT_GCS_BUCKET=REPLACE_WITH_YOUR_GCS_BUCKET

# might have to install gsutil so that the instance can download DeepVariant's standard input files
BUCKET="gs://deepvariant"
VERSION="1.6.1"
DOCKER_IMAGE="google/deepvariant:${VERSION}"
@@ -113,6 +172,8 @@ gsutil -m cp -r "${DATA_BUCKET}/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Io
### Download extra packages

```bash
sudo snap install gh
gh auth login
sudo apt -y update
sudo apt -y install parallel
curl -O https://raw.githubusercontent.com/google/deepvariant/r1.6.1/scripts/install_nvidia_docker.sh
@@ -538,7 +599,7 @@ time sudo docker run -it \
jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
"${TRUTH_VCF}" \
"${OUTPUT_DIR}/test_set.vcf.gz" \
-f "${TRUTH_BED}" \
-f "${TRUTH_BED}" \ # this is important in my study to make sure coverage >2
-r "${REF}" \
-o "${OUTPUT_DIR}/chr20-calling.happy.output" \
-l chr20 \
@@ -588,7 +649,7 @@ sudo docker run --gpus all \
--output_vcf "${OUTPUT_DIR}/baseline.vcf.gz" \
--num_shards=${N_SHARDS}
```

Run hap.py on the baseline VCF to get the baseline metrics.
Baseline:

| Type | TRUTH.TP | TRUTH.FN | QUERY.FP | METRIC.Recall | METRIC.Precision | METRIC.F1_Score |
23 changes: 23 additions & 0 deletions eval.sh
@@ -0,0 +1,23 @@
BASE="/home/${USER}/data/training-case-study"
OUTPUT_DIR="${BASE}/output"
model="/home/${USER}/data/model/model.ckpt"
TRAINING_DIR="${OUTPUT_DIR}/training_dir"
BIN_VERSION="1.4.0"
INPUT_DIR="${BASE}/input"
LOG_DIR="${OUTPUT_DIR}/logs"
DATA_DIR="${INPUT_DIR}/data"
REF="${DATA_DIR}/ucsc_hg19.fa"
BAM_CHR1="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr1.bam"
BAM_CHR20="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr20.bam"

sudo docker run --gpus 1 \
-v /home/${USER}:/home/${USER} \
google/deepvariant:"${BIN_VERSION}-gpu" \
/opt/deepvariant/bin/run_deepvariant \
--model_type WGS \
--customized_model "${TRAINING_DIR}/model.ckpt-50000" \
--ref "${REF}" \
--reads "${BAM_CHR20}" \
--regions "chr20" \
--output_vcf "${OUTPUT_DIR}/test_set.vcf.gz" \
--num_shards=4
2 changes: 2 additions & 0 deletions index_too_old.sh
@@ -0,0 +1,2 @@
# "The index file is older than the data file" warning:
# solve it by touch-ing the index files so they are newer than the data files.
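A minimal sketch of that fix (the file names here are illustrative):

```bash
# Refresh index timestamps so tools stop warning that the index is older than the data file.
touch "${DATA_DIR}"/*.bam.bai "${DATA_DIR}"/*.fa.fai
```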
54 changes: 54 additions & 0 deletions make_example.sh
@@ -0,0 +1,54 @@
# NOTE: the DeepVariant Docker image is built for the amd64 architecture; running it on an arm64 (Graviton) machine fails with a platform-mismatch error.

YOUR_PROJECT=takara
OUTPUT_GCS_BUCKET=REPLACE_WITH_YOUR_GCS_BUCKET
# might have to install gsutil so that the instance can download DeepVariant's standard input files
BUCKET="gs://deepvariant"
VERSION="1.6.1"
DOCKER_IMAGE="google/deepvariant:${VERSION}"

MODEL_BUCKET="${BUCKET}/models/DeepVariant/${VERSION}/DeepVariant-inception_v3-${VERSION}+data-wgs_standard"
GCS_PRETRAINED_WGS_MODEL="${MODEL_BUCKET}/model.ckpt"

OUTPUT_BUCKET="${OUTPUT_GCS_BUCKET}/customized_training"
TRAINING_DIR="${OUTPUT_BUCKET}/training_dir"

BASE="/home/ubuntu/data/training-case-study"
DATA_BUCKET=gs://deepvariant/training-case-study/BGISEQ-HG001

INPUT_DIR="${BASE}/input"
BIN_DIR="${INPUT_DIR}/bin"
DATA_DIR="${INPUT_DIR}/data"
OUTPUT_DIR="${BASE}/output2"
LOG_DIR="${OUTPUT_DIR}/logs"
SHUFFLE_SCRIPT_DIR="${HOME}/deepvariant/tools"

REF="${DATA_DIR}/ucsc_hg19.fa"
BAM_CHR1="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr1.bam"
BAM_CHR20="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr20.bam"
BAM_CHR21="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr21.bam"
TRUTH_VCF="${DATA_DIR}/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer_chrs_FIXED.vcf.gz"
TRUTH_BED="${DATA_DIR}/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_chr.bed"

N_SHARDS=7
mkdir -p "${OUTPUT_DIR}"
mkdir -p "${BIN_DIR}"
mkdir -p "${DATA_DIR}"
mkdir -p "${LOG_DIR}"

( time seq 0 $((N_SHARDS-1)) | \
parallel --halt 2 --line-buffer \
sudo docker run \
-v ${HOME}:${HOME} \
${DOCKER_IMAGE} \
make_examples \
--mode training \
--ref "${REF}" \
--reads "${BAM_CHR1}" \
--examples "${OUTPUT_DIR}/training_set.with_label.tfrecord@${N_SHARDS}.gz" \
--truth_variants "${TRUTH_VCF}" \
--confident_regions "${TRUTH_BED}" \
--task {} \
--regions "'chr1'" \
--channels "insert_size" \
) 2>&1 | tee "${LOG_DIR}/training_set.with_label.make_examples.log"
22 changes: 22 additions & 0 deletions run_hap.sh
@@ -0,0 +1,22 @@
BAM_CHR1="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr1.bam"
BAM_CHR20="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr20.bam"
BAM_CHR21="${DATA_DIR}/BGISEQ_PE100_NA12878.sorted.chr21.bam"
TRUTH_VCF="${DATA_DIR}/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer_chrs_FIXED.vcf.gz"
TRUTH_BED="${DATA_DIR}/HG001_GRCh37_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_chr.bed"
REF="${DATA_DIR}/ucsc_hg19.fa"


sudo docker pull jmcdani20/hap.py:v0.3.12

time sudo docker run -it \
-v "${DATA_DIR}:${DATA_DIR}" \
-v "${OUTPUT_DIR}:${OUTPUT_DIR}" \
jmcdani20/hap.py:v0.3.12 /opt/hap.py/bin/hap.py \
"${TRUTH_VCF}" \
"${OUTPUT_DIR}/test_set.vcf.gz" \
-f "${TRUTH_BED}" \
-r "${REF}" \
-o "${OUTPUT_DIR}/chr20-calling.happy.output" \
-l chr20 \
--engine=vcfeval \
--pass-only
26 changes: 26 additions & 0 deletions shuffle.sh
@@ -0,0 +1,26 @@
#
# git clone https://github.com/apache/beam-starter-python.git
# cd beam-starter-python
# python3 -m venv env
# source env/bin/activate

# pip3 install setuptools --upgrade
# pip3 install apache_beam # installed 2.59.0
# pip3 install tensorflow # For parsing tf.Example in shuffle_tfrecords_beam.py.

# Using python-snappy made the job crash on the local server, so it is left out:
# python-snappy
# python3 -m pip install snappy
# source ../beam-starter-python/shiyi/bin/activate
YOUR_PROJECT=takara
BASE="/home/syin/lol/data/training-case-study"
OUTPUT_DIR="${BASE}/output2"
time python3 tools/shuffle_tfrecords_beam.py \
--project="${YOUR_PROJECT}" \
--input_pattern_list="${OUTPUT_DIR}"/training_set.with_label.tfrecord-?????-of-00007.gz \
--output_pattern_prefix="${OUTPUT_DIR}/training_set.with_label.shuffled" \
--output_dataset_name="HG001" \
--output_dataset_config_pbtxt="${OUTPUT_DIR}/training_set.dataset_config.pbtxt" \
--job_name=shuffle-tfrecords \
--runner=DirectRunner \
--direct_num_workers=32
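After the shuffle finishes, the dataset config it writes is plain text and can be inspected directly; it should point at the shuffled tfrecords and record the number of examples:

```bash
# Inspect the generated dataset configuration.
cat "${OUTPUT_DIR}/training_set.dataset_config.pbtxt"
```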
13 changes: 13 additions & 0 deletions shuffle_validation.sh
@@ -0,0 +1,13 @@
YOUR_PROJECT=takara
BASE="/home/syin/lol/data/training-case-study"
OUTPUT_DIR="${BASE}/output"
time python3 tools/shuffle_tfrecords_beam.py \
--project="${YOUR_PROJECT}" \
--input_pattern_list="${OUTPUT_DIR}"/validation_set.with_label.tfrecord-?????-of-?????.gz \
--output_pattern_prefix="${OUTPUT_DIR}/2/validation_set.with_label.shuffled" \
--output_dataset_name="HG001" \
--output_dataset_config_pbtxt="${OUTPUT_DIR}/2/validation_set.dataset_config.pbtxt" \
--job_name=shuffle-tfrecords \
--runner=DirectRunner \
--direct_num_workers=0
# --direct_running_mode=multi_threading \