-
Notifications
You must be signed in to change notification settings - Fork 359
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix preemptibles and maxRetries on GCP Batch [AN-274] [AN-377] (#7684)
- Loading branch information
Showing
35 changed files
with
724 additions
and
210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2 changes: 1 addition & 1 deletion
2
centaur/src/main/resources/standardTestCases/checkpointing.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
70 changes: 70 additions & 0 deletions
70
centaur/src/main/resources/standardTestCases/checkpointing/gcpbatch_checkpointing.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
version 1.0 | ||
|
||
# Calls the `count` task with count_to = 100 and exposes the task's `preempted` string as the workflow output. | ||
workflow checkpointing { | ||
call count { input: count_to = 100 } | ||
output { | ||
String preempted = count.preempted | ||
} | ||
} | ||
|
||
# Counts from the last checkpointed value (or 1 when there is none) up to count_to, appending | ||
# "<n> <instance name> <date>" to my_checkpoint on every step. Once the counter exceeds FROM_CKPT + 66 the task | ||
# triggers a simulated maintenance event on its own VM. The `preempted` output is GOTPREEMPTED when the first and | ||
# last counted lines in my_checkpoint name different instances, NEVERPREEMPTED otherwise. | ||
task count { | ||
input { | ||
Int count_to | ||
} | ||
|
||
meta { | ||
volatile: true | ||
} | ||
|
||
command <<< | ||
# Read from the my_checkpoint file if there's content there: | ||
FROM_CKPT=$(cat my_checkpoint | tail -n1 | awk '{ print $1 }') | ||
FROM_CKPT=${FROM_CKPT:-1} | ||
|
||
# We don't want any single VM to run the entire count, so work out the max counter value for this attempt: | ||
MAX="$(($FROM_CKPT + 66))" | ||
|
||
INSTANCE_NAME=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/name" -H "Metadata-Flavor: Google") | ||
echo "Discovered instance: $INSTANCE_NAME" | ||
|
||
# Run the counter: | ||
echo '--' >> my_checkpoint | ||
for i in $(seq $FROM_CKPT ~{count_to}) | ||
do | ||
echo $i | ||
echo $i ${INSTANCE_NAME} $(date) >> my_checkpoint | ||
|
||
# If we're over our max, "preempt" the VM by simulating a maintenance event: | ||
if [ "${i}" -gt "${MAX}" ] | ||
then | ||
fully_qualified_zone=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/zone) | ||
zone=$(basename "$fully_qualified_zone") | ||
# Reuse the instance name discovered above rather than querying the metadata server again. | ||
gcloud beta compute instances simulate-maintenance-event "${INSTANCE_NAME}" --zone=$zone -q | ||
sleep 60 | ||
fi | ||
|
||
sleep 1 | ||
done | ||
|
||
# Prove that we got preempted at least once: | ||
# Every attempt appends a '--' separator before counting, so the first line of the file is never a counter line. | ||
# Skip the separators to find the instance that wrote the first counted value. | ||
FIRST_INSTANCE=$(grep -v '^--' my_checkpoint | head -n1 | awk '{ print $2 }') | ||
LAST_INSTANCE=$(cat my_checkpoint | tail -n1 | awk '{ print $2 }') | ||
# Compare the two variables; comparing against the literal text LAST_INSTANCE would always report a preemption. | ||
if [ "${FIRST_INSTANCE}" != "${LAST_INSTANCE}" ] | ||
then | ||
echo "GOTPREEMPTED" > preempted.txt | ||
else | ||
echo "NEVERPREEMPTED" > preempted.txt | ||
fi | ||
>>> | ||
|
||
runtime { | ||
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:slim" | ||
preemptible: 3 | ||
checkpointFile: "my_checkpoint" | ||
} | ||
|
||
output { | ||
File checkpoint_log = "my_checkpoint" | ||
String preempted = read_string("preempted.txt") | ||
} | ||
} |
12 changes: 0 additions & 12 deletions
12
centaur/src/main/resources/standardTestCases/error_10_preemptible.test
This file was deleted.
Oops, something went wrong.
13 changes: 13 additions & 0 deletions
13
centaur/src/main/resources/standardTestCases/gcpbatch_checkpointing.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
name: gcpbatch_checkpointing | ||
testFormat: workflowsuccess | ||
backends: [GCPBATCH] | ||
|
||
files { | ||
workflow: checkpointing/gcpbatch_checkpointing.wdl | ||
} | ||
|
||
metadata { | ||
workflowName: checkpointing | ||
status: Succeeded | ||
"outputs.checkpointing.preempted": "GOTPREEMPTED" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
13 changes: 13 additions & 0 deletions
13
centaur/src/main/resources/standardTestCases/gcpbatch_papi_preemptible_and_max_retries.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
name: gcpbatch_papi_preemptible_and_max_retries | ||
testFormat: workflowfailure | ||
backends: [GCPBATCH] | ||
|
||
files { | ||
workflow: papi_preemptible_and_max_retries/gcpbatch_papi_preemptible_and_max_retries.wdl | ||
} | ||
|
||
metadata { | ||
workflowName: papi_preemptible_and_max_retries | ||
status: Failed | ||
"papi_preemptible_and_max_retries.delete_self.-1.attempt": 3 | ||
} |
28 changes: 28 additions & 0 deletions
28
centaur/src/main/resources/standardTestCases/gcpbatch_preemptible_and_memory_retry.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
name: gcpbatch_preemptible_and_memory_retry | ||
testFormat: workflowfailure | ||
# The original version of this test was tailored to a quirk of Papi v2: it depended on Papi v2 misdiagnosing the | ||
# deletion of its own VM as a preemption event. GCP Batch, perhaps more correctly, diagnoses VM deletion as a weird | ||
# non-preemption event. The GCPBATCH version of this test uses `gcloud beta compute instances simulate-maintenance-event` | ||
# to simulate a preemption in a way that GCP Batch actually perceives as a preemption. | ||
backends: [GCPBATCH] | ||
|
||
files { | ||
workflow: retry_with_more_memory/gcpbatch/preemptible_and_memory_retry.wdl | ||
options: retry_with_more_memory/retry_with_more_memory.options | ||
} | ||
|
||
metadata { | ||
workflowName: preemptible_and_memory_retry | ||
status: Failed | ||
"failures.0.message": "Workflow failed" | ||
"failures.0.causedBy.0.message": "stderr for job `preemptible_and_memory_retry.imitate_oom_error_on_preemptible:NA:3` contained one of the `memory-retry-error-keys: [OutOfMemory,Killed]` specified in the Cromwell config. Job might have run out of memory." | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.1.preemptible": "true" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.1.executionStatus": "RetryableFailure" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.1.runtimeAttributes.memory": "1 GB" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.2.preemptible": "false" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.2.executionStatus": "RetryableFailure" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.2.runtimeAttributes.memory": "1 GB" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.3.preemptible": "false" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.3.executionStatus": "Failed" | ||
"preemptible_and_memory_retry.imitate_oom_error_on_preemptible.-1.3.runtimeAttributes.memory": "1.1 GB" | ||
} |
11 changes: 11 additions & 0 deletions
11
centaur/src/main/resources/standardTestCases/gcpbatch_preemptible_basic.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
name: gcpbatch_preemptible_basic | ||
testFormat: workflowsuccess | ||
backends: [GCPBATCH] | ||
|
||
files { | ||
workflow: preemptible_basic/gcpbatch_preemptible_basic.wdl | ||
} | ||
|
||
metadata { | ||
status: Succeeded | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 2 additions & 2 deletions
4
centaur/src/main/resources/standardTestCases/papi_preemptible_and_max_retries.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
31 changes: 31 additions & 0 deletions
31
...dTestCases/papi_preemptible_and_max_retries/gcpbatch_papi_preemptible_and_max_retries.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
version 1.0 | ||
|
||
# Takes down the VM it runs on: a simulated maintenance event when the VM is preemptible, an outright instance | ||
# delete otherwise. | ||
task delete_self { | ||
|
||
command { | ||
# Ask the metadata server whether this VM is preemptible, and which zone and instance we are. | ||
is_preemptible=$(curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/scheduling/preemptible") | ||
zone_path=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/zone) | ||
vm_zone=$(basename "$zone_path") | ||
vm_name=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/name" -H "Metadata-Flavor: Google") | ||
|
||
case "$is_preemptible" in | ||
TRUE) | ||
# Preemptible VM: a simulated maintenance event is enough to take it down. | ||
gcloud beta compute instances simulate-maintenance-event $vm_name --zone=$vm_zone -q | ||
sleep 60 | ||
;; | ||
*) | ||
# On-demand VM: simulated maintenance events don't seem to precipitate its demise, so actually delete | ||
# the instance instead. | ||
gcloud compute instances delete $vm_name --zone=$vm_zone -q | ||
;; | ||
esac | ||
} | ||
|
||
runtime { | ||
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:slim" | ||
preemptible: 1 | ||
maxRetries: 1 | ||
} | ||
} | ||
|
||
# Single-call workflow: runs the `delete_self` task with no inputs and declares no outputs. | ||
workflow papi_preemptible_and_max_retries { | ||
call delete_self | ||
} |
8 changes: 5 additions & 3 deletions
8
centaur/src/main/resources/standardTestCases/preemptible_and_memory_retry.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
11 changes: 11 additions & 0 deletions
11
centaur/src/main/resources/standardTestCases/preemptible_basic.test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
name: preemptible_basic | ||
testFormat: workflowsuccess | ||
backends: [Papiv2, GCPBATCH_ALT] | ||
|
||
files { | ||
workflow: preemptible_basic/preemptible_basic.wdl | ||
} | ||
|
||
metadata { | ||
status: Succeeded | ||
} |
33 changes: 33 additions & 0 deletions
33
...aur/src/main/resources/standardTestCases/preemptible_basic/gcpbatch_preemptible_basic.wdl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
version 1.0 | ||
|
||
# Triggers a simulated maintenance event on its own VM when that VM is preemptible; does nothing otherwise. | ||
task delete_self_if_preemptible { | ||
|
||
command <<< | ||
# Make every xtrace line carry the date, time and working directory. | ||
PS4='\D{+%F %T} \w $ ' | ||
set -o errexit -o nounset -o pipefail -o xtrace | ||
|
||
is_preemptible=$(curl -H "Metadata-Flavor: Google" "http://metadata.google.internal/computeMetadata/v1/instance/scheduling/preemptible") | ||
|
||
# A maintenance event on a preemptible VM should cause it to be preempted. | ||
# With `preemptible: 1` the job should then be restarted on a non-preemptible VM, where this branch is skipped. | ||
case "$is_preemptible" in | ||
TRUE) | ||
zone_path=$(curl -s -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/zone) | ||
vm_zone=$(basename "$zone_path") | ||
|
||
gcloud beta compute instances simulate-maintenance-event $(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/name" -H "Metadata-Flavor: Google") --zone=$vm_zone -q | ||
sleep 60 | ||
;; | ||
esac | ||
|
||
>>> | ||
|
||
runtime { | ||
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:slim" | ||
preemptible: 1 | ||
} | ||
} | ||
|
||
|
||
# Single-call workflow: runs the `delete_self_if_preemptible` task with no inputs and declares no outputs. | ||
workflow preemptible_basic { | ||
call delete_self_if_preemptible | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.