Skip to content

Commit

Permalink
Improve stability
Browse files Browse the repository at this point in the history
  • Loading branch information
crtag committed Nov 8, 2024
1 parent 2f25675 commit c339bb8
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 9 deletions.
9 changes: 7 additions & 2 deletions QUICK/job_manager.sh
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,9 @@ report_status() {
"run_log": "$(base64 < "$run_log")"
}
EOF
)
) || {
log_message "Network error occurred"
}

# Handle successful report
if [[ "$response_code" == "204" ]]; then
Expand Down Expand Up @@ -231,7 +233,10 @@ EOF
fetch_new_job() {

# Request a new job from the API and capture response code and body
response=$(curl -s -w "\n%{http_code}" -X GET "$JOB_ASSIGNMENT_ENDPOINT")
response=$(curl -s -w "\n%{http_code}" -X GET "$JOB_ASSIGNMENT_ENDPOINT") || {
log_message "Network error occurred"
}

http_code=$(echo "$response" | tail -n1)
response=$(echo "$response" | head -n -1)

Expand Down
24 changes: 17 additions & 7 deletions k8s/manifests/app-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,23 +25,33 @@ spec:
- /bin/bash
- -c
- |
if pgrep -x quick.cuda.MPI >/dev/null; then
pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -qv '^Z'
else
# Pass if no MPI process (waiting for jobs)
if ! pgrep -x quick.cuda.MPI > /dev/null; then
exit 0
fi
# If the process exists, check whether any instance is a zombie
if pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -q '^Z'; then
# Found zombie process, fail the probe
exit 1
fi
# Process exists and no instance is a zombie - pass
exit 0
initialDelaySeconds: 60
periodSeconds: 30
timeoutSeconds: 10
failureThreshold: 4
failureThreshold: 3
readinessProbe:
exec:
command:
- /bin/bash
- -c
- >
! pgrep -x quick.cuda.MPI ||
(pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -vq '^Z')
- |
# Container is ready if either:
# 1. No MPI process (waiting for jobs)
# 2. MPI process exists and no instance is a zombie
! pgrep -x quick.cuda.MPI > /dev/null || \
! pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -q '^Z'
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
Expand Down

0 comments on commit c339bb8

Please sign in to comment.