diff --git a/QUICK/job_manager.sh b/QUICK/job_manager.sh index 17a84d2..f1a1af3 100644 --- a/QUICK/job_manager.sh +++ b/QUICK/job_manager.sh @@ -184,7 +184,9 @@ report_status() { "run_log": "$(base64 < "$run_log")" } EOF - ) + ) || { + log_message "Network error occurred" + } # Handle successful report if [[ "$response_code" == "204" ]]; then @@ -231,7 +233,10 @@ EOF fetch_new_job() { # Request a new job from the API and capture response code and body - response=$(curl -s -w "\n%{http_code}" -X GET "$JOB_ASSIGNMENT_ENDPOINT") + response=$(curl -s -w "\n%{http_code}" -X GET "$JOB_ASSIGNMENT_ENDPOINT") || { + log_message "Network error occurred" + } + http_code=$(echo "$response" | tail -n1) response=$(echo "$response" | head -n -1) diff --git a/k8s/manifests/app-deployment.yaml b/k8s/manifests/app-deployment.yaml index fb12a82..94eb696 100644 --- a/k8s/manifests/app-deployment.yaml +++ b/k8s/manifests/app-deployment.yaml @@ -25,23 +25,33 @@ spec: - /bin/bash - -c - | - if pgrep -x quick.cuda.MPI >/dev/null; then - pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -qv '^Z' - else + # Pass if no MPI process (waiting for jobs) + if ! pgrep -x quick.cuda.MPI > /dev/null; then + exit 0 + fi + # If process exists, check if it's zombie + if pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -q '^Z'; then + # Found zombie process, fail the probe exit 1 fi + # Process exists and is not zombie - pass + exit 0 initialDelaySeconds: 60 periodSeconds: 30 timeoutSeconds: 10 - failureThreshold: 4 + failureThreshold: 3 readinessProbe: exec: command: - /bin/bash - -c - - > - ! pgrep -x quick.cuda.MPI || - (pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -vq '^Z') + - | + # Container is ready if either: + # 1. No MPI process (waiting for jobs) + # 2. MPI process exists and is not zombie + ! pgrep -x quick.cuda.MPI > /dev/null || \ + ! pgrep -x quick.cuda.MPI | xargs -I {} ps -o state= -p {} | grep -q '^Z' + initialDelaySeconds: 10 periodSeconds: 10 timeoutSeconds: 5 failureThreshold: 3