From 5efe4a585c61e1557852d705a49d38a520fa55c3 Mon Sep 17 00:00:00 2001 From: ilya-da Date: Sat, 14 Sep 2024 14:55:41 +0300 Subject: [PATCH] Update GPU process cleanup logic in SLURM epilog script Remove redundant 'tail' command in GPU process cleanup checks to ensure more accurate detection and termination of residual GPU processes. This change optimizes the script by directly filtering out comments and unnecessary lines from the nvidia-smi output, so that it does not depend on how many comment lines the nvidia-smi output may have. --- roles/slurm/templates/etc/slurm/epilog.d/50-exclusive-gpu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/roles/slurm/templates/etc/slurm/epilog.d/50-exclusive-gpu b/roles/slurm/templates/etc/slurm/epilog.d/50-exclusive-gpu index 9284a4ead..10e8218e2 100755 --- a/roles/slurm/templates/etc/slurm/epilog.d/50-exclusive-gpu +++ b/roles/slurm/templates/etc/slurm/epilog.d/50-exclusive-gpu @@ -4,16 +4,16 @@ set -ex command -v nvidia-smi || exit 0 # Clean up processes still running. If processes don't exit node is drained. -if nvidia-smi pmon -c 1 | tail -n+3 | awk '{print $2}' | grep -v - > /dev/null +if nvidia-smi pmon -c 1 | grep -v \# | awk '{print $2}' | grep -v - > /dev/null then - for i in $(nvidia-smi pmon -c 1 | tail -n+3 | awk '{print $2}' | grep -v -) + for i in $(nvidia-smi pmon -c 1 | grep -v \# | awk '{print $2}' | grep -v -) do logger -s -t slurm-epilog "Killing residual GPU process $i ..." kill -9 "$i" done fi sleep 5 -if nvidia-smi pmon -c 1 | tail -n+3 | awk '{print $2}' | grep -v - > /dev/null +if nvidia-smi pmon -c 1 | grep -v \# | awk '{print $2}' | grep -v - > /dev/null then logger -s -t slurm-epilog 'Failed to kill residual GPU processes. Draining node ...' scontrol update nodename="$HOSTNAME" state=drain reason='Residual GPU processes found'