-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathschedulerTools.sh
188 lines (164 loc) · 5.91 KB
/
schedulerTools.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#!/bin/bash
# SCHEDULER tools (e.g. slurm)
##################################################
# functions
##################################################
# Print an error message to stderr, remove the per-user temp file, and
# terminate the whole script.
#   $1 - numeric error/exit code
#   $2 - line number where the error occurred (usually ${LINENO})
#   $3 - human-readable error message
# Globals read: user (temp-file name component; may be unset)
exitError()
{
    # best-effort cleanup of the temp file; \rm bypasses any rm alias
    \rm -f "/tmp/tmp.${user}.$$" 1>/dev/null 2>/dev/null
    echo "ERROR $1: $3" 1>&2
    echo "ERROR LOCATION=$0" 1>&2
    echo "ERROR LINE=$2" 1>&2
    exit "$1"
}
# Emit a non-fatal warning to stderr; unlike exitError, execution continues.
#   $1 - warning code
#   $2 - line number (usually ${LINENO})
#   $3 - human-readable warning message
showWarning()
{
    local code="$1"
    local line="$2"
    local msg="$3"
    printf 'WARNING %s: %s\n' "${code}" "${msg}" 1>&2
    printf 'WARNING LOCATION=%s\n' "$0" 1>&2
    printf 'WARNING LINE=%s\n' "${line}" 1>&2
}
# function to launch and wait for job (until job finishes or a
# specified timeout in seconds is reached)
#
# usage: launch_job script timeout
#
# Absolute directory containing this script, resolved at source time;
# used by run_command below to locate the per-host submit.* templates.
SCHEDULER_SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Submit a SLURM batch script and wait until the job leaves the queue (or an
# optional timeout in seconds elapses), then verify via sacct that the job
# completed successfully (state COMPLETED, exit code 0:0).
#
# Arguments:
#   $1 - path to the SLURM batch script (must exist)
#   $2 - optional timeout in seconds (must be a non-negative integer)
# Globals read: COSMO_IGNORE_SLURM_STATUS (if set, skip the sacct check),
#               slave (host name used in error messages)
# Globals written: status, squeue_out (kept non-local as in the original)
# Exits via exitError on any failure; on sacct failure the job's stdout/stderr
# (ANSI escapes stripped) and the sacct log are dumped first.
function launch_job {
    local script=$1
    local timeout=$2
    local sacct_maxwait=90
    # check sanity of arguments
    test -f "${script}" || exitError 7201 ${LINENO} "cannot find script ${script}"
    if [ -n "${timeout}" ] ; then
        # timeout must be a plain non-negative integer
        # (redirect order fixed: both stdout and stderr are discarded)
        echo "${timeout}" | grep '^[0-9][0-9]*$' > /dev/null 2>&1
        if [ $? -ne 0 ] ; then
            exitError 7203 ${LINENO} "timeout is not a number"
        fi
    fi
    # get out/err files of the SLURM job from its #SBATCH directives
    local out
    local err
    out=$(grep '^\#SBATCH --output=' "${script}" | sed 's/.*output=//g')
    err=$(grep '^\#SBATCH --error=' "${script}" | sed 's/.*error=//g')
    # submit SLURM job
    # NOTE: declaration and assignment are split; 'local res=$(sbatch …)'
    # would make $? the (always 0) status of 'local', not of sbatch,
    # leaving the 7204 error branch dead.
    local res
    res=$(sbatch "${script}" 2>&1)
    status=$?
    if [[ $res == *"QOSMaxSubmitJobPerUserLimit"* ]]; then
        echo "Partition chosen has too many jobs from us already, trying again with the normal partition."
        sed -i 's|--partition=.*|--partition=normal|g' "${script}"
        res=$(sbatch "${script}")
        if [ $? -ne 0 ] ; then
            exitError 7205 ${LINENO} "problem re-submitting SLURM batch job in normal queue"
        fi
    else
        if [ $status -ne 0 ] ; then
            exitError 7204 ${LINENO} "problem submitting SLURM batch job"
        fi
    fi
    echo "Submit message: ${res}"
    # extract the numeric job ID from the sbatch output
    echo "${res}" | grep "^Submitted batch job [0-9][0-9]*$" || exitError 7206 ${LINENO} "problem determining job ID of SLURM job"
    local jobid
    jobid=$(echo "${res}" | sed 's/^Submitted batch job //g')
    test -n "${jobid}" || exitError 7207 ${LINENO} "problem determining job ID of SLURM job"
    # wait until job has finished (or maximum sleep time has been reached)
    if [ -n "${timeout}" ] ; then
        local secs=0
        local inc=2
        local job_status="UNKNOWN"
        while [ $secs -lt $timeout ] ; do
            echo "...waiting ${inc}s for SLURM job ${jobid} to finish (status=${job_status})"
            sleep ${inc}
            secs=$((secs + inc))
            inc=60   # first poll after 2s, then every 60s
            squeue_out=$(squeue -o "%.20i %.20u %T" -h -j "${jobid}" 2>/dev/null)
            # job no longer listed by squeue -> it has left the queue
            echo "${squeue_out}" | grep "^ *${jobid} " > /dev/null 2>&1
            if [ $? -eq 1 ] ; then
                break
            fi
            job_status=$(echo ${squeue_out} | sed 's/.* //g')
        done
    fi
    # make sure that job has finished
    squeue_out=$(squeue -o "%.20i %.20u %T" -h -j "${jobid}" 2>/dev/null)
    echo "${squeue_out}" | grep "^ *${jobid} " > /dev/null 2>&1
    if [ $? -eq 0 ] ; then
        # unique error code (7207 is already used for the job-ID failure above)
        exitError 7208 ${LINENO} "batch job ${script} with ID ${jobid} on host ${slave} did not finish"
    fi
    # check for normal completion of batch job
    # Since the slurm data base may take time to update, wait until sacct_maxwait
    local sacct_wait=0
    local sacct_inc=30
    local sacct_log=sacct.${jobid}.log
    local sacct_status=1
    # XL_HACK: On tsa RH7.6 the job exits with non 0 even when the model
    # completed successfully. For the time being we ignore the slurm status
    # (a check is done on the output).
    if [ -n "${COSMO_IGNORE_SLURM_STATUS}" ]; then
        echo "!! Warning: slurm status is not checked if COSMO_IGNORE_SLURM_STATUS is set"
        return
    fi
    while [ $sacct_wait -lt $sacct_maxwait ] ; do
        sacct --jobs "${jobid}" -p -n -b -D 2>/dev/null > "${sacct_log}"
        # every record must read |COMPLETED|0:0| ; any other line means
        # the job failed or the database has not caught up yet
        grep -v '|COMPLETED|0:0|' "${sacct_log}" >/dev/null
        if [ $? -eq 0 ]; then
            # use sacct_inc consistently (was a hard-coded 30 out of sync
            # with the counter increment below)
            echo "Status not COMPLETED, waiting ${sacct_inc}s for data base update"
            sleep ${sacct_inc}
        else
            sacct_status=0
            break
        fi
        sacct_wait=$((sacct_wait + sacct_inc))
    done
    if [ $sacct_status -ne 0 ] ; then
        # dump job stdout/stderr (ANSI escape sequences stripped) and the
        # sacct log to help diagnose the failure
        if [ -n "${out}" ] ; then
            echo "=== ${out} BEGIN ==="
            /bin/sed -r "s/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g" "${out}"
            echo "=== ${out} END ==="
        fi
        if [ -n "${err}" ] ; then
            echo "=== ${err} BEGIN ==="
            /bin/sed -r "s/\x1B\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g" "${err}"
            echo "=== ${err} END ==="
        fi
        echo "=== ${sacct_log} BEGIN ==="
        cat "${sacct_log}"
        echo "=== ${sacct_log} END ==="
        exitError 7209 ${LINENO} "batch job ${script} with ID ${jobid} on host ${slave} did not complete successfully"
    fi
    rm "${sacct_log}"
}
# Function to launch a job with the scheduler, or just run it directly if the
# scheduler is "none".
#
# Arguments:
#   $1 - command line to execute
#   $2 - job name (optional; defaults to JenkinsJob${BUILD_ID})
#   $3 - path to the batch-script template (optional; falls back to
#        ${SCHEDULER_SCRIPT_DIR}/submit.${host}.${scheduler})
#   $4 - wall-clock limit in minutes (passed to launch_job as seconds)
# Globals read: scheduler, host, nthreads, BUILD_ID
# Side effects: the template file is modified in place (placeholders
# substituted); the job output file is printed and removed.
function run_command {
    local CMD=$1
    local NAME=$2
    local SCRIPT=$3
    local MINUTES=$4
    if [ "${scheduler}" != "none" ] ; then
        # test if the batch script exists; if not, fall back to the
        # per-host/per-scheduler submit template
        test -f "${SCRIPT}" || SCRIPT="${SCHEDULER_SCRIPT_DIR}/submit.${host}.${scheduler}"
        test -f "${SCRIPT}" || exitError 1252 ${LINENO} "cannot find script ${SCRIPT}"
        # setup job
        # set a generic output filename if it's not provided as an input
        # (quoted test: an empty or space-containing NAME no longer breaks '[')
        if [ -z "${NAME}" ] ; then
            NAME="JenkinsJob${BUILD_ID}"
        fi
        OUT="${NAME}.out"
        # substitute the template placeholders in place
        sed -i "s|<OUTFILE>|$OUT|g" "$SCRIPT"
        sed -i "s|<CMD>|$CMD|g" "$SCRIPT"
        sed -i "s|<NAME>|$NAME|g" "$SCRIPT"
        sed -i "s|<NTASKS>|1|g" "$SCRIPT"
        sed -i "s|<NTASKSPERNODE>|$nthreads|g" "$SCRIPT"
        sed -i "s|<CPUSPERTASK>|1|g" "$SCRIPT"
        sed -i "s|<TIMEOUT>|$MINUTES|g" "$SCRIPT"
        # show the contents of the resulting script to be submitted
        echo "Submitting slurm script:"
        cat "$SCRIPT"
        # submit SLURM job (timeout converted from minutes to seconds)
        launch_job "$SCRIPT" $((MINUTES * 60))
        if [ $? -ne 0 ] ; then
            exitError 1251 ${LINENO} "problem launching SLURM job ${SCRIPT}"
        fi
        # echo output of SLURM job
        cat "${OUT}"
        rm "${OUT}"
    else
        # no scheduler: run the command in the current shell
        eval "${CMD}"
    fi
}