#!/usr/bin/env bash
# https://docs.gitlab.com/runner/executors/custom.html#run
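# Run step of the GitLab custom executor: either execute the step script
# directly on the login node inside an enroot container, or accumulate the
# step scripts and run them through a single SLURM batch job.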
# shellcheck source=./include.sh
source "${BASH_SOURCE[0]%/*}/include.sh"
ensure_executable_available enroot
# External args
# The last argument is always the step name; the second-to-last is the step script
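# (illustration) the runner invokes this stage roughly as:
#   run.sh <path-to-generated-script> <step-name>, e.g. "run.sh /tmp/xxxx step_script"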
before_last=$(($#-1))
STEP_NAME_ARG="${!#}"
STEP_SCRIPT_ARG="${!before_last}"
if [[ "${STEP_NAME_ARG}" == "step_script" || "${STEP_NAME_ARG}" == "build_script" ]]; then
  echo -e "VOLUMES configuration:"
  printf "\t%s\n" "${ENROOT_MOUNT_OPTIONS[@]}"
  echo -e "\n"
fi
# No slurm requested or required, directly use the login node
if [[ -z "${USE_SLURM}" || ${USE_SLURM} -eq 0 ||
      # All scripts from after_script onward are executed on the login node
      # see https://docs.gitlab.com/runner/executors/custom.html#run
      "${STEP_NAME_ARG}" == "after_script" ||
      "${STEP_NAME_ARG}" == "cleanup_file_variables" ||
      "${STEP_NAME_ARG}" == *"archive"* ||
      "${STEP_NAME_ARG}" == *"upload_artifacts"* ]]; then
  # Enroot fails when quoting anything or splitting this command. Leave it in
  # this format.
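  # Note: "NVIDIA_VISIBLE_DEVICES=void" below tells the NVIDIA container hooks
  # not to expose any GPU inside the container; presumably GPUs are only meant
  # to be used inside the SLURM jobs, not on the login node.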
  #
  # shellcheck disable=SC2206
  COMMAND=(enroot start ${ENROOT_REMAP_ROOT} ${ENROOT_MOUNT_OPTIONS[*]} --rw ${ENROOT_ENV_CONFIG[*]} -e "NVIDIA_VISIBLE_DEVICES=void" ${CONTAINER_NAME} /bin/bash)
  "${COMMAND[@]}" < "${STEP_SCRIPT_ARG}" || die "Command: ${COMMAND[*]} failed with exit code ${?}"
else # SLURM usage requested
  ensure_executable_available sacct
  ensure_executable_available scancel
  ensure_executable_available sbatch
  ensure_executable_available srun
  ensure_executable_available squeue
  ensure_executable_available wc
  ensure_executable_available awk
  # We need to create the temporary files in a directory that is accessible
  # from all nodes. Because we consider ${CONTAINER_NAME} to be unique, we use
  # it as the storage location for this job.
  WORK_DIR="${CI_WS}/${CONTAINER_NAME}"
  STEP_SCRIPT_DIR="${WORK_DIR}/step_scripts"
  if [[ ! -d "${WORK_DIR}" ]]; then
    mkdir -p "${WORK_DIR}"
  fi
  if [[ ! -d "${STEP_SCRIPT_DIR}" ]]; then
    mkdir -p "${STEP_SCRIPT_DIR}"
  fi
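  # Step scripts are stored under consecutive integer names (0, 1, 2, ...) so
  # that "ls -1v" in the generated batch script below replays them in the
  # order in which they were received.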
  NUM_SCRIPTS="$(find "${STEP_SCRIPT_DIR}" -maxdepth 1 -type f | wc -l)"
  STEP_SCRIPT="${STEP_SCRIPT_DIR}/${NUM_SCRIPTS}"
  touch "${STEP_SCRIPT}"
  JOB_SCRIPT=$(mktemp -p "${WORK_DIR}")
  # Save the step script
  cp "${STEP_SCRIPT_ARG}" "${STEP_SCRIPT}"
  # Only store the GitLab scripts until we reach the main {build,step}_script
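  # (for reference) the runner calls this executable once per sub-stage, e.g.
  # prepare_script, get_sources, restore_cache, download_artifacts, ... before
  # the main step_script; those scripts are accumulated here and later
  # replayed inside a single SLURM job.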
  if [[ ! "${STEP_NAME_ARG}" =~ ^[bs].*"_script" ]]; then
    echo -e "Storing the script for step ${STEP_NAME_ARG} for bulk submission."
    exit
  fi
  # We finally reached the main script; prepare the SLURM job
  JOB_LOG=$(mktemp -p "${WORK_DIR}")
  JOB_ERR=$(mktemp -p "${WORK_DIR}")
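  # The SLURM options below come from CI/CD variables; the custom executor
  # exposes job variables with a CUSTOM_ENV_ prefix, so e.g. setting
  # SLURM_PARTITION in .gitlab-ci.yml shows up here as
  # CUSTOM_ENV_SLURM_PARTITION.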
  SLURM_CONFIG=("--job-name=${CONTAINER_NAME}")
  SLURM_CONFIG+=("--output=${JOB_LOG}")
  SLURM_CONFIG+=("--error=${JOB_ERR}")
  SLURM_CONFIG+=("--chdir=${WORK_DIR}")
  if [[ -n "${CUSTOM_ENV_SLURM_PARTITION}" ]]; then
    SLURM_CONFIG+=("--partition=${CUSTOM_ENV_SLURM_PARTITION}")
  fi
  if [[ -n "${CUSTOM_ENV_SLURM_EXCLUSIVE}" ]]; then
    SLURM_CONFIG+=("--exclusive")
  fi
  if [[ -n "${CUSTOM_ENV_SLURM_TIME}" ]]; then
    SLURM_CONFIG+=("--time=${CUSTOM_ENV_SLURM_TIME}")
  fi
  if [[ -n "${CUSTOM_ENV_SLURM_GRES}" ]]; then
    SLURM_CONFIG+=("--gres=${CUSTOM_ENV_SLURM_GRES}")
  fi
  if [[ -n "${CUSTOM_ENV_SLURM_ACCOUNT}" ]]; then
    SLURM_CONFIG+=("--account=${CUSTOM_ENV_SLURM_ACCOUNT}")
  fi
  # Log the configuration
  echo -e "SLURM configuration:"
  printf "\t%s\n" "${SLURM_CONFIG[@]}"
  echo -e "\n"
  echo -e "ENROOT environment configuration:"
  printf "\t%s\n" "${ENROOT_ENV_CONFIG[@]}"
  echo -e "\n"
  # Launch the container through SLURM
  # Somehow, this script doesn't like it when the variables are surrounded by "
  echo -e "#!/usr/bin/env bash
for scriptnum in \$(ls -1v ${STEP_SCRIPT_DIR}); do
srun enroot start ${ENROOT_REMAP_ROOT} ${ENROOT_MOUNT_OPTIONS[*]} --rw ${ENROOT_ENV_CONFIG[*]} \
${CONTAINER_NAME} /bin/bash < ${STEP_SCRIPT_DIR}/\${scriptnum}
done
" > "${JOB_SCRIPT}"
  chmod +x "${JOB_SCRIPT}"
  # Submission
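  # "--parsable" makes sbatch print just the job ID so it can be captured
  # directly into JOB_ID.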
  # shellcheck disable=SC2206
  COMMAND=(sbatch --parsable ${SLURM_CONFIG[*]} ${JOB_SCRIPT})
  JOB_ID=$("${COMMAND[@]}") || \
    die "Command: ${COMMAND[*]} failed with exit code ${?}" "${WORK_DIR}"
  echo -e "Job submitted and pending with ID: ${JOB_ID}."
  squeue -u "${USER}"
  # Store the JOB_ID so `cleanup.sh` can read it and cancel the job if running
  # (e.g., when pressing the cancel button on gitlab). We consider that the
  # CONTAINER_NAME is unique at a given time, so we don't use locking or a list
  # of ids.
  echo "${JOB_ID}" > "${SLURM_IDS_PATH}/${CONTAINER_NAME}.txt"
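  # Wait for the job to leave the pending state (within SLURM_PENDING_LIMIT),
  # then for it to complete (within SLURM_RUNNING_LIMIT); the limits and the
  # accepted status lists are presumably defined in include.sh.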
  slurm_wait_for_status "${JOB_ID}" "${SLURM_PENDING_LIMIT}" \
    "${SLURM_GOOD_PENDING_STATUS}" || die "encountered an error while waiting" \
    "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}"
  echo -e "Job ${JOB_ID} started execution."
  slurm_wait_for_status "${JOB_ID}" "${SLURM_RUNNING_LIMIT}" \
    "${SLURM_GOOD_COMPLETED_STATUS}" || die "encountered an error while waiting" \
    "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}"
  test -f "${JOB_ERR}" && test "$(cat "${JOB_ERR}")" != "" && \
    die "encountered an error during execution" "${WORK_DIR}" "${JOB_ID}" "${JOB_LOG}" "${JOB_ERR}"
  echo -e "Job ${JOB_ID} completed."
  slurm_print_output "${JOB_ID}" "Log" "${JOB_LOG}" /dev/stdout
  slurm_print_output "${JOB_ID}" "Errors" "${JOB_ERR}" /dev/stdout
  # Cleanup the workdir
  rm -rf "${WORK_DIR}"
fi