-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #40 from SeisSol/zihua/testing
remote Jupyter Lab on Frontera
- Loading branch information
Showing
12 changed files
with
335 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
#!/bin/bash | ||
# | ||
#----------------------------------------------------------------------------- | ||
# This script was generated automatically by the TACC Analytic Portal (TAP) | ||
# | ||
# This TAP job script is designed to create a jupyter notebook session on | ||
# remote nodes through the SLURM batch system. Once the job | ||
# is scheduled, check the output of your job (which by default is | ||
# stored in your home directory in a file named jupyter.out) | ||
# and it will tell you the port number that has been setup for you so | ||
# that you can attach via a separate web browser to any remote login node | ||
# | ||
# Note: you can fine tune the SLURM submission variables below as | ||
# needed. Typical items to change are the runtime limit, location of | ||
# the job output, and the allocation project to submit against (it is | ||
# commented out for now, but is required if you have multiple | ||
# allocations). | ||
# | ||
#----------------------------------------------------------------------------- | ||
# | ||
#SBATCH -J tap_jupyter # Job name | ||
#SBATCH -o jupyter.out # Name of stdout output file (%j expands to jobId) | ||
#SBATCH -p development # Queue name | ||
#SBATCH -N 1 # Total number of nodes requested | ||
#SBATCH -n 2 # Total number of mpi tasks requested | ||
#SBATCH -t 02:00:00 # Run time (hh:mm:ss) | ||
# | ||
# | ||
#-------------------------------------------------------------------------- | ||
|
||
#-------------------------------------------------------------------------- | ||
# ---- You normally should not need to edit anything below this point ----- | ||
#-------------------------------------------------------------------------- | ||
# | ||
# last update: pnav 20221013 | ||
|
||
echo "TACC: job ${SLURM_JOB_ID} execution at: $(date)" | ||
|
||
TAP_FUNCTIONS="/share/doc/slurm/tap_functions" | ||
if [ -f ${TAP_FUNCTIONS} ]; then | ||
. ${TAP_FUNCTIONS} | ||
else | ||
echo "TACC:" | ||
echo "TACC: ERROR - could not find TAP functions file: ${TAP_FUNCTIONS}" | ||
echo "TACC: ERROR - Please submit a consulting ticket at the TACC user portal" | ||
echo "TACC: ERROR - https://portal.tacc.utexas.edu/tacc-consulting/-/consult/tickets/create" | ||
echo "TACC:" | ||
echo "TACC: job $SLURM_JOB_ID execution finished at: `date`" | ||
exit 1 | ||
fi | ||
|
||
# our node name | ||
NODE_HOSTNAME=$(hostname -s) | ||
echo "TACC: running on node ${NODE_HOSTNAME}" | ||
|
||
echo "TACC: unloading xalt" | ||
module unload xalt | ||
|
||
echo "MNMN: install python libraries" | ||
module load python3/3.9.2 | ||
export PATH="$PATH:$HOME/.local" | ||
# pip install --user obspy cartopy | ||
|
||
# urllib compatibility | ||
pip uninstall -y urllib3 | ||
pip install --user 'urllib3<2.0' | ||
pip install vtk pyvista | ||
|
||
echo "MNMN: load appatainer module" | ||
module load tacc-apptainer | ||
|
||
echo "MNMN: prepare the custom image" | ||
|
||
# | ||
SHARED_PATH="/your/path/to/container/" | ||
SIF_NAME="training_latest.sif" | ||
|
||
if [ ! -f $SIF_NAME ]; then | ||
if [ ! -f $SHARED_PATH/$SIF_NAME ]; then | ||
# load the image if no image exists in the shared directory | ||
echo "MNMN: pull the appatainer image" | ||
apptainer pull -F docker://seissol/training:latest | ||
else | ||
# create symlink to the shared directory | ||
echo "MNMN: create symlink to the shared directory" | ||
ln -s $SHARED_PATH/$SIF_NAME $SIF_NAME | ||
fi | ||
fi | ||
|
||
# use jupyter-lab if it exists, otherwise jupyter-notebook | ||
JUPYTER_BIN=$(which jupyter-lab 2> /dev/null) | ||
|
||
if [ -z "${JUPYTER_BIN}" ]; then | ||
JUPYTER_BIN=$(which jupyter-notebook 2> /dev/null) | ||
if [ -z "${JUPYTER_BIN}" ]; then | ||
echo "TACC: ERROR - could not find jupyter install" | ||
echo "TACC: loaded modules below" | ||
module list | ||
echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" | ||
exit 1 | ||
else | ||
JUPYTER_SERVER_APP="NotebookApp" | ||
fi | ||
else | ||
JUPYTER_SERVER_VERSION=$(${JUPYTER_BIN} --version) | ||
if [ ${JUPYTER_SERVER_VERSION%%.*} -lt 3 ]; then | ||
JUPYTER_SERVER_APP="NotebookApp" | ||
else | ||
JUPYTER_SERVER_APP="ServerApp" | ||
fi | ||
fi | ||
echo "TACC: using jupyter binary ${JUPYTER_BIN}" | ||
|
||
|
||
if $(echo ${JUPYTER_BIN} | grep -qve '^/opt') ; then | ||
echo "TACC: WARNING - non-system python detected. Script may not behave as expected" | ||
fi | ||
|
||
NB_SERVERDIR=${HOME}/.jupyter | ||
IP_CONFIG=${NB_SERVERDIR}/jupyter_notebook_config.py | ||
|
||
# make .jupyter dir for logs | ||
mkdir -p ${NB_SERVERDIR} | ||
|
||
mkdir -p ${HOME}/.tap # this should exist at this point, but just in case... | ||
TAP_LOCKFILE=${HOME}/.tap/.${SLURM_JOB_ID}.lock | ||
TAP_CERTFILE=${HOME}/.tap/.${SLURM_JOB_ID} | ||
|
||
# bail if we cannot create a secure session | ||
if [ ! -f ${TAP_CERTFILE} ]; then | ||
echo "TACC: ERROR - could not find TLS cert for secure session" | ||
echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" | ||
exit 1 | ||
fi | ||
|
||
# bail if we cannot create a token for the session | ||
TAP_TOKEN=$(tap_get_token) | ||
if [ -z "${TAP_TOKEN}" ]; then | ||
echo "TACC: ERROR - could not generate token for notebook" | ||
echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" | ||
exit 1 | ||
fi | ||
echo "TACC: using token ${TAP_TOKEN}" | ||
|
||
# create the tap jupyter config if needed | ||
TAP_JUPYTER_CONFIG="${HOME}/.tap/jupyter_config.py" | ||
if [ ${JUPYTER_SERVER_APP} == "NotebookApp" ]; then | ||
cat <<- EOF > ${TAP_JUPYTER_CONFIG} | ||
# Configuration file for TAP jupyter-notebook | ||
import ssl | ||
c = get_config() | ||
c.IPKernelApp.pylab = "inline" # if you want plotting support always | ||
c.NotebookApp.ip = "0.0.0.0" | ||
c.NotebookApp.port = 5902 | ||
c.NotebookApp.open_browser = False | ||
c.NotebookApp.allow_origin = u"*" | ||
c.NotebookApp.ssl_options={"ssl_version": ssl.PROTOCOL_TLSv1_2} | ||
c.NotebookApp.mathjax_url = u"https://cdn.mathjax.org/mathjax/latest/MathJax.js" | ||
EOF | ||
else | ||
cat <<- EOF > ${TAP_JUPYTER_CONFIG} | ||
# Configuration file for TAP jupyter-notebook | ||
import ssl | ||
c = get_config() | ||
c.IPKernelApp.pylab = "inline" # if you want plotting support always | ||
c.ServerApp.ip = "0.0.0.0" | ||
c.ServerApp.port = 5902 | ||
c.ServerApp.open_browser = False | ||
c.ServerApp.allow_origin = u"*" | ||
c.ServerApp.ssl_options={"ssl_version": ssl.PROTOCOL_TLSv1_2} | ||
c.NotebookApp.mathjax_url = u"https://cdn.mathjax.org/mathjax/latest/MathJax.js" | ||
EOF | ||
fi | ||
|
||
# launch jupyter | ||
JUPYTER_LOGFILE=${NB_SERVERDIR}/${NODE_HOSTNAME}.log | ||
JUPYTER_ARGS="--certfile=$(cat ${TAP_CERTFILE}) --config=${TAP_JUPYTER_CONFIG} --${JUPYTER_SERVER_APP}.token=${TAP_TOKEN}" | ||
echo "TACC: using jupyter command: ${JUPYTER_BIN} ${JUPYTER_ARGS}" | ||
nohup ${JUPYTER_BIN} ${JUPYTER_ARGS} &> ${JUPYTER_LOGFILE} && rm ${TAP_LOCKFILE} & | ||
#sleep 120 && rm -f $(cat ${TAP_CERTFILE}) && rm -f ${TAP_CERTFILE} & | ||
JUPYTER_PID=$! | ||
LOCAL_PORT=5902 | ||
|
||
LOGIN_PORT=$(tap_get_port) | ||
echo "TACC: got login node jupyter port ${LOGIN_PORT}" | ||
|
||
JUPYTER_URL="https://frontera.tacc.utexas.edu:${LOGIN_PORT}/?token=${TAP_TOKEN}" | ||
|
||
# verify jupyter is up. if not, give one more try, then bail | ||
if ! $(ps -fu ${USER} | grep ${JUPYTER_BIN} | grep -qv grep) ; then | ||
# sometimes jupyter has a bad day. give it another chance to be awesome. | ||
echo "TACC: first jupyter launch failed. Retrying..." | ||
nohup ${JUPYTER_BIN} ${JUPYTER_ARGS} &> ${JUPYTER_LOGFILE} && rm ${TAP_LOCKFILE} & | ||
fi | ||
if ! $(ps -fu ${USER} | grep ${JUPYTER_BIN} | grep -qv grep) ; then | ||
# jupyter will not be working today. sadness. | ||
echo "TACC: ERROR - jupyter failed to launch" | ||
echo "TACC: ERROR - this is often due to an issue in your python or conda environment" | ||
echo "TACC: ERROR - jupyter logfile contents:" | ||
cat ${JUPYTER_LOGFILE} | ||
echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" | ||
exit 1 | ||
fi | ||
|
||
# create reverse tunnel port to login nodes. Make one tunnel for each login so the user can just | ||
# connect to frontera.tacc.utexas.edu | ||
NUM_LOGINS=4 | ||
for i in $(seq ${NUM_LOGINS}); do | ||
ssh -q -f -g -N -R ${LOGIN_PORT}:${NODE_HOSTNAME}:${LOCAL_PORT} login${i} | ||
done | ||
if [ $(ps -fu ${USER} | grep ssh | grep login | grep -vc grep) != ${NUM_LOGINS} ]; then | ||
# jupyter will not be working today. sadness. | ||
echo "TACC: ERROR - ssh tunnels failed to launch" | ||
echo "TACC: ERROR - this is often due to an issue with your ssh keys" | ||
echo "TACC: ERROR - undo any recent mods in ${HOME}/.ssh" | ||
echo "TACC: ERROR - or submit a TACC consulting ticket with this error" | ||
echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" | ||
exit 1 | ||
fi | ||
echo "TACC: created reverse ports on Frontera logins" | ||
|
||
echo "TACC: Your jupyter notebook server is now running at ${JUPYTER_URL}" | ||
|
||
# spin on lock until file is removed | ||
TAP_CONNECTION=${HOME}/.tap/.${SLURM_JOB_ID}.url | ||
echo ${JUPYTER_URL} > ${TAP_CONNECTION} | ||
echo $(date) > ${TAP_LOCKFILE} | ||
while [ -f ${TAP_LOCKFILE} ]; do | ||
sleep 1 | ||
done | ||
|
||
# job is done! | ||
echo "TACC: release port returned $(tap_release_port ${LOGIN_PORT})" | ||
|
||
# wait a brief moment so jupyter can clean up after itself | ||
sleep 1 | ||
|
||
echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.