From 2968f3c17c40bee67b45370a7ce167dc21a84f28 Mon Sep 17 00:00:00 2001 From: Thomas-Ulrich Date: Wed, 22 May 2024 10:48:43 +0200 Subject: [PATCH 1/3] use latest, change default SIF_NAME, change ln command --- README.md | 2 +- frontera.md | 42 +--- job.jupyter | 238 ------------------ ...odules.sh => setup_modules_Frontera_vnc.sh | 2 +- 4 files changed, 14 insertions(+), 270 deletions(-) delete mode 100644 job.jupyter rename source_modules.sh => setup_modules_Frontera_vnc.sh (97%) diff --git a/README.md b/README.md index 3108beb..301d607 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The Docker container contains an interactive learning environment (Jupyter) whic Please install [Docker](https://docs.docker.com/engine/install/), launch the Docker Desktop and then run (i) For Frontera and all Intel/AMD machines: - - use label: hps-2024-frontera + - use label: latest (ii) Macs with M1/M2/M3 ARM CPUs: - use label: hps-2024-remote-arm diff --git a/frontera.md b/frontera.md index b9e357a..5305b7d 100644 --- a/frontera.md +++ b/frontera.md @@ -12,18 +12,19 @@ Then execute: ``` module load tacc-apptainer -apptainer pull -F docker://seissol/training:hps-2024-frontera +apptainer pull -F docker://seissol/training:latest apptainer build -f my-training.sif singularity.def -apptainer run my-training.sif +ln -sf $(realpath my-training.sif) ~/my-training.sif +apptainer run ~/my-training.sif ``` You can also use the automatically generated container after pulling the docker container ``` module load tacc-apptainer -apptainer pull -F docker://seissol/training:hps-2024-frontera -apptainer run training_latest.sif -ln -s /absolute/path/to/training_latest.sif ~/my-training.sif +apptainer pull -F docker://seissol/training:latest +ln -sf $(realpath latest.sif) ~/my-training.sif +apptainer run ~/my-training.sif ``` You can abort the jupyter lab with Ctrl-C, confirm with `y`. @@ -68,40 +69,21 @@ Sulawesi RS | 6 min TPV13 | 12 s ## Interacting with Frontera from local machine + We present a workflow for running a Jupyter Lab remotely on Frontera, while interacting with it on your local machine. You can take the following steps: -Step 1: change `SHARED_PATH="/your/path/to/container/"` in line 75 of `job.jupyter` to the path where your sigularity container is built. +Step 1: pull the docker and create the symbolic link to ~/my-training.sif as described above -Step 2: Run -``` -sbatch -A job.jupyter -``` +Step 2: submit a VNC job on https://tap.tacc.utexas.edu/jobs/ (e.g. 1 node, 1 task) -Step 3: Check the job status with -``` -squeue -u $USER -``` +Step 3: Wait till job status: running and click on Connect. -Step 4: Once the status changes from `PD` to `R`, you will find the job output in a generated file `jupyter.out`. +Step 4: open a terminal on the remote desktop, and `source setup_modules_Frontera_vnc.sh` -Step 5: Check the last few lines with -``` -tail -f jupyter.out -``` -wait a few seconds until you get in `jupyter.out` something like: -``` -TACC: got login node jupyter port 60320 -TACC: created reverse ports on Frontera logins -TACC: Your jupyter notebook server is now running at https://frontera.tacc.utexas.edu:60320/?token=2e0fade1f8b1ce00b303a7e97dd962c5cd10c17f03a245e8c761ca7e1d5e1597 -``` -(and then Ctrl+C to stop monitoring the contents of `jupyter.out`) +Step 5: Run `swp -p 1 jupyter notebook` as suggested by the script. The jupyterlab should open. -Step 6: Paste the link to your local browser, you will have access to the Frontera environment on your local machine. -``` -https://frontera.tacc.utexas.edu:60320/?token=2e0fade1f8b1ce00b303a7e97dd962c5cd10c17f03a245e8c761ca7e1d5e1597 -``` ## Visualization diff --git a/job.jupyter b/job.jupyter deleted file mode 100644 index 3555d72..0000000 --- a/job.jupyter +++ /dev/null @@ -1,238 +0,0 @@ -#!/bin/bash -# -#----------------------------------------------------------------------------- -# This script was generated automatically by the TACC Analytic Portal (TAP) -# -# This TAP job script is designed to create a jupyter notebook session on -# remote nodes through the SLURM batch system. Once the job -# is scheduled, check the output of your job (which by default is -# stored in your home directory in a file named jupyter.out) -# and it will tell you the port number that has been setup for you so -# that you can attach via a separate web browser to any remote login node -# -# Note: you can fine tune the SLURM submission variables below as -# needed. Typical items to change are the runtime limit, location of -# the job output, and the allocation project to submit against (it is -# commented out for now, but is required if you have multiple -# allocations). -# -#----------------------------------------------------------------------------- -# -#SBATCH -J tap_jupyter # Job name -#SBATCH -o jupyter.out # Name of stdout output file (%j expands to jobId) -#SBATCH -p development # Queue name -#SBATCH -N 1 # Total number of nodes requested -#SBATCH -n 2 # Total number of mpi tasks requested -#SBATCH -t 02:00:00 # Run time (hh:mm:ss) -# -# -#-------------------------------------------------------------------------- - -#-------------------------------------------------------------------------- -# ---- You normally should not need to edit anything below this point ----- -#-------------------------------------------------------------------------- -# -# last update: pnav 20221013 - -echo "TACC: job ${SLURM_JOB_ID} execution at: $(date)" - -TAP_FUNCTIONS="/share/doc/slurm/tap_functions" -if [ -f ${TAP_FUNCTIONS} ]; then - . ${TAP_FUNCTIONS} -else - echo "TACC:" - echo "TACC: ERROR - could not find TAP functions file: ${TAP_FUNCTIONS}" - echo "TACC: ERROR - Please submit a consulting ticket at the TACC user portal" - echo "TACC: ERROR - https://portal.tacc.utexas.edu/tacc-consulting/-/consult/tickets/create" - echo "TACC:" - echo "TACC: job $SLURM_JOB_ID execution finished at: `date`" - exit 1 -fi - -# our node name -NODE_HOSTNAME=$(hostname -s) -echo "TACC: running on node ${NODE_HOSTNAME}" - -echo "TACC: unloading xalt" -module unload xalt - -echo "MNMN: install python libraries" -module load python3/3.9.2 -export PATH="$PATH:$HOME/.local" -# pip install --user obspy cartopy - -# urllib compatibility -pip uninstall -y urllib3 -pip install --user 'urllib3<2.0' -pip install vtk pyvista - -echo "MNMN: load appatainer module" -module load tacc-apptainer - -echo "MNMN: prepare the custom image" - -# -SHARED_PATH="/your/path/to/container/" -SIF_NAME="training_latest.sif" - -if [ ! -f $SIF_NAME ]; then - if [ ! -f $SHARED_PATH/$SIF_NAME ]; then - # load the image if no image exists in the shared directory - echo "MNMN: pull the appatainer image" - apptainer pull -F docker://seissol/training:latest - else - # create symlink to the shared directory - echo "MNMN: create symlink to the shared directory" - ln -s $SHARED_PATH/$SIF_NAME $SIF_NAME - fi -fi - -# use jupyter-lab if it exists, otherwise jupyter-notebook -JUPYTER_BIN=$(which jupyter-lab 2> /dev/null) - -if [ -z "${JUPYTER_BIN}" ]; then - JUPYTER_BIN=$(which jupyter-notebook 2> /dev/null) - if [ -z "${JUPYTER_BIN}" ]; then - echo "TACC: ERROR - could not find jupyter install" - echo "TACC: loaded modules below" - module list - echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" - exit 1 - else - JUPYTER_SERVER_APP="NotebookApp" - fi -else - JUPYTER_SERVER_VERSION=$(${JUPYTER_BIN} --version) - if [ ${JUPYTER_SERVER_VERSION%%.*} -lt 3 ]; then - JUPYTER_SERVER_APP="NotebookApp" - else - JUPYTER_SERVER_APP="ServerApp" - fi -fi -echo "TACC: using jupyter binary ${JUPYTER_BIN}" - - -if $(echo ${JUPYTER_BIN} | grep -qve '^/opt') ; then - echo "TACC: WARNING - non-system python detected. Script may not behave as expected" -fi - -NB_SERVERDIR=${HOME}/.jupyter -IP_CONFIG=${NB_SERVERDIR}/jupyter_notebook_config.py - -# make .jupyter dir for logs -mkdir -p ${NB_SERVERDIR} - -mkdir -p ${HOME}/.tap # this should exist at this point, but just in case... -TAP_LOCKFILE=${HOME}/.tap/.${SLURM_JOB_ID}.lock -TAP_CERTFILE=${HOME}/.tap/.${SLURM_JOB_ID} - -# bail if we cannot create a secure session -if [ ! -f ${TAP_CERTFILE} ]; then - echo "TACC: ERROR - could not find TLS cert for secure session" - echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" - exit 1 -fi - -# bail if we cannot create a token for the session -TAP_TOKEN=$(tap_get_token) -if [ -z "${TAP_TOKEN}" ]; then - echo "TACC: ERROR - could not generate token for notebook" - echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" - exit 1 -fi -echo "TACC: using token ${TAP_TOKEN}" - -# create the tap jupyter config if needed -TAP_JUPYTER_CONFIG="${HOME}/.tap/jupyter_config.py" -if [ ${JUPYTER_SERVER_APP} == "NotebookApp" ]; then -cat <<- EOF > ${TAP_JUPYTER_CONFIG} -# Configuration file for TAP jupyter-notebook -import ssl -c = get_config() -c.IPKernelApp.pylab = "inline" # if you want plotting support always -c.NotebookApp.ip = "0.0.0.0" -c.NotebookApp.port = 5902 -c.NotebookApp.open_browser = False -c.NotebookApp.allow_origin = u"*" -c.NotebookApp.ssl_options={"ssl_version": ssl.PROTOCOL_TLSv1_2} -c.NotebookApp.mathjax_url = u"https://cdn.mathjax.org/mathjax/latest/MathJax.js" -EOF -else -cat <<- EOF > ${TAP_JUPYTER_CONFIG} -# Configuration file for TAP jupyter-notebook -import ssl -c = get_config() -c.IPKernelApp.pylab = "inline" # if you want plotting support always -c.ServerApp.ip = "0.0.0.0" -c.ServerApp.port = 5902 -c.ServerApp.open_browser = False -c.ServerApp.allow_origin = u"*" -c.ServerApp.ssl_options={"ssl_version": ssl.PROTOCOL_TLSv1_2} -c.NotebookApp.mathjax_url = u"https://cdn.mathjax.org/mathjax/latest/MathJax.js" -EOF -fi - -# launch jupyter -JUPYTER_LOGFILE=${NB_SERVERDIR}/${NODE_HOSTNAME}.log -JUPYTER_ARGS="--certfile=$(cat ${TAP_CERTFILE}) --config=${TAP_JUPYTER_CONFIG} --${JUPYTER_SERVER_APP}.token=${TAP_TOKEN}" -echo "TACC: using jupyter command: ${JUPYTER_BIN} ${JUPYTER_ARGS}" -nohup ${JUPYTER_BIN} ${JUPYTER_ARGS} &> ${JUPYTER_LOGFILE} && rm ${TAP_LOCKFILE} & -#sleep 120 && rm -f $(cat ${TAP_CERTFILE}) && rm -f ${TAP_CERTFILE} & -JUPYTER_PID=$! -LOCAL_PORT=5902 - -LOGIN_PORT=$(tap_get_port) -echo "TACC: got login node jupyter port ${LOGIN_PORT}" - -JUPYTER_URL="https://frontera.tacc.utexas.edu:${LOGIN_PORT}/?token=${TAP_TOKEN}" - -# verify jupyter is up. if not, give one more try, then bail -if ! $(ps -fu ${USER} | grep ${JUPYTER_BIN} | grep -qv grep) ; then - # sometimes jupyter has a bad day. give it another chance to be awesome. - echo "TACC: first jupyter launch failed. Retrying..." - nohup ${JUPYTER_BIN} ${JUPYTER_ARGS} &> ${JUPYTER_LOGFILE} && rm ${TAP_LOCKFILE} & -fi -if ! $(ps -fu ${USER} | grep ${JUPYTER_BIN} | grep -qv grep) ; then - # jupyter will not be working today. sadness. - echo "TACC: ERROR - jupyter failed to launch" - echo "TACC: ERROR - this is often due to an issue in your python or conda environment" - echo "TACC: ERROR - jupyter logfile contents:" - cat ${JUPYTER_LOGFILE} - echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" - exit 1 -fi - -# create reverse tunnel port to login nodes. Make one tunnel for each login so the user can just -# connect to frontera.tacc.utexas.edu -NUM_LOGINS=4 -for i in $(seq ${NUM_LOGINS}); do - ssh -q -f -g -N -R ${LOGIN_PORT}:${NODE_HOSTNAME}:${LOCAL_PORT} login${i} -done -if [ $(ps -fu ${USER} | grep ssh | grep login | grep -vc grep) != ${NUM_LOGINS} ]; then - # jupyter will not be working today. sadness. - echo "TACC: ERROR - ssh tunnels failed to launch" - echo "TACC: ERROR - this is often due to an issue with your ssh keys" - echo "TACC: ERROR - undo any recent mods in ${HOME}/.ssh" - echo "TACC: ERROR - or submit a TACC consulting ticket with this error" - echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" - exit 1 -fi -echo "TACC: created reverse ports on Frontera logins" - -echo "TACC: Your jupyter notebook server is now running at ${JUPYTER_URL}" - -# spin on lock until file is removed -TAP_CONNECTION=${HOME}/.tap/.${SLURM_JOB_ID}.url -echo ${JUPYTER_URL} > ${TAP_CONNECTION} -echo $(date) > ${TAP_LOCKFILE} -while [ -f ${TAP_LOCKFILE} ]; do - sleep 1 -done - -# job is done! -echo "TACC: release port returned $(tap_release_port ${LOGIN_PORT})" - -# wait a brief moment so jupyter can clean up after itself -sleep 1 - -echo "TACC: job ${SLURM_JOB_ID} execution finished at: $(date)" diff --git a/source_modules.sh b/setup_modules_Frontera_vnc.sh similarity index 97% rename from source_modules.sh rename to setup_modules_Frontera_vnc.sh index 7a73f70..2251a51 100644 --- a/source_modules.sh +++ b/setup_modules_Frontera_vnc.sh @@ -29,7 +29,7 @@ module load tacc-apptainer # SHARED_PATH="/your/path/to/container/" -SIF_NAME="training_latest.sif" +SIF_NAME="$HOME/my-training.sif" if [ ! -f $SIF_NAME ]; then if [ ! -f $SHARED_PATH/$SIF_NAME ]; then From fadfe816d8da103e80ac241a1c0027d1b4f32af9 Mon Sep 17 00:00:00 2001 From: Thomas-Ulrich Date: Wed, 22 May 2024 14:51:46 +0200 Subject: [PATCH 2/3] address problem identified by Zihua --- setup_modules_Frontera_vnc.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup_modules_Frontera_vnc.sh b/setup_modules_Frontera_vnc.sh index 2251a51..65fd2d9 100644 --- a/setup_modules_Frontera_vnc.sh +++ b/setup_modules_Frontera_vnc.sh @@ -32,14 +32,16 @@ SHARED_PATH="/your/path/to/container/" SIF_NAME="$HOME/my-training.sif" if [ ! -f $SIF_NAME ]; then - if [ ! -f $SHARED_PATH/$SIF_NAME ]; then + if [ ! -f $SHARED_PATH/training_latest.sif ]; then # load the image if no image exists in the shared directory echo "MNMN: pull the appatainer image" - apptainer pull -F docker://seissol/training:hps-2024-frontera + apptainer pull -F docker://seissol/training:latest + echo "MNMN: create symlink to the shared directory" + ln -sf $SHARED_PATH/training_latest.sif $SIF_NAME else # create symlink to the shared directory echo "MNMN: create symlink to the shared directory" - ln -s $SHARED_PATH/$SIF_NAME $SIF_NAME + ln -sf $SHARED_PATH/training_latest.sif $SIF_NAME fi fi From 9382432dc741fb744b91b5a42f2cf1c7be070daf Mon Sep 17 00:00:00 2001 From: Thomas-Ulrich Date: Wed, 22 May 2024 16:53:43 +0200 Subject: [PATCH 3/3] address Sebastian's review --- frontera.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/frontera.md b/frontera.md index 5305b7d..a60175c 100644 --- a/frontera.md +++ b/frontera.md @@ -8,22 +8,21 @@ Then get an interactive session on a compute node. For example for a 30 min sess idev -m 30 -N 1 --tasks-per-node 2 -p development ``` -Then execute: +You can then pull and use the automatically generated container from the ci workflow: ``` module load tacc-apptainer apptainer pull -F docker://seissol/training:latest -apptainer build -f my-training.sif singularity.def -ln -sf $(realpath my-training.sif) ~/my-training.sif +ln -sf $(realpath latest.sif) ~/my-training.sif apptainer run ~/my-training.sif ``` -You can also use the automatically generated container after pulling the docker container +Alternatively, you can build and use the container with: ``` module load tacc-apptainer -apptainer pull -F docker://seissol/training:latest -ln -sf $(realpath latest.sif) ~/my-training.sif +apptainer build -f my-training.sif singularity.def +ln -sf $(realpath my-training.sif) ~/my-training.sif apptainer run ~/my-training.sif ```