Skip to content

Commit

Permalink
add new grid scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
mieskolainen committed Oct 23, 2024
1 parent 4448a21 commit 98b23c2
Show file tree
Hide file tree
Showing 10 changed files with 180 additions and 17 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/icenet-install-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,12 @@ jobs:
run: |
source setenv-github-actions.sh && maxevents=10000; source tests/runme_brem_reweight.sh
echo "yes" | source superclean.sh
#
- name: Deep Learning system integration test (zee 1)
run: |
source setenv-github-actions.sh && maxevents=100; source tests/runme_zee_gridtune.sh
echo "yes" | source superclean.sh
#- name: Deep Learning system integration test (zee 1)
# run: |
# source setenv-github-actions.sh && maxevents=100; source tests/runme_zee_gridtune.sh
# echo "yes" | source superclean.sh

#
- name: Deep Learning system integration test (zee 2)
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
*.zip
*.text
*.log
*.out
*.output
#*.json
#*.txt

Expand Down
2 changes: 1 addition & 1 deletion icenet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import psutil

__version__ = '0.1.3.2'
__version__ = '0.1.3.3'
__release__ = 'alpha'
__date__ = '21/10/2024'
__author__ = '[email protected]'
Expand Down
16 changes: 8 additions & 8 deletions icenet/tools/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,18 +224,18 @@ def read_config(config_path='configs/xyz/', runmode='all'):
hash_args = {}

# Critical Python file content
files = {'inputvars': os.path.join(cwd, config_path, f'{args["inputvars"]}.py'),
'cuts': os.path.join(cwd, config_path, f'cuts.py'),
'filter': os.path.join(cwd, config_path, f'filter.py'),
'common': os.path.join(cwd, args['rootname'], f'common.py')}
files = {'inputvars': os.path.join(cwd, config_path, f'{args["inputvars"]}.py'),
'cuts': os.path.join(cwd, config_path, f'cuts.py'),
'filter': os.path.join(cwd, config_path, f'filter.py'),
'common': os.path.join(cwd, 'ice' + args['rootname'], f'common.py')}

for key in files.keys():
if os.path.exists(files[key]):
print(f"Cache introspection for the file: '{files[key]}'")
hash_args[f'__hash__{key}'] = io.make_hash_sha256_file(files[key])
print(f"Cache introspection for the file: '{files[key]}' [done]", 'green')
else:
print(f"Did not find: {files[key]} [may cause crash if your application depends on it]", 'red')

print(f"Cache introspection did not find: {files[key]} [may cause crash if your application depends on it]", 'red')
# Genesis parameters as the first one
hash_args.update(old_args['genesis_runmode'])

Expand Down
31 changes: 28 additions & 3 deletions tests/runme_zee_gridtune.sh
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,25 @@ fi
echo "DATAPATH is set to $DATAPATH"
echo "CONFIG is set to $CONFIG"

# -----------------------------------------------------------------------
# Initialization

# Ensure that GRID_ID and GRID_NODES are set to special values

# Initialization-only pass: taken when GRID_ID is -1 and GRID_NODES is 1.
# It runs the genesis, train and eval stages once under run_id "INIT" and
# then returns, so nothing further down in this file is executed.
# NOTE(review): $MAX, $CONFIG and $DATAPATH are set outside this block;
# $MAX is left unquoted, presumably so it can expand to zero or several
# words -- confirm against its definition.
if [[ $GRID_ID == -1 && $GRID_NODES == 1 ]]; then

# Stage 1: genesis
python analysis/zee.py --runmode genesis $MAX --config ${CONFIG}.yml --datapath $DATAPATH

# Stage 2: train, tagged GRIDTUNE / INIT
python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
--modeltag GRIDTUNE --run_id "INIT" --compute 0

# Stage 3: eval, same tags as the training stage
python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
--modeltag GRIDTUNE --run_id "INIT" --compute 0

# This file is sourced by its callers, so `exit` here would terminate the
# calling shell; `return` only leaves this file. The status is always 0,
# whatever the three python commands above returned.
return 0 # do not use exit
fi


# -----------------------------------------------------------------------
# Generic functions

Expand Down Expand Up @@ -182,8 +201,14 @@ echo ""

# 4. Run
python analysis/zee.py --runmode genesis $MAX --config ${CONFIG}.yml --datapath $DATAPATH
python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --supertune "${SUPERTUNE}" # Note " "
python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "minloss" --supertune "models.iceboost_swd.readmode=-1"
python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH --modeltag GRIDTUNE --run_id $RUN_ID --evaltag "last" --supertune "models.iceboost_swd.readmode=-2"

python analysis/zee.py --runmode train $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
--modeltag GRIDTUNE --run_id $RUN_ID --supertune "${SUPERTUNE}" # Note " "

python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
--modeltag GRIDTUNE --run_id $RUN_ID --evaltag "minloss" --supertune "models.iceboost_swd.readmode=-1"

python analysis/zee.py --runmode eval $MAX --config ${CONFIG}.yml --datapath $DATAPATH \
--modeltag GRIDTUNE --run_id $RUN_ID --evaltag "last" --supertune "models.iceboost_swd.readmode=-2"

done
13 changes: 13 additions & 0 deletions tests/zee/gridtune.dag
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Grid tuning job with init structure using Condor DAGMan
#
# Submit with:
#   condor_submit_dag gridtune.dag
#
# Filename: gridtune.dag
#
# NOTE: DAGMan only treats a line as a comment when '#' is its first
# character, so every comment below sits on its own line. Trailing text
# after the submit file name on a JOB line is parsed as extra arguments.
# This file is read by DAGMan, not by a shell, so it has no shebang.

# First init job
JOB A gridtune_init.job

# Array job
JOB B gridtune_array.job

# Make B depend on A finishing successfully
PARENT A CHILD B
14 changes: 14 additions & 0 deletions tests/zee/gridtune_array.job
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Array job
#
# HTCondor submit description that queues 4 processes of gridtune_task.sh,
# each requesting one GPU.

executable = gridtune_task.sh
# Positional arguments handed to gridtune_task.sh, which exports them as
# HTC_PROCESS_ID, HTC_QUEUE_SIZE and HTC_CLUSTER_ID (in that order).
# The literal 4 is the queue size and must be kept in sync with `queue 4`.
arguments = "$(PROCESS) 4 $(ClusterId)"
# Per-process files. Note the naming: stderr goes to *.out and stdout
# goes to *.output.
error = gridtune_array.$(CLUSTER).$(PROCESS).out
output = gridtune_array.$(CLUSTER).$(PROCESS).output
log = gridtune_array.$(CLUSTER).$(PROCESS).log
request_gpus = 1
request_memory = 80G
# Only match machines whose advertised GPU device name is exactly the
# string below; the V100 alternative is kept commented out.
#requirements = TARGET.GPUs_DeviceName =?= "Tesla V100-PCIE-32GB"
requirements = TARGET.GPUs_DeviceName =?= "Tesla P100-PCIE-12GB"
# Custom job attribute; presumably a wall-time limit in seconds enforced
# by the pool's policy -- TODO confirm with the site configuration.
+MaxRuntime = 86000

queue 4
13 changes: 13 additions & 0 deletions tests/zee/gridtune_init.job
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Initialization job
#
# HTCondor submit description that queues a single process of
# gridtune_task.sh, requesting one GPU.

executable = gridtune_task.sh
# Positional arguments handed to gridtune_task.sh, which exports them as
# HTC_PROCESS_ID, HTC_QUEUE_SIZE and HTC_CLUSTER_ID (in that order).
# The pair "-1 1" presumably ends up as GRID_ID=-1 / GRID_NODES=1, which
# selects the init-only branch of tests/runme_zee_gridtune.sh -- the
# mapping itself lives in icenet/setenv.sh; confirm there.
arguments = "-1 1 $(ClusterId)"
# Note the naming: stderr goes to *.out and stdout goes to *.output.
error = gridtune_init.$(CLUSTER).out
output = gridtune_init.$(CLUSTER).output
log = gridtune_init.$(CLUSTER).log
request_gpus = 1
request_memory = 80G
# Only match machines whose advertised GPU device name is exactly the
# string below; the V100 alternative is kept commented out.
#requirements = TARGET.GPUs_DeviceName =?= "Tesla V100-PCIE-32GB"
requirements = TARGET.GPUs_DeviceName =?= "Tesla P100-PCIE-12GB"
# Custom job attribute; presumably a wall-time limit in seconds enforced
# by the pool's policy -- TODO confirm with the site configuration.
+MaxRuntime = 86000
queue
34 changes: 34 additions & 0 deletions tests/zee/gridtune_task.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
#
# GPU grid tuning task
#
# Executable of the HTCondor submit files gridtune_init.job and
# gridtune_array.job.
#
# Arguments:
#   $1 - process index within the cluster (exported as HTC_PROCESS_ID)
#   $2 - number of queued processes       (exported as HTC_QUEUE_SIZE)
#   $3 - cluster id                       (exported as HTC_CLUSTER_ID)
#
# Bash is required: this script uses `source` and `conda activate`, and the
# sourced tests/runme_zee_gridtune.sh uses [[ ]].

echo "Grid tuning job started"
pwd

ICEPATH="/vols/cms/mmieskol/icenet"

# ** icenet/setenv.sh uses these **
export HTC_PROCESS_ID="$1"
export HTC_QUEUE_SIZE="$2"
export HTC_CLUSTER_ID="$3"

# Init conda
source /home/hep/mmieskol/setconda.sh
conda activate icenet

# Init icenet
mkdir -p -- "${ICEPATH}/tmp"
# Everything below uses paths relative to the icenet checkout, so stop
# here rather than run the workflow from the wrong directory.
cd -- "${ICEPATH}" || {
  echo "Error: cannot change directory to ${ICEPATH}" >&2
  exit 1
}
source "${ICEPATH}/setenv.sh"

# Execute
# DATAPATH, CONFIG and maxevents are plain shell variables read by the
# sourced script below.
DATAPATH="/vols/cms/pfk18/phd/hgg/Jul23/NN21July/N/validations/outputs/Csplit_Jsamp/files"
CONFIG="tune0_EB"
maxevents=150000
source "${ICEPATH}/tests/runme_zee_gridtune.sh"

# Create the done file when the job completes
# (written regardless of the status returned by the sourced script)
donefile="${ICEPATH}/tmp/icenet_${HTC_CLUSTER_ID}_${HTC_PROCESS_ID}.done"
touch -- "${donefile}"

echo "Task done, created file: ${donefile}"
62 changes: 62 additions & 0 deletions tests/zee/submit.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Condor submission with first init job and then
# an array job once that is finished
#
# Emulating DAGMan without using it.
#
# Run with: source submit.sh
#
# Requires bash ([[ ]] is used). Because the script is meant to be sourced,
# it leaves via `return` so that a failure does not close the caller's
# interactive shell; the `|| exit` fallback covers `bash submit.sh`.
# Returns/exits 0 after the array job has been handed to condor_submit,
# 1 if the init job could not be submitted or did not reach "Completed".
#
# [email protected], 2024

ICEPATH="/vols/cms/mmieskol/icenet"

TASK_SCRIPT="gridtune_task.sh"
INIT_JOB="gridtune_init.job"
ARRAY_JOB="gridtune_array.job"
PERIOD=15

# Submit the first job
echo "Submitting init job"
# condor_submit reports "... submitted to cluster <id>."; int() strips the
# trailing dot from the sixth field.
FIRST_JOB_ID=$(condor_submit "$INIT_JOB" | awk '/submitted to cluster/ {print int($6)}')
echo " "
cat "$INIT_JOB"

# Check if job submission was successful
if [[ -z "$FIRST_JOB_ID" ]]; then
  echo "Error: Failed to submit the first job" >&2
  return 1 2>/dev/null || exit 1
fi

sleep 5

echo "First job with ID = ${FIRST_JOB_ID}"
echo "Waiting first job to finish"

while true; do

  # Check if the job is still in the queue
  job_status=$(condor_q "$FIRST_JOB_ID" -format "%d" JobStatus 2>/dev/null)

  # If condor_q returns nothing, check condor_history
  if [[ -z "$job_status" ]]; then
    # Job is no longer in the queue, check the history
    job_status=$(condor_history "$FIRST_JOB_ID" -limit 1 -format "%d" JobStatus 2>/dev/null)

    # JobStatus 4 means "Completed". Compared as a string so that an empty
    # answer from condor_history is handled without a test(1) error.
    if [[ "$job_status" == "4" ]]; then
      echo "Job completed successfully."
      break
    fi

    echo "Job is no longer running but didn't finish as expected (status: ${job_status:-unknown}) -- exit" >&2
    return 1 2>/dev/null || exit 1
  fi

  # Otherwise, job is still in the queue, and we can wait
  echo "Job is still running (status: $job_status). Checking again in ${PERIOD} seconds..."
  sleep "$PERIOD"
done

# Submit the array job
echo "Submitting array job"
condor_submit "$ARRAY_JOB"
echo " "
cat "$ARRAY_JOB"

0 comments on commit 98b23c2

Please sign in to comment.