Skip to content

Commit

Permalink
Merge branch 'master' of github.com:mvsjober/ml-benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
mvsjober committed Sep 15, 2023
2 parents cbdeb27 + 3ac94f5 commit 4a536e8
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
/logs
__pycache__
/lightning_logs
/results-*.out
35 changes: 35 additions & 0 deletions benchmarks/pytorch_visionmodel_ddp.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,22 @@ def train(args):

total_step = args.steps if args.steps is not None else len(train_loader)

if args.mlflow and verbose:
import mlflow
mlflow.set_tracking_uri(args.mlflow)

experiment_name = os.path.basename(__file__)
exp = mlflow.get_experiment_by_name(experiment_name)
if exp is None:
exp_id = mlflow.create_experiment(experiment_name)
else:
exp_id = exp.experiment_id

mlflow.start_run(run_name=os.getenv("SLURM_JOB_ID"), experiment_id=exp_id)

print(f"MLflow tracking to {mlflow.get_tracking_uri()}")
mlflow.log_params(vars(args))

# For each block of printed steps
last_start = datetime.now()
last_images = 0
Expand Down Expand Up @@ -153,6 +169,14 @@ def train(args):
now = datetime.now()
last_secs = (now-last_start).total_seconds()

if args.mlflow:
mlflow.log_metrics({
"epoch": epoch+1,
"step": i+1,
"loss": loss.item(),
"images/sec": last_images*world_size/last_secs
})

print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], '
f'Loss: {loss.item():.4f}, '
f'Images/sec: {last_images*world_size/last_secs:.2f} '
Expand All @@ -161,6 +185,16 @@ def train(args):
last_start = now
last_images = 0

# if args.mlflow:
# cp_fname = 'model_checkpoint.pt'
# torch.save({
# 'epoch': epoch+1,
# 'steps': i+1,
# 'model_state_dict': model.state_dict(),
# 'optimizer_state_dict': optimizer.state_dict()
# }, cp_fname)
# mlflow.log_artifact(cp_fname, artifact_path='checkpoints')

if args.steps is not None and tot_steps >= args.steps:
break

Expand Down Expand Up @@ -203,6 +237,7 @@ def main():
parser.add_argument('--print-steps', type=int, default=100)
parser.add_argument('--warmup-steps', type=int, default=10,
help='Number of initial steps to ignore in average')
parser.add_argument('--mlflow', nargs='?', type=str, const='./mlruns')
parser.add_argument('--fp16', action='store_true', default=False,
help='enable mixed precision')
args = parser.parse_args()
Expand Down
2 changes: 1 addition & 1 deletion run-all-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ JID_DDP_FP16_TWONODES=$JID
#### PyTorch DDP Lightning - synthetic data

# PyTorch DDP Lightning, single GPU
do_sbatch slurm/${CLUSTER}-gpu1.sh pytorch-ddp-lightning.sh --steps=1000
do_sbatch --partition=$GPUSMALL -t 30 slurm/${CLUSTER}-gpu1.sh pytorch-ddp-lightning.sh --steps=1000
JID_DDPL_GPU1=$JID

# PyTorch DDP, full node
Expand Down
10 changes: 10 additions & 0 deletions slurm/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,18 @@ else
fi
echo "NUM_GPUS=$NUM_GPUS"

PUHTI_GPUENERGY=/appl/soft/ai/bin/gpu-energy
if [ -x $PUHTI_GPUENERGY ]; then
$PUHTI_GPUENERGY &
monitor_pid=$!
fi

source $SCRIPT $*

if [ ! -z $monitor_pid ]; then
kill -SIGUSR1 $monitor_pid
fi

(set -x
date
)

0 comments on commit 4a536e8

Please sign in to comment.