From a65f074fd692b745ad8ab7a35b21e5f235947045 Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Tue, 16 Jan 2024 10:13:02 -0500
Subject: [PATCH 1/7] Add MLCube logic

---
 stable_diffusion/.dockerignore          |  1 +
 stable_diffusion/.gitignore             |  1 +
 stable_diffusion/Dockerfile             | 10 +++--
 stable_diffusion/mlcube/mlcube.yaml     | 37 +++++++++++++++++++
 .../scripts/checkpoints/download_all.sh | 26 +++++++++++++
 .../scripts/datasets/mlcube_data.sh     | 31 ++++++++++++++++
 6 files changed, 103 insertions(+), 3 deletions(-)
 create mode 100644 stable_diffusion/mlcube/mlcube.yaml
 create mode 100644 stable_diffusion/scripts/checkpoints/download_all.sh
 create mode 100755 stable_diffusion/scripts/datasets/mlcube_data.sh

diff --git a/stable_diffusion/.dockerignore b/stable_diffusion/.dockerignore
index 24df5b67d..ce461f82d 100644
--- a/stable_diffusion/.dockerignore
+++ b/stable_diffusion/.dockerignore
@@ -1,2 +1,3 @@
 nogit/
 mlperf_compliance.log
+mlcube/workspace/*
diff --git a/stable_diffusion/.gitignore b/stable_diffusion/.gitignore
index 24df5b67d..ce461f82d 100644
--- a/stable_diffusion/.gitignore
+++ b/stable_diffusion/.gitignore
@@ -1,2 +1,3 @@
 nogit/
 mlperf_compliance.log
+mlcube/workspace/*
diff --git a/stable_diffusion/Dockerfile b/stable_diffusion/Dockerfile
index 8e538bb61..cd17f714c 100644
--- a/stable_diffusion/Dockerfile
+++ b/stable_diffusion/Dockerfile
@@ -7,7 +7,11 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update
 RUN apt-get install -y ffmpeg libsm6 libxext6
 
+# pip dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
 # install LDM
-COPY . /diffusion
-RUN cd /diffusion && \
-    pip install --no-cache-dir -r requirements.txt
+ADD . /diffusion
+RUN chmod +x /diffusion/*.sh
+WORKDIR /diffusion
diff --git a/stable_diffusion/mlcube/mlcube.yaml b/stable_diffusion/mlcube/mlcube.yaml
new file mode 100644
index 000000000..7ac39d09a
--- /dev/null
+++ b/stable_diffusion/mlcube/mlcube.yaml
@@ -0,0 +1,37 @@
+name: stable_diffusion
+description: Stable Diffusion benchmark
+authors:
+  - { name: "MLCommons Best Practices Working Group" }
+
+platform:
+  accelerator_count: 1
+
+docker:
+  # Image name.
+  image: mlcommons/stable_diffusion:0.0.1
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  download_data:
+    entrypoint: ./scripts/datasets/mlcube_data.sh -a
+    parameters:
+      outputs:
+        laion_output_dir: data_laion/
+        coco_output_dir: data_coco/
+  download_models:
+    entrypoint: ./scripts/checkpoints/download_all.sh -a
+    parameters:
+      outputs:
+        output_sd: checkpoints/sd
+        output_inception: checkpoints/inception
+        output_clip: checkpoints/clip
+  train:
+    entrypoint: ./run_and_time.sh --gpus-per-node=1
+    parameters:
+      inputs:
+        checkpoint: checkpoints/sd/512-base-ema.ckpt
+      outputs:
+        results_dir: results/
diff --git a/stable_diffusion/scripts/checkpoints/download_all.sh b/stable_diffusion/scripts/checkpoints/download_all.sh
new file mode 100644
index 000000000..fa503396d
--- /dev/null
+++ b/stable_diffusion/scripts/checkpoints/download_all.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+: "${OUTPUT_SD:=/checkpoints/sd}"
+: "${OUTPUT_INCEPTION:=/checkpoints/inception}"
+: "${OUTPUT_CLIP:=/checkpoints/clip}"
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --output_sd=*)
+    OUTPUT_SD="${1#*=}"
+    ;;
+  --output_inception=*)
+    OUTPUT_INCEPTION="${1#*=}"
+    ;;
+  --output_clip=*)
+    OUTPUT_CLIP="${1#*=}"
+    ;;
+  *) ;;
+  esac
+  shift
+done
+
+cd "$(dirname "$0")"
+download_sd.sh --output-dir $OUTPUT_SD
+download_inception.sh --output-dir $OUTPUT_INCEPTION
+download_clip.sh --output-dir $OUTPUT_CLIP
diff --git a/stable_diffusion/scripts/datasets/mlcube_data.sh b/stable_diffusion/scripts/datasets/mlcube_data.sh
new file mode 100755
index 000000000..7ef550a82
--- /dev/null
+++ b/stable_diffusion/scripts/datasets/mlcube_data.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+: "${LAION_OUTPUT_DIR:=/datasets/laion-400m/webdataset-moments-filtered}"
+: "${COCO_OUTPUT_DIR:=/datasets/coco2014}"
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --laion_output_dir=*)
+    LAION_OUTPUT_DIR="${1#*=}"
+    ;;
+  --coco_output_dir=*)
+    COCO_OUTPUT_DIR="${1#*=}"
+    ;;
+  *) ;;
+  esac
+  shift
+done
+
+mkdir -p ${LAION_OUTPUT_DIR}
+cd ${LAION_OUTPUT_DIR}
+
+
+for i in {00000..00831}; do wget -O ${LAION_OUTPUT_DIR}/${i}.tar -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=${i}.tar"; done
+
+wget -O ${LAION_OUTPUT_DIR}/sha512sums.txt -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=sha512sums.txt"
+
+sha512sum --quiet -c sha512sums.txt
+
+mkdir -p ${COCO_OUTPUT_DIR}
+wget -O ${COCO_OUTPUT_DIR}/val2014_30k.tsv -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/coco2014&files=val2014_30k.tsv"
+wget -O ${COCO_OUTPUT_DIR}/val2014_30k_stats.npz -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/coco2014&files=val2014_30k_stats.npz"

From d471135b22c95682c3d79e426b3f1218a0132949 Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 26 Jan 2024 12:07:30 -0500
Subject: [PATCH 2/7] Add stable diffusion demo

---
 stable_diffusion/Dockerfile              |   5 +
 stable_diffusion/configs/train_demo.yaml | 153 ++++++++++++++++++
 stable_diffusion/main.py                 |   2 -
 stable_diffusion/mlcube/mlcube.yaml      |  23 ++-
 stable_diffusion/run_demo.sh             |  96 +++++++++++
 .../scripts/checkpoints/download_all.sh  |   6 +-
 .../scripts/datasets/mlcube_data.sh      |   2 +-
 .../scripts/datasets/mlcube_demo_data.sh |  20 +++
 8 files changed, 295 insertions(+), 12 deletions(-)
 create mode 100644 stable_diffusion/configs/train_demo.yaml
 create mode 100644 stable_diffusion/run_demo.sh
 mode change 100644 => 100755 stable_diffusion/scripts/checkpoints/download_all.sh
 create mode 100755 stable_diffusion/scripts/datasets/mlcube_demo_data.sh

diff --git a/stable_diffusion/Dockerfile b/stable_diffusion/Dockerfile
index cd17f714c..55f2d5e94 100644
--- a/stable_diffusion/Dockerfile
+++ b/stable_diffusion/Dockerfile
@@ -10,6 +10,11 @@ RUN apt-get install -y ffmpeg libsm6 libxext6
 # pip dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install pytorch_lightning==1.9.0
+RUN pip uninstall opencv-python==4.7.0.72 -y
+RUN rm -rf /usr/local/lib/python3.8/dist-packages/cv2/
+RUN pip install opencv-python==4.8.0.74
+RUN pip install httpx==0.24.1
 
 # install LDM
 ADD . /diffusion
diff --git a/stable_diffusion/configs/train_demo.yaml b/stable_diffusion/configs/train_demo.yaml
new file mode 100644
index 000000000..73de87d28
--- /dev/null
+++ b/stable_diffusion/configs/train_demo.yaml
@@ -0,0 +1,153 @@
+model:
+  base_learning_rate: 1.25e-7
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: npy
+    first_stage_type: moments
+    cond_stage_key: txt
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: steps
+    scale_factor: 0.18215
+    use_ema: False
+
+    load_vae: True
+    load_unet: False
+    load_encoder: True
+
+    validation_config:
+      sampler: "ddim" # plms, ddim, dpm
+      steps: 50
+      scale: 8.0
+      ddim_eta: 0.0
+      prompt_key: "caption"
+      image_fname_key: "image_id"
+
+      save_images:
+        enabled: True
+        base_output_dir: "/results/inference"
+      fid:
+        enabled: True
+        inception_weights_url: https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth
+        cache_dir: /checkpoints/inception
+        gt_path: /datasets/coco2014/val2014_512x512_30k_stats.npz
+      clip:
+        enabled: True
+        clip_version: "ViT-H-14"
+        cache_dir: /checkpoints/clip
+
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 100 ]
+        cycle_lengths: [ 100 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: False # gradient checkpointing
+        use_fp16: True
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        arch: "ViT-H-14"
+        version: "laion2b_s32b_b79k"
+        freeze: True
+        layer: "penultimate"
+        cache_dir: /checkpoints/clip
+
+data:
+  target: ldm.data.composable_data_module.ComposableDataModule
+  params:
+    train:
+      target: ldm.data.webdatasets.build_dataloader
+      params:
+        urls: /datasets/laion-400m/webdataset-moments-filtered/{00000..00831}.tar
+        batch_size: 8
+        shuffle: 1000
+        partial: False
+        keep_only_keys: ["npy", "txt"]
+        num_workers: 4
+        persistent_workers: True
+
+    validation:
+      target: ldm.data.tsv.build_dataloader
+      params:
+        annotations_file: "/datasets/coco2014/val2014_30k.tsv"
+        keys: ["image_id", "id", "caption"]
+        batch_size: 8
+        shuffle: False
+        num_workers: 1
+
+lightning:
+  trainer:
+    accelerator: 'gpu'
+    num_nodes: 1
+    devices: 8
+    precision: 16
+    logger: False
+    log_every_n_steps: 1
+    enable_progress_bar: False
+    max_epochs: 2
+    max_steps: 5
+    val_check_interval: 1
+    enable_checkpointing: True
+    num_sanity_val_steps: 0
+    strategy:
+      target: strategies.DDPStrategy
+      params:
+        find_unused_parameters: False
+
+  modelcheckpoint:
+    target: lightning.pytorch.callbacks.ModelCheckpoint
+    params:
+      save_top_k: 1
+      every_n_train_steps: 1
diff --git a/stable_diffusion/main.py b/stable_diffusion/main.py
index d7e984f2e..b1f245d93 100644
--- a/stable_diffusion/main.py
+++ b/stable_diffusion/main.py
@@ -703,7 +703,5 @@ def divein(*args, **kwargs):
             dst = os.path.join(dst, "debug_runs", name)
             os.makedirs(os.path.split(dst)[0], exist_ok=True)
             os.rename(logdir, dst)
-            if trainer.global_rank == 0:
-                print(trainer.profiler.summary())
 
     mllogger.event(mllog_constants.STATUS, value=status)
diff --git a/stable_diffusion/mlcube/mlcube.yaml b/stable_diffusion/mlcube/mlcube.yaml
index 7ac39d09a..4c11a2ea9 100644
--- a/stable_diffusion/mlcube/mlcube.yaml
+++ b/stable_diffusion/mlcube/mlcube.yaml
@@ -13,6 +13,8 @@ docker:
   # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
   build_context: "../"
   # Docker file name within docker build context, default is `Dockerfile`.
   build_file: "Dockerfile"
+  # GPU arguments
+  gpu_args: "--gpus=all"
 
 tasks:
   download_data:
     entrypoint: ./scripts/datasets/mlcube_data.sh -a
     parameters:
       outputs:
         laion_output_dir: data_laion/
         coco_output_dir: data_coco/
+  download_demo:
+    entrypoint: ./scripts/datasets/mlcube_demo_data.sh -a
+    parameters:
+      outputs:
+        demo_output_dir: demo_data/
   download_models:
     entrypoint: ./scripts/checkpoints/download_all.sh -a
     parameters:
       outputs:
-        output_sd: checkpoints/sd
-        output_inception: checkpoints/inception
-        output_clip: checkpoints/clip
-  train:
-    entrypoint: ./run_and_time.sh --gpus-per-node=1
+        output_sd: checkpoints/sd/
+        output_inception: checkpoints/inception/
+        output_clip: checkpoints/clip/
+  demo:
+    entrypoint: ./run_demo.sh --gpus-per-node=1
     parameters:
       inputs:
-        checkpoint: checkpoints/sd/512-base-ema.ckpt
+        checkpoint_sd: checkpoints/sd/512-base-ema.ckpt
+        checkpoint_clip: checkpoints/clip/
+        checkpoint_inception: checkpoints/inception/
+        coco_dir: demo_data/data_coco/
+        laion_dir: demo_data/data_laion/
       outputs:
         results_dir: results/
diff --git a/stable_diffusion/run_demo.sh b/stable_diffusion/run_demo.sh
new file mode 100644
index 000000000..3180a6c9a
--- /dev/null
+++ b/stable_diffusion/run_demo.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+
+: "${NUM_NODES:=1}"
+: "${GPUS_PER_NODE:=8}"
+: "${CHECKPOINT_SD:=/checkpoints/sd/512-base-ema.ckpt}"
+: "${CHECKPOINT_CLIP:=/checkpoints/clip/}"
+: "${CHECKPOINT_INCEPTION:=/checkpoints/inception/}"
+: "${COCO_DIR:=data_coco/}"
+: "${LAION_DIR:=data_laion/}"
+: "${RESULTS_DIR:=/results}"
+: "${CONFIG:=./configs/train_demo.yaml}"
+
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --num-nodes=*)
+    NUM_NODES="${1#*=}"
+    ;;
+  --gpus-per-node=*)
+    GPUS_PER_NODE="${1#*=}"
+    ;;
+  --checkpoint_sd=*)
+    CHECKPOINT_SD="${1#*=}"
+    ;;
+  --checkpoint_clip=*)
+    CHECKPOINT_CLIP="${1#*=}"
+    ;;
+  --checkpoint_inception=*)
+    CHECKPOINT_INCEPTION="${1#*=}"
+    ;;
+  --coco_dir=*)
+    COCO_DIR="${1#*=}"
+    ;;
+  --laion_dir=*)
+    LAION_DIR="${1#*=}"
+    ;;
+  --results_dir=*)
+    RESULTS_DIR="${1#*=}"
+    ;;
+  --config=*)
+    CONFIG="${1#*=}"
+    ;;
+  *) ;;
+  esac
+  shift
+done
+
+set -e
+
+mkdir -p /checkpoints/clip
+ln -s $CHECKPOINT_CLIP/* /checkpoints/clip
+
+mkdir -p /datasets/coco2014
+ln -s $COCO_DIR/* /datasets/coco2014
+
+sed -i "s=/datasets/laion-400m/webdataset-moments-filtered/{00000..00831}.tar=$LAION_DIR/{00000..00003}.tar=g" $CONFIG
+sed -i "s=/datasets/coco2014/val2014_512x512_30k_stats.npz=$COCO_DIR/val2014_30k_stats.npz=g" $CONFIG
+sed -i "s=/results/inference=$RESULTS_DIR/=g" $CONFIG
+sed -i "s=/checkpoints/clip=$CHECKPOINT_CLIP/=g" $CONFIG
+sed -i "s=/checkpoints/clip=$CHECKPOINT_CLIP/=g" $CONFIG
+sed -i "s=/checkpoints/inception=$CHECKPOINT_INCEPTION/=g" $CONFIG
+
+export HF_DATASETS_OFFLINE=0
+export TRANSFORMERS_OFFLINE=0
+export DIFFUSERS_OFFLINE=0
+export HF_HOME=/hf_home
+
+start=$(date +%s)
+start_fmt=$(date +%Y-%m-%d\ %r)
+echo "STARTING TIMING RUN AT $start_fmt"
+
+# CLEAR YOUR CACHE HERE
+python -c "
+from mlperf_logging.mllog import constants
+from mlperf_logging_utils import mllogger
+mllogger.event(key=constants.CACHE_CLEAR, value=True)"
+
+python main.py \
+    lightning.trainer.num_nodes=${NUM_NODES} \
+    lightning.trainer.devices=${GPUS_PER_NODE} \
+    -m train \
+    --validation False \
+    --ckpt ${CHECKPOINT_SD} \
+    --logdir ${RESULTS_DIR} \
+    -b ${CONFIG}
+
+# end timing
+end=$(date +%s)
+end_fmt=$(date +%Y-%m-%d\ %r)
+echo "ENDING TIMING RUN AT $end_fmt"
+
+# runtime
+runtime=$(( $end - $start ))
+result_name="stable_diffusion"
+
+echo "RESULT,$result_name,$runtime,$USER,$start_fmt"
diff --git a/stable_diffusion/scripts/checkpoints/download_all.sh b/stable_diffusion/scripts/checkpoints/download_all.sh
old mode 100644
new mode 100755
index fa503396d..7f9019e1e
--- a/stable_diffusion/scripts/checkpoints/download_all.sh
+++ b/stable_diffusion/scripts/checkpoints/download_all.sh
@@ -21,6 +21,6 @@ while [ $# -gt 0 ]; do
 done
 
 cd "$(dirname "$0")"
-download_sd.sh --output-dir $OUTPUT_SD
-download_inception.sh --output-dir $OUTPUT_INCEPTION
-download_clip.sh --output-dir $OUTPUT_CLIP
+bash download_sd.sh --output-dir $OUTPUT_SD
+bash download_inception.sh --output-dir $OUTPUT_INCEPTION
+bash download_clip.sh --output-dir $OUTPUT_CLIP
diff --git a/stable_diffusion/scripts/datasets/mlcube_data.sh b/stable_diffusion/scripts/datasets/mlcube_data.sh
index 7ef550a82..0dae461d3 100755
--- a/stable_diffusion/scripts/datasets/mlcube_data.sh
+++ b/stable_diffusion/scripts/datasets/mlcube_data.sh
@@ -20,7 +20,7 @@ mkdir -p ${LAION_OUTPUT_DIR}
 cd ${LAION_OUTPUT_DIR}
 
 
-for i in {00000..00831}; do wget -O ${LAION_OUTPUT_DIR}/${i}.tar -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=${i}.tar"; done
+for i in {00000..00003}; do wget -O ${LAION_OUTPUT_DIR}/${i}.tar -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=${i}.tar"; done
 
 wget -O ${LAION_OUTPUT_DIR}/sha512sums.txt -c "https://cloud.mlcommons.org/index.php/s/training_stable_diffusion/download?path=/datasets/laion-400m/moments-webdataset-filtered&files=sha512sums.txt"
 
diff --git a/stable_diffusion/scripts/datasets/mlcube_demo_data.sh b/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
new file mode 100755
index 000000000..65b6b703f
--- /dev/null
+++ b/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+: "${DEMO_OUTPUT_DIR:=/demo_data}"
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --demo_output_dir=*)
+    DEMO_OUTPUT_DIR="${1#*=}"
+    ;;
+  *) ;;
+  esac
+  shift
+done
+
+mkdir -p ${DEMO_OUTPUT_DIR}
+cd ${DEMO_OUTPUT_DIR}
+
+wget -O demo_data.zip -c https://storage.googleapis.com/mlperf_training_demo/stable_diffusion/demo_data.zip
+unzip -o demo_data.zip
+rm demo_data.zip

From 838e5ada42a4a2f12e21338c257d44f96905727f Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 26 Jan 2024 14:19:18 -0500
Subject: [PATCH 3/7] Fix demo hyperparameters

---
 stable_diffusion/configs/train_demo.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/stable_diffusion/configs/train_demo.yaml b/stable_diffusion/configs/train_demo.yaml
index 73de87d28..4f131d0b0 100644
--- a/stable_diffusion/configs/train_demo.yaml
+++ b/stable_diffusion/configs/train_demo.yaml
@@ -25,7 +25,7 @@ model:
 
     validation_config:
       sampler: "ddim" # plms, ddim, dpm
-      steps: 50
+      steps: 10
       scale: 8.0
       ddim_eta: 0.0
       prompt_key: "caption"
@@ -137,7 +137,7 @@ lightning:
     log_every_n_steps: 1
     enable_progress_bar: False
     max_epochs: 2
-    max_steps: 5
+    max_steps: 2
     val_check_interval: 1
     enable_checkpointing: True
     num_sanity_val_steps: 0

From a6e37f945354c7d262bd71d577cf2216019c84a5 Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 16 Feb 2024 17:13:04 -0500
Subject: [PATCH 4/7] Fix demo and add single GPU config

---
 stable_diffusion/configs/train_01x01.yaml | 153 ++++++++++++++++++++++
 stable_diffusion/configs/train_demo.yaml  |   2 +-
 stable_diffusion/mlcube/README.md         |  50 +++++++
 stable_diffusion/mlcube/mlcube.yaml       |  14 +-
 stable_diffusion/run_demo.sh              |   1 -
 stable_diffusion/run_train_mlcube.sh      |  95 ++++++++++++++
 6 files changed, 312 insertions(+), 3 deletions(-)
 create mode 100644 stable_diffusion/configs/train_01x01.yaml
 create mode 100644 stable_diffusion/mlcube/README.md
 create mode 100755 stable_diffusion/run_train_mlcube.sh

diff --git a/stable_diffusion/configs/train_01x01.yaml b/stable_diffusion/configs/train_01x01.yaml
new file mode 100644
index 000000000..67a528b00
--- /dev/null
+++ b/stable_diffusion/configs/train_01x01.yaml
@@ -0,0 +1,153 @@
+model:
+  base_learning_rate: 1.25e-7
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: npy
+    first_stage_type: moments
+    cond_stage_key: txt
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: steps
+    scale_factor: 0.18215
+    use_ema: False
+
+    load_vae: True
+    load_unet: False
+    load_encoder: True
+
+    validation_config:
+      sampler: "ddim" # plms, ddim, dpm
+      steps: 50
+      scale: 8.0
+      ddim_eta: 0.0
+      prompt_key: "caption"
+      image_fname_key: "image_id"
+
+      save_images:
+        enabled: False
+        base_output_dir: "/results/inference"
+      fid:
+        enabled: True
+        inception_weights_url: https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth
+        cache_dir: /checkpoints/inception
+        gt_path: /datasets/coco2014/val2014_512x512_30k_stats.npz
+      clip:
+        enabled: True
+        clip_version: "ViT-H-14"
+        cache_dir: /checkpoints/clip
+
+    scheduler_config:
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 1000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: False # gradient checkpointing
+        use_fp16: True
+        image_size: 32
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        arch: "ViT-H-14"
+        version: "laion2b_s32b_b79k"
+        freeze: True
+        layer: "penultimate"
+        cache_dir: /checkpoints/clip
+
+data:
+  target: ldm.data.composable_data_module.ComposableDataModule
+  params:
+    train:
+      target: ldm.data.webdatasets.build_dataloader
+      params:
+        urls: /datasets/laion-400m/webdataset-moments-filtered/{00000..00831}.tar
+        batch_size: 8
+        shuffle: 1000
+        partial: False
+        keep_only_keys: ["npy", "txt"]
+        num_workers: 4
+        persistent_workers: True
+
+    validation:
+      target: ldm.data.tsv.build_dataloader
+      params:
+        annotations_file: "/datasets/coco2014/val2014_30k.tsv"
+        keys: ["image_id", "id", "caption"]
+        batch_size: 8
+        shuffle: False
+        num_workers: 1
+
+lightning:
+  trainer:
+    accelerator: 'gpu'
+    num_nodes: 1
+    devices: 8
+    precision: 16
+    logger: False
+    log_every_n_steps: 10
+    enable_progress_bar: False
+    max_epochs: 1
+    max_steps: 1
+    val_check_interval: 1
+    enable_checkpointing: True
+    num_sanity_val_steps: 0
+    strategy:
+      target: strategies.DDPStrategy
+      params:
+        find_unused_parameters: False
+
+  modelcheckpoint:
+    target: lightning.pytorch.callbacks.ModelCheckpoint
+    params:
+      save_top_k: -1
+      every_n_train_steps: 1000000000000
diff --git a/stable_diffusion/configs/train_demo.yaml b/stable_diffusion/configs/train_demo.yaml
index 4f131d0b0..9fa18226c 100644
--- a/stable_diffusion/configs/train_demo.yaml
+++ b/stable_diffusion/configs/train_demo.yaml
@@ -134,7 +134,7 @@ lightning:
     devices: 8
     precision: 16
     logger: False
-    log_every_n_steps: 1
+    log_every_n_steps: 2
     enable_progress_bar: False
     max_epochs: 2
     max_steps: 2
diff --git a/stable_diffusion/mlcube/README.md b/stable_diffusion/mlcube/README.md
new file mode 100644
index 000000000..91e014b51
--- /dev/null
+++ b/stable_diffusion/mlcube/README.md
@@ -0,0 +1,50 @@
+# MLCube for Stable Diffusion
+
+MLCube™ GitHub [repository](https://github.com/mlcommons/mlcube). MLCube™ [wiki](https://mlcommons.github.io/mlcube/).
+
+## Project setup
+
+An important requirement is that you must have Docker installed.
+
+```bash
+# Create Python environment and install MLCube Docker runner
+virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker
+# Fetch the implementation from GitHub
+git clone https://github.com/mlcommons/training && cd ./training
+git fetch origin pull/696/head:feature/mlcube_sd && git checkout feature/mlcube_sd
+cd ./stable_diffusion/mlcube
+```
+
+Inside the mlcube directory run the following command to check implemented tasks.
+
+```shell
+mlcube describe
+```
+
+### MLCube tasks
+
+Download dataset.
+
+```shell
+mlcube run --task=download_demo
+```
+
+Process dataset.
+
+```shell
+mlcube run --task=download_models
+```
+
+Train SSD.
+
+```shell
+mlcube run --task=demo
+```
+
+### Execute the complete pipeline
+
+You can execute the complete pipeline with one single command.
+
+```shell
+mlcube run --task=download_demo,download_models,demo
+```
\ No newline at end of file
diff --git a/stable_diffusion/mlcube/mlcube.yaml b/stable_diffusion/mlcube/mlcube.yaml
index 4c11a2ea9..3dced0893 100644
--- a/stable_diffusion/mlcube/mlcube.yaml
+++ b/stable_diffusion/mlcube/mlcube.yaml
@@ -8,7 +8,7 @@ platform:
 
 docker:
   # Image name.
-  image: mlcommons/stable_diffusion:0.0.1
+  image: dfjbtest/stable_diffusion:0.0.1
   # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
   build_context: "../"
   # Docker file name within docker build context, default is `Dockerfile`.
@@ -46,3 +46,15 @@ tasks:
         laion_dir: demo_data/data_laion/
       outputs:
         results_dir: results/
+  train:
+    entrypoint: ./run_train_mlcube.sh --gpus-per-node=1
+    parameters:
+      inputs:
+        checkpoint_sd: checkpoints/sd/512-base-ema.ckpt
+        checkpoint_clip: checkpoints/clip/
+        checkpoint_inception: checkpoints/inception/
+        coco_dir: coco2014/
+        laion_dir: laion-400m/moments-webdataset-filtered/
+      outputs:
+        results_dir: results/
+
diff --git a/stable_diffusion/run_demo.sh b/stable_diffusion/run_demo.sh
index 3180a6c9a..4b8838196 100644
--- a/stable_diffusion/run_demo.sh
+++ b/stable_diffusion/run_demo.sh
@@ -57,7 +57,6 @@ sed -i "s=/datasets/laion-400m/webdataset-moments-filtered/{00000..00831}.tar=$L
 sed -i "s=/datasets/coco2014/val2014_512x512_30k_stats.npz=$COCO_DIR/val2014_30k_stats.npz=g" $CONFIG
 sed -i "s=/results/inference=$RESULTS_DIR/=g" $CONFIG
 sed -i "s=/checkpoints/clip=$CHECKPOINT_CLIP/=g" $CONFIG
-sed -i "s=/checkpoints/clip=$CHECKPOINT_CLIP/=g" $CONFIG
 sed -i "s=/checkpoints/inception=$CHECKPOINT_INCEPTION/=g" $CONFIG
 
 export HF_DATASETS_OFFLINE=0
diff --git a/stable_diffusion/run_train_mlcube.sh b/stable_diffusion/run_train_mlcube.sh
new file mode 100755
index 000000000..0195e2417
--- /dev/null
+++ b/stable_diffusion/run_train_mlcube.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+: "${NUM_NODES:=1}"
+: "${GPUS_PER_NODE:=8}"
+: "${CHECKPOINT_SD:=/checkpoints/sd/512-base-ema.ckpt}"
+: "${CHECKPOINT_CLIP:=/checkpoints/clip/}"
+: "${CHECKPOINT_INCEPTION:=/checkpoints/inception/}"
+: "${COCO_DIR:=data_coco/}"
+: "${LAION_DIR:=data_laion/}"
+: "${RESULTS_DIR:=/results}"
+: "${CONFIG:=./configs/train_01x01.yaml}"
+
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+  --num-nodes=*)
+    NUM_NODES="${1#*=}"
+    ;;
+  --gpus-per-node=*)
+    GPUS_PER_NODE="${1#*=}"
+    ;;
+  --checkpoint_sd=*)
+    CHECKPOINT_SD="${1#*=}"
+    ;;
+  --checkpoint_clip=*)
+    CHECKPOINT_CLIP="${1#*=}"
+    ;;
+  --checkpoint_inception=*)
+    CHECKPOINT_INCEPTION="${1#*=}"
+    ;;
+  --coco_dir=*)
+    COCO_DIR="${1#*=}"
+    ;;
+  --laion_dir=*)
+    LAION_DIR="${1#*=}"
+    ;;
+  --results_dir=*)
+    RESULTS_DIR="${1#*=}"
+    ;;
+  --config=*)
+    CONFIG="${1#*=}"
+    ;;
+  *) ;;
+  esac
+  shift
+done
+
+set -e
+
+mkdir -p /checkpoints/clip
+ln -s $CHECKPOINT_CLIP/* /checkpoints/clip
+
+mkdir -p /datasets/coco2014
+ln -s $COCO_DIR/* /datasets/coco2014
+
+sed -i "s=/datasets/laion-400m/webdataset-moments-filtered/{00000..00831}.tar=$LAION_DIR/{00000..00003}.tar=g" $CONFIG
+sed -i "s=/datasets/coco2014/val2014_512x512_30k_stats.npz=$COCO_DIR/val2014_30k_stats.npz=g" $CONFIG
+sed -i "s=/results/inference=$RESULTS_DIR/=g" $CONFIG
+sed -i "s=/checkpoints/clip=$CHECKPOINT_CLIP/=g" $CONFIG
"s=/checkpoints/inception=$CHECKPOINT_INCEPTION/=g" $CONFIG + +export HF_DATASETS_OFFLINE=0 +export TRANSFORMERS_OFFLINE=0 +export DIFFUSERS_OFFLINE=0 +export HF_HOME=/hf_home + +start=$(date +%s) +start_fmt=$(date +%Y-%m-%d\ %r) +echo "STARTING TIMING RUN AT $start_fmt" + +# CLEAR YOUR CACHE HERE +python -c " +from mlperf_logging.mllog import constants +from mlperf_logging_utils import mllogger +mllogger.event(key=constants.CACHE_CLEAR, value=True)" + +python main.py \ + lightning.trainer.num_nodes=${NUM_NODES} \ + lightning.trainer.devices=${GPUS_PER_NODE} \ + -m train \ + --validation False \ + --ckpt ${CHECKPOINT_SD} \ + --logdir ${RESULTS_DIR} \ + -b ${CONFIG} + +# end timing +end=$(date +%s) +end_fmt=$(date +%Y-%m-%d\ %r) +echo "ENDING TIMING RUN AT $end_fmt" + +# runtime +runtime=$(( $end - $start )) +result_name="stable_diffusion" + +echo "RESULT,$result_name,$runtime,$USER,$start_fmt" From 51eb693d55b530c873c54a18dcd4a790017ea1cb Mon Sep 17 00:00:00 2001 From: David Jurado Date: Thu, 22 Feb 2024 10:36:45 -0500 Subject: [PATCH 5/7] Fix readme --- stable_diffusion/mlcube/README.md | 38 ++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/stable_diffusion/mlcube/README.md b/stable_diffusion/mlcube/README.md index 91e014b51..33c6669e1 100644 --- a/stable_diffusion/mlcube/README.md +++ b/stable_diffusion/mlcube/README.md @@ -23,19 +23,41 @@ mlcube describe ### MLCube tasks +* Core tasks: + Download dataset. +```shell +mlcube run --task=download_data +``` + +Download models. + +```shell +mlcube run --task=download_models +``` + +Train. + +```shell +mlcube run --task=train +``` + +* Demo tasks: + +Download demo dataset. + ```shell mlcube run --task=download_demo ``` -Process dataset. +Download models. ```shell mlcube run --task=download_models ``` -Train SSD. +Train demo. ```shell mlcube run --task=demo @@ -45,6 +67,16 @@ mlcube run --task=demo You can execute the complete pipeline with one single command. +* Core pipeline: + +```shell +mlcube run --task=download_data,download_models,train +``` + +* Demo pipeline: + ```shell mlcube run --task=download_demo,download_models,demo -``` \ No newline at end of file +``` + +**Note**: To rebuild the image use the flag: `-Pdocker.build_strategy=always` during the `mlcube run` command. 
From 5744352bb107166a89e4c18ad76009aa42df7e6e Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 26 Jul 2024 09:28:06 -0500
Subject: [PATCH 6/7] update demo download link

---
 stable_diffusion/scripts/datasets/mlcube_demo_data.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stable_diffusion/scripts/datasets/mlcube_demo_data.sh b/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
index 65b6b703f..574a6963f 100755
--- a/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
+++ b/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
@@ -15,6 +15,6 @@ done
 mkdir -p ${DEMO_OUTPUT_DIR}
 cd ${DEMO_OUTPUT_DIR}
 
-wget -O demo_data.zip -c https://storage.googleapis.com/mlperf_training_demo/stable_diffusion/demo_data.zip
-unzip -o demo_data.zip
-rm demo_data.zip
+wget -O demo_data.zip -c https://mlcube.mlcommons-storage.org/minibenchmarks/stable_diffusion.zip
+unzip -o stable_diffusion.zip
+rm stable_diffusion.zip

From f1e62b4ed99d00bcfc3d00021075e45f5c9495a1 Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 10 Jan 2025 11:00:08 -0500
Subject: [PATCH 7/7] test dependencies

---
 stable_diffusion/Dockerfile              | 18 ++++++++++++++----
 stable_diffusion/requirements.txt        |  5 +++--
 .../scripts/datasets/mlcube_demo_data.sh |  4 ++--
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/stable_diffusion/Dockerfile b/stable_diffusion/Dockerfile
index 55f2d5e94..864a59927 100644
--- a/stable_diffusion/Dockerfile
+++ b/stable_diffusion/Dockerfile
@@ -1,11 +1,15 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.12-py3
+ARG FROM_IMAGE_NAME=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel
 FROM ${FROM_IMAGE_NAME}
 
 ENV DEBIAN_FRONTEND=noninteractive
 
 # apt dependencies
-RUN apt-get update
-RUN apt-get install -y ffmpeg libsm6 libxext6
+RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
+RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+RUN apt-get update && apt-get install -y
+RUN apt-get install -y ffmpeg libsm6 libxext6 git wget unzip \
+    build-essential \
+    libomp-dev
 
 # pip dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 RUN pip install pytorch_lightning==1.9.0
 RUN pip uninstall opencv-python==4.7.0.72 -y
 RUN rm -rf /usr/local/lib/python3.8/dist-packages/cv2/
 RUN pip install opencv-python==4.8.0.74
+RUN pip install numpy==1.26.4
 RUN pip install httpx==0.24.1
+# RUN pip install --upgrade pip setuptools wheel
+RUN pip install fastapi==0.115.6
+RUN pip install starlette==0.45.2
+RUN pip install jinja2==2.11.3
+RUN pip install triton==3.1.0
 
 # install LDM
 ADD . /diffusion
 RUN chmod +x /diffusion/*.sh
-WORKDIR /diffusion
+WORKDIR /diffusion
\ No newline at end of file
diff --git a/stable_diffusion/requirements.txt b/stable_diffusion/requirements.txt
index 26802b5ff..03a5338f8 100644
--- a/stable_diffusion/requirements.txt
+++ b/stable_diffusion/requirements.txt
@@ -20,6 +20,7 @@ colossalai==0.2.7
 invisible-watermark==0.1.5
 diffusers==0.14.0
 cloudpathlib==0.13.0
-git+https://github.com/facebookresearch/xformers.git@5eb0dbf315d14b5f7b38ac2ff3d8379beca7df9b#egg=xformers
+#git+https://github.com/facebookresearch/xformers.git@5eb0dbf315d14b5f7b38ac2ff3d8379beca7df9b#egg=xformers
+xformers==0.0.16
 bitsandbytes==0.37.2
-git+https://github.com/mlcommons/logging.git@8405a08bbfc724f8888c419461c02d55a6ac960c
+git+https://github.com/mlcommons/logging.git@8405a08bbfc724f8888c419461c02d55a6ac960c
\ No newline at end of file
diff --git a/stable_diffusion/scripts/datasets/mlcube_demo_data.sh b/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
index 574a6963f..65283325c 100755
--- a/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
+++ b/stable_diffusion/scripts/datasets/mlcube_demo_data.sh
@@ -16,5 +16,5 @@ mkdir -p ${DEMO_OUTPUT_DIR}
 cd ${DEMO_OUTPUT_DIR}
 
 wget -O demo_data.zip -c https://mlcube.mlcommons-storage.org/minibenchmarks/stable_diffusion.zip
-unzip -o stable_diffusion.zip
-rm stable_diffusion.zip
+unzip -o demo_data.zip
+rm demo_data.zip
\ No newline at end of file
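With the final patch applied, the demo pipeline is runnable end to end. A typical invocation from the `mlcube` directory, assuming Docker and the MLCube runner are installed as described in the README added in PATCH 4; the rebuild flag comes from that README's note and forces the image to pick up the dependency pins from PATCH 7.

```bash
cd stable_diffusion/mlcube
# Rebuild the image so the pinned dependencies take effect,
# then run the demo tasks in sequence.
mlcube run --task=download_demo,download_models,demo -Pdocker.build_strategy=always
```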