From e238dd724ac62554d5bc487fc29820227705de80 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 23 Jul 2021 07:57:30 -0500 Subject: [PATCH 01/14] Add download data task to MLCube --- object_detection/.dockerignore | 2 + object_detection/.gitignore | 1 + object_detection/Dockerfile.mlcube | 31 ++++ object_detection/README.md | 136 ++++++------------ object_detection/README_OLD.md | 95 ++++++++++++ object_detection/download_dataset.sh | 25 ++-- object_detection/mlcube.py | 62 ++++++++ object_detection/mlcube/mlcube.yaml | 32 +++++ object_detection/mlcube/mlcube_cli.py | 70 +++++++++ .../mlcube/workspace/parameters.yaml | 0 10 files changed, 351 insertions(+), 103 deletions(-) create mode 100644 object_detection/.dockerignore create mode 100644 object_detection/.gitignore create mode 100644 object_detection/Dockerfile.mlcube create mode 100644 object_detection/README_OLD.md create mode 100644 object_detection/mlcube.py create mode 100644 object_detection/mlcube/mlcube.yaml create mode 100644 object_detection/mlcube/mlcube_cli.py create mode 100644 object_detection/mlcube/workspace/parameters.yaml diff --git a/object_detection/.dockerignore b/object_detection/.dockerignore new file mode 100644 index 000000000..3d19cfc1d --- /dev/null +++ b/object_detection/.dockerignore @@ -0,0 +1,2 @@ +# Do not add MLCube's workspace directory. +mlcube/ \ No newline at end of file diff --git a/object_detection/.gitignore b/object_detection/.gitignore new file mode 100644 index 000000000..eda0ebdf5 --- /dev/null +++ b/object_detection/.gitignore @@ -0,0 +1 @@ +mlcube/workspace/data \ No newline at end of file diff --git a/object_detection/Dockerfile.mlcube b/object_detection/Dockerfile.mlcube new file mode 100644 index 000000000..b277b60ab --- /dev/null +++ b/object_detection/Dockerfile.mlcube @@ -0,0 +1,31 @@ +FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections + +# install basics +RUN apt-get update -y \ + && apt-get install -y apt-utils curl unzip pv \ + libglib2.0-0=2.56.1-2ubuntu1 \ + libsm6=2:1.2.2-1 \ + libxext6=2:1.3.3-1 \ + libxrender-dev=1:0.9.10-1 + +RUN pip install ninja==1.8.2.post2 \ + yacs==0.1.5 \ + cython==0.29.5 \ + matplotlib==3.0.2 \ + opencv-python==4.0.0.21 \ + mlperf_compliance==0.0.10 \ + torchvision==0.2.2 \ + pycocotools==2.0.2 \ + typer==0.3.2 + +RUN pip install --no-cache-dir https://github.com/mlperf/logging/archive/9ea0afa.zip + +WORKDIR /workspace + +COPY . . + +RUN chmod +x ./run_and_time.sh ./download_dataset.sh + +ENTRYPOINT ["python", "/workspace/mlcube.py"] \ No newline at end of file diff --git a/object_detection/README.md b/object_detection/README.md index 6895735de..8d40ef0c6 100644 --- a/object_detection/README.md +++ b/object_detection/README.md @@ -1,95 +1,41 @@ -# 1. Problem -Object detection and segmentation. Metrics are mask and box mAP. - -# 2. Directions - -### Steps to configure machine - -1. Checkout the MLPerf repository -``` -mkdir -p mlperf -cd mlperf -git clone https://github.com/mlperf/training.git -``` -2. Install CUDA and Docker -``` -source training/install_cuda_docker.sh -``` -3. Build the docker image for the object detection task -``` -cd training/object_detection/ -nvidia-docker build . -t mlperf/object_detection -``` - -4. 
Run docker container and install code -``` -nvidia-docker run -v .:/workspace -t -i --rm --ipc=host mlperf/object_detection \ - "cd mlperf/training/object_detection && ./install.sh" -``` -Now exit the docker container (Ctrl-D) to get back to your host. - -### Steps to download data -``` -# From training/object_detection/ -source download_dataset.sh -``` - -### Steps to run benchmark. -``` -nvidia-docker run -v .:/workspace -t -i --rm --ipc=host mlperf/object_detection \ - "cd mlperf/training/object_detection && ./run_and_time.sh" -``` - -# 3. Dataset/Environment -### Publication/Attribution -Microsoft COCO: Common Objects in Context - -### Data preprocessing -Only horizontal flips are allowed. - -### Training and test data separation -As provided by MS-COCO (2017 version). - -### Training data order -Randomly. - -### Test data order -Any order. - -# 4. Model -### Publication/Attribution -He, Kaiming, et al. "Mask r-cnn." Computer Vision (ICCV), 2017 IEEE International Conference on. -IEEE, 2017. - -We use a version of Mask R-CNN with a ResNet50 backbone. - -### List of layers -Running the timing script will display a list of layers. - -### Weight and bias initialization -The ResNet50 base must be loaded from the provided weights. They may be quantized. - -### Loss function -Multi-task loss (classification, box, mask). Described in the Mask R-CNN paper. - -Classification: Smooth L1 loss - -Box: Log loss for true class. - -Mask: per-pixel sigmoid, average binary cross-entropy loss. - -### Optimizer -Momentum SGD. Weight decay of 0.0001, momentum of 0.9. - -# 5. Quality -### Quality metric -As Mask R-CNN can provide both boxes and masks, we evaluate on both box and mask mAP. - -### Quality target -Box mAP of 0.377, mask mAP of 0.339 - -### Evaluation frequency -Once per epoch, 118k. - -### Evaluation thoroughness -Evaluate over the entire validation set. Use the official COCO API to compute mAP. +## Current implementation + +We'll be updating this section as we merge MLCube PRs and make new MLCube releases. + +### Project setup +```Python +# Create Python environment +virtualenv -p python3 ./env && source ./env/bin/activate + +# Install MLCube and MLCube docker runner from GitHub repository (normally, users will just run `pip install mlcube mlcube_docker`) +git clone https://github.com/sergey-serebryakov/mlbox.git && cd mlbox && git checkout feature/configV2 +cd ./runners/mlcube_docker && export PYTHONPATH=$(pwd) +cd ../../ && pip install -r mlcube/requirements.txt && pip install omegaconf && cd ../ + +# Fetch the RNN speech recognition workload +git clone https://github.com/mlcommons/training && cd ./training +git fetch origin pull/491/head:feature/object_detection && git checkout feature/object_detection +cd ./object_detection/mlcube +``` + +### Dataset + + +The COCO dataset will be downloaded and extracted. Sizes of the dataset in each step: + +| Dataset Step | MLCube Task | Format | Size | +|--------------------------------|-------------------|----------------|----------| +| Download (Compressed dataset) | download_data | Tar/Zip files | ~20.5 GB | +| Extract (Uncompressed dataset) | download_data | Jpg/Json files | ~21.2 GB | +| Total | (After all tasks) | All | ~41.7 GB | + +### Tasks execution +``` +# Download COCO dataset. Default path = /workspace/data +# To override it, use --data_dir=DATA_DIR +python mlcube_cli.py run --task download_data --platform docker + +# Run benchmark. 
Default paths = ./workspace/data +# Parameters to override: --data_dir=DATA_DIR, --output_dir=OUTPUT_DIR, --parameters_file=PATH_TO_TRAINING_PARAMS +python mlcube_cli.py run --task train --platform docker +``` \ No newline at end of file diff --git a/object_detection/README_OLD.md b/object_detection/README_OLD.md new file mode 100644 index 000000000..6895735de --- /dev/null +++ b/object_detection/README_OLD.md @@ -0,0 +1,95 @@ +# 1. Problem +Object detection and segmentation. Metrics are mask and box mAP. + +# 2. Directions + +### Steps to configure machine + +1. Checkout the MLPerf repository +``` +mkdir -p mlperf +cd mlperf +git clone https://github.com/mlperf/training.git +``` +2. Install CUDA and Docker +``` +source training/install_cuda_docker.sh +``` +3. Build the docker image for the object detection task +``` +cd training/object_detection/ +nvidia-docker build . -t mlperf/object_detection +``` + +4. Run docker container and install code +``` +nvidia-docker run -v .:/workspace -t -i --rm --ipc=host mlperf/object_detection \ + "cd mlperf/training/object_detection && ./install.sh" +``` +Now exit the docker container (Ctrl-D) to get back to your host. + +### Steps to download data +``` +# From training/object_detection/ +source download_dataset.sh +``` + +### Steps to run benchmark. +``` +nvidia-docker run -v .:/workspace -t -i --rm --ipc=host mlperf/object_detection \ + "cd mlperf/training/object_detection && ./run_and_time.sh" +``` + +# 3. Dataset/Environment +### Publication/Attribution +Microsoft COCO: Common Objects in Context + +### Data preprocessing +Only horizontal flips are allowed. + +### Training and test data separation +As provided by MS-COCO (2017 version). + +### Training data order +Randomly. + +### Test data order +Any order. + +# 4. Model +### Publication/Attribution +He, Kaiming, et al. "Mask r-cnn." Computer Vision (ICCV), 2017 IEEE International Conference on. +IEEE, 2017. + +We use a version of Mask R-CNN with a ResNet50 backbone. + +### List of layers +Running the timing script will display a list of layers. + +### Weight and bias initialization +The ResNet50 base must be loaded from the provided weights. They may be quantized. + +### Loss function +Multi-task loss (classification, box, mask). Described in the Mask R-CNN paper. + +Classification: Smooth L1 loss + +Box: Log loss for true class. + +Mask: per-pixel sigmoid, average binary cross-entropy loss. + +### Optimizer +Momentum SGD. Weight decay of 0.0001, momentum of 0.9. + +# 5. Quality +### Quality metric +As Mask R-CNN can provide both boxes and masks, we evaluate on both box and mask mAP. + +### Quality target +Box mAP of 0.377, mask mAP of 0.339 + +### Evaluation frequency +Once per epoch, 118k. + +### Evaluation thoroughness +Evaluate over the entire validation set. Use the official COCO API to compute mAP. diff --git a/object_detection/download_dataset.sh b/object_detection/download_dataset.sh index e9fe2656d..b05d180fc 100755 --- a/object_detection/download_dataset.sh +++ b/object_detection/download_dataset.sh @@ -1,20 +1,29 @@ #!/bin/bash # Get COCO 2017 data sets -mkdir -p pytorch/datasets/coco -pushd pytorch/datasets/coco +DATA_ROOT_DIR="${DATA_ROOT_DIR:-./pytorch/datasets/coco}" +echo "Downloaading to folder: $DATA_ROOT_DIR" +mkdir -p $DATA_ROOT_DIR +pushd $DATA_ROOT_DIR curl -O https://dl.fbaipublicfiles.com/detectron/coco/coco_annotations_minival.tgz -tar xzf coco_annotations_minival.tgz +echo "Extracting coco_annotations_minival.tgz ..." 
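+# tar's output is discarded below; for the larger zip archives, extraction is
+# piped through `pv`, using an entry count taken from `unzip -l`, so that
+# progress can be reported without flooding the log.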
+tar -xzf coco_annotations_minival.tgz &>/dev/null -curl -O http://images.cocodataset.org/zips/train2017.zip -unzip train2017.zip +curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip +echo "Extracting annotations_trainval2017.zip ..." +n_files=`unzip -l annotations_trainval2017.zip| grep .json | wc -l` +unzip annotations_trainval2017.zip | pv -l -s $n_files > /dev/null curl -O http://images.cocodataset.org/zips/val2017.zip -unzip val2017.zip +echo "Extracting val2017.zip ..." +n_files=`unzip -l val2017.zip| grep .jpg | wc -l` +unzip val2017.zip | pv -l -s $n_files > /dev/null -curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip -unzip annotations_trainval2017.zip +curl -O http://images.cocodataset.org/zips/train2017.zip +echo "Extracting train2017.zip ..." +n_files=`unzip -l train2017.zip| grep .jpg | wc -l` +unzip train2017.zip | pv -l -s $n_files > /dev/null # TBD: MD5 verification # $md5sum *.zip *.tgz diff --git a/object_detection/mlcube.py b/object_detection/mlcube.py new file mode 100644 index 000000000..379818b72 --- /dev/null +++ b/object_detection/mlcube.py @@ -0,0 +1,62 @@ +"""MLCube handler file""" +import os +import yaml +import typer +import shutil +import subprocess +from pathlib import Path + + +app = typer.Typer() + +class DownloadDataTask(object): + """Download task Class + It defines the environment variables: + DATA_ROOT_DIR: Directory path to download the dataset + Then executes the download script""" + @staticmethod + def run(data_dir: str) -> None: + + env = os.environ.copy() + env.update({ + 'DATA_ROOT_DIR': data_dir, + }) + + process = subprocess.Popen("./download_dataset.sh", cwd=".", env=env) + process.wait() + +class TrainTask(object): + """Preprocess dataset task Class + It defines the environment variables: + DATA_DIR: Dataset directory path + OUTPUT_DIR: Directory path where model will be saved + All other parameters defined in parameters_file + Then executes the benchmark script""" + @staticmethod + def run(data_dir: str, output_dir: str, parameters_file: str) -> None: + with open(parameters_file, 'r') as stream: + parameters = yaml.safe_load(stream) + + env = os.environ.copy() + env.update({ + 'DATA_DIR': data_dir, + 'OUTPUT_DIR': output_dir, + }) + + env.update(parameters) + + process = subprocess.Popen("./run_and_time.sh", cwd=".", env=env) + process.wait() + +@app.command("download_data") +def download_data(data_dir: str = typer.Option(..., '--data_dir')): + DownloadDataTask.run(data_dir) + +@app.command("train") +def train(data_dir: str = typer.Option(..., '--data_dir'), + output_dir: str = typer.Option(..., '--output_dir'), + parameters_file: str = typer.Option(..., '--parameters_file')): + TrainTask.run(data_dir, output_dir, parameters_file) + +if __name__ == '__main__': + app() \ No newline at end of file diff --git a/object_detection/mlcube/mlcube.yaml b/object_detection/mlcube/mlcube.yaml new file mode 100644 index 000000000..eed17a04f --- /dev/null +++ b/object_detection/mlcube/mlcube.yaml @@ -0,0 +1,32 @@ +name: Object Detection +description: MLCommons Object Detection Training Reference Benchmark +authors: + - {name: "MLCommons Best Practices Working Group"} + +platform: + # Edit this according to your system specs + accelerator_count: 1 + +container: + # Image name. + image: mlcommons/object_detection:0.0.1 + # Docker build context relative to $MLCUBE_ROOT. Default is `build`. + build_context: "../" + # Docker file name within docker build context, default is `Dockerfile`. 
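+  # Building from one level above this mlcube folder lets `COPY . .` in
+  # Dockerfile.mlcube pick up run_and_time.sh, download_dataset.sh and the
+  # pytorch/ sources.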
+ build_file: "Dockerfile.mlcube" + +tasks: + # Download LibriSpeech dataset + download_data: + io: + # Directory for uncompressed datasets. Total size is ~ 65G. + - {name: data_dir, type: directory, io: output, default: $WORKSPACE/data} + train: + # Train Object Detection model + io: + # see Download::data_dir, it takes the processes subfolder + - {name: data_dir, type: directory, io: input, default: $WORKSPACE/data/LibriSpeech} + # Model output folder + - {name: output_dir, type: directory, io: output, default: $WORKSPACE/output} + # Yaml file with training parameters. + - {name: parameters_file, type: file, io: input, default: $WORKSPACE/parameters.yaml} \ No newline at end of file diff --git a/object_detection/mlcube/mlcube_cli.py b/object_detection/mlcube/mlcube_cli.py new file mode 100644 index 000000000..5df6a4bff --- /dev/null +++ b/object_detection/mlcube/mlcube_cli.py @@ -0,0 +1,70 @@ +""" +This requires the MLCube 2.0 configuration +""" +import os +import yaml +import click +import typing +from mlcube_docker.docker_run import DockerRun + + +def load_config(mlcube_config_path: str, user_config_path: str) -> typing.Dict: + """Returns dictionary containing MLCube configuration""" + # Load mlcube config data + try: + with open(mlcube_config_path) as stream: + mlcube_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) + except IOError as exc: + # If file doesn't exist throw the exception: + # OSError: {PATH_TO}/mnist/mlcube.yaml: No such file or directory + raise IOError("%s: %s" % (mlcube_config_path, exc.strerror)) + + # Load user config data if file exists + if os.path.isfile(user_config_path): + with open(user_config_path) as stream: + user_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) + else: + return mlcube_config_data + + # Merge config data + tmp = mlcube_config_data['container'] + mlcube_config_data['container'] = user_config_data['container'] + mlcube_config_data['container'].update(tmp) + return mlcube_config_data + + +@click.group(name='mlcube') +def cli(): + pass + + +@cli.command(name='run', help='Run MLCube ML task.', + context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.option('--mlcube', required=False, type=str, help='Path to MLCube directory, default is current.') +@click.option('--platform', required=False, type=str, help='Platform to run MLCube, default is docker/podman.') +@click.option('--task', required=False, type=str, help='MLCube task name to run, default is `main`.') +@click.option('--workspace', required=False, type=str, help='Workspace path, default is `workspace` within ' + 'MLCube folder') +def run(mlcube: str, platform: str, task: str, workspace: str): + mlcube_root = os.path.abspath(mlcube or os.getcwd()) + if os.path.isfile(mlcube_root): + mlcube_root = os.path.dirname(mlcube_root) + + platform = platform or 'docker' + if platform != 'docker': + raise ValueError(f"Only `docker` platform is supported") + + task = task or 'main' + workspace = workspace or os.path.join(mlcube_root, 'workspace') + + mlcube_config_data = load_config( + os.path.join(str(mlcube_root), 'mlcube.yaml'), + os.path.join(os.path.expanduser("~"), '.mlcube.yaml') + ) + + docker_runner = DockerRun(mlcube_config_data, root=mlcube_root, workspace=workspace, task=task) + docker_runner.run() + + +if __name__ == "__main__": + cli() \ No newline at end of file diff --git a/object_detection/mlcube/workspace/parameters.yaml b/object_detection/mlcube/workspace/parameters.yaml new file mode 100644 index 000000000..e69de29bb From 
7f69f94c8263c77e07df2f0a3a7288a76021375d Mon Sep 17 00:00:00 2001 From: David Jurado Date: Mon, 26 Jul 2021 11:58:51 -0500 Subject: [PATCH 02/14] Add unzipping progress --- object_detection/README.md | 4 ++-- object_detection/download_dataset.sh | 18 +++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/object_detection/README.md b/object_detection/README.md index 8d40ef0c6..8bd9eb641 100644 --- a/object_detection/README.md +++ b/object_detection/README.md @@ -12,9 +12,9 @@ git clone https://github.com/sergey-serebryakov/mlbox.git && cd mlbox && git che cd ./runners/mlcube_docker && export PYTHONPATH=$(pwd) cd ../../ && pip install -r mlcube/requirements.txt && pip install omegaconf && cd ../ -# Fetch the RNN speech recognition workload +# Fetch the Object Detection workload git clone https://github.com/mlcommons/training && cd ./training -git fetch origin pull/491/head:feature/object_detection && git checkout feature/object_detection +git fetch origin pull/501/head:feature/object_detection && git checkout feature/object_detection cd ./object_detection/mlcube ``` diff --git a/object_detection/download_dataset.sh b/object_detection/download_dataset.sh index b05d180fc..fcd857443 100755 --- a/object_detection/download_dataset.sh +++ b/object_detection/download_dataset.sh @@ -6,24 +6,28 @@ echo "Downloaading to folder: $DATA_ROOT_DIR" mkdir -p $DATA_ROOT_DIR pushd $DATA_ROOT_DIR +echo "Downloading coco_annotations_minival.tgz:" curl -O https://dl.fbaipublicfiles.com/detectron/coco/coco_annotations_minival.tgz -echo "Extracting coco_annotations_minival.tgz ..." +echo "Extracting coco_annotations_minival.tgz:" tar -xzf coco_annotations_minival.tgz &>/dev/null +echo "Downloading annotations_trainval2017.zip:" curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip -echo "Extracting annotations_trainval2017.zip ..." +echo "Extracting annotations_trainval2017.zip:" n_files=`unzip -l annotations_trainval2017.zip| grep .json | wc -l` -unzip annotations_trainval2017.zip | pv -l -s $n_files > /dev/null +unzip annotations_trainval2017.zip | { I=-1; while read; do printf "Progress: $((++I*100/$n_files))%%\r"; done; echo ""; } +echo "Downloading val2017.zip:" curl -O http://images.cocodataset.org/zips/val2017.zip -echo "Extracting val2017.zip ..." +echo "Extracting val2017.zip:" n_files=`unzip -l val2017.zip| grep .jpg | wc -l` -unzip val2017.zip | pv -l -s $n_files > /dev/null +unzip val2017.zip | { I=-1; while read; do printf "Progress: $((++I*100/$n_files))%%\r"; done; echo ""; } +echo "Downloading train2017.zip:" curl -O http://images.cocodataset.org/zips/train2017.zip -echo "Extracting train2017.zip ..." 
+echo "Extracting train2017.zip:" n_files=`unzip -l train2017.zip| grep .jpg | wc -l` -unzip train2017.zip | pv -l -s $n_files > /dev/null +unzip train2017.zip | { I=-1; while read; do printf "Progress: $((++I*100/$n_files))%%\r"; done; echo ""; } # TBD: MD5 verification # $md5sum *.zip *.tgz From a3bc71c010bb76cea7e169fa546fd6d2af50987b Mon Sep 17 00:00:00 2001 From: David Jurado Date: Wed, 28 Jul 2021 19:46:41 -0500 Subject: [PATCH 03/14] Add train task --- object_detection/.gitignore | 3 ++- object_detection/Dockerfile.mlcube | 6 +++-- object_detection/README.md | 2 -- object_detection/mlcube.py | 7 +++++- object_detection/mlcube/mlcube.yaml | 2 +- .../mlcube/workspace/parameters.yaml | 2 ++ .../maskrcnn_benchmark/engine/trainer.py | 1 + .../pytorch/tools/train_mlperf.py | 4 ++-- object_detection/run_and_time.sh | 24 ++++++++++++++++--- 9 files changed, 39 insertions(+), 12 deletions(-) diff --git a/object_detection/.gitignore b/object_detection/.gitignore index eda0ebdf5..7406dc7e9 100644 --- a/object_detection/.gitignore +++ b/object_detection/.gitignore @@ -1 +1,2 @@ -mlcube/workspace/data \ No newline at end of file +mlcube/workspace/data +mlcube/workspace/output \ No newline at end of file diff --git a/object_detection/Dockerfile.mlcube b/object_detection/Dockerfile.mlcube index b277b60ab..5fe9dca29 100644 --- a/object_detection/Dockerfile.mlcube +++ b/object_detection/Dockerfile.mlcube @@ -4,7 +4,7 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio # install basics RUN apt-get update -y \ - && apt-get install -y apt-utils curl unzip pv \ + && apt-get install -y apt-utils curl unzip \ libglib2.0-0=2.56.1-2ubuntu1 \ libsm6=2:1.2.2-1 \ libxext6=2:1.3.3-1 \ @@ -26,6 +26,8 @@ WORKDIR /workspace COPY . . -RUN chmod +x ./run_and_time.sh ./download_dataset.sh +RUN chmod +x ./run_and_time.sh ./download_dataset.sh ./install.sh + +ENV PYTHONPATH /workspace/pytorch ENTRYPOINT ["python", "/workspace/mlcube.py"] \ No newline at end of file diff --git a/object_detection/README.md b/object_detection/README.md index 8bd9eb641..e303bce26 100644 --- a/object_detection/README.md +++ b/object_detection/README.md @@ -32,10 +32,8 @@ The COCO dataset will be downloaded and extracted. Sizes of the dataset in each ### Tasks execution ``` # Download COCO dataset. Default path = /workspace/data -# To override it, use --data_dir=DATA_DIR python mlcube_cli.py run --task download_data --platform docker # Run benchmark. 
Default paths = ./workspace/data -# Parameters to override: --data_dir=DATA_DIR, --output_dir=OUTPUT_DIR, --parameters_file=PATH_TO_TRAINING_PARAMS python mlcube_cli.py run --task train --platform docker ``` \ No newline at end of file diff --git a/object_detection/mlcube.py b/object_detection/mlcube.py index 379818b72..b3f8c2d03 100644 --- a/object_detection/mlcube.py +++ b/object_detection/mlcube.py @@ -34,6 +34,10 @@ class TrainTask(object): Then executes the benchmark script""" @staticmethod def run(data_dir: str, output_dir: str, parameters_file: str) -> None: + + process = subprocess.Popen("./install.sh", cwd=".") + process.wait() + with open(parameters_file, 'r') as stream: parameters = yaml.safe_load(stream) @@ -43,7 +47,8 @@ def run(data_dir: str, output_dir: str, parameters_file: str) -> None: 'OUTPUT_DIR': output_dir, }) - env.update(parameters) + if parameters is not None: + env.update(parameters) process = subprocess.Popen("./run_and_time.sh", cwd=".", env=env) process.wait() diff --git a/object_detection/mlcube/mlcube.yaml b/object_detection/mlcube/mlcube.yaml index eed17a04f..a921adfec 100644 --- a/object_detection/mlcube/mlcube.yaml +++ b/object_detection/mlcube/mlcube.yaml @@ -25,7 +25,7 @@ tasks: # Train Object Detection model io: # see Download::data_dir, it takes the processes subfolder - - {name: data_dir, type: directory, io: input, default: $WORKSPACE/data/LibriSpeech} + - {name: data_dir, type: directory, io: input, default: $WORKSPACE/data} # Model output folder - {name: output_dir, type: directory, io: output, default: $WORKSPACE/output} # Yaml file with training parameters. diff --git a/object_detection/mlcube/workspace/parameters.yaml b/object_detection/mlcube/workspace/parameters.yaml index e69de29bb..473864770 100644 --- a/object_detection/mlcube/workspace/parameters.yaml +++ b/object_detection/mlcube/workspace/parameters.yaml @@ -0,0 +1,2 @@ +SAVE_CHECKPOINTS: "True" # Instead of False use empty value +SOLVER_MAX_ITER: "100" \ No newline at end of file diff --git a/object_detection/pytorch/maskrcnn_benchmark/engine/trainer.py b/object_detection/pytorch/maskrcnn_benchmark/engine/trainer.py index 860107246..27a20d7d3 100644 --- a/object_detection/pytorch/maskrcnn_benchmark/engine/trainer.py +++ b/object_detection/pytorch/maskrcnn_benchmark/engine/trainer.py @@ -73,6 +73,7 @@ def do_train( model.train() start_training_time = time.time() end = time.time() + early_exit = False for iteration, (images, targets, _) in enumerate(data_loader, start_iter): diff --git a/object_detection/pytorch/tools/train_mlperf.py b/object_detection/pytorch/tools/train_mlperf.py index 061431459..b7f2b100e 100644 --- a/object_detection/pytorch/tools/train_mlperf.py +++ b/object_detection/pytorch/tools/train_mlperf.py @@ -115,7 +115,7 @@ def cast_frozen_bn_to_half(module): cast_frozen_bn_to_half(child) return module -def train(cfg, local_rank, distributed, disable_allreduce_for_logging, random_number_generator): +def train(cfg, local_rank, distributed, random_number_generator): # Model logging log_event(key=constants.GLOBAL_BATCH_SIZE, value=cfg.SOLVER.IMS_PER_BATCH) log_event(key=constants.NUM_IMAGE_CANDIDATES, value=cfg.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN) @@ -295,7 +295,7 @@ def main(): logger.info(config_str) logger.info("Running with config:\n{}".format(cfg)) - model, success = train(cfg, args.local_rank, args.distributed, args.disable_allreduce_for_logging, random_number_generator) + model, success = train(cfg, args.local_rank, args.distributed, random_number_generator) if success 
is not None: if success: diff --git a/object_detection/run_and_time.sh b/object_detection/run_and_time.sh index f8526534b..f7709d19e 100755 --- a/object_detection/run_and_time.sh +++ b/object_detection/run_and_time.sh @@ -1,11 +1,29 @@ #!/bin/bash -# Runs benchmark and reports time to convergence -pushd pytorch +DATA_DIR="${DATA_DIR:-pytorch/datasets/coco}" +DATA_ROOT_TARGET="pytorch/datasets/coco" +OUTPUT_DIR="${OUTPUT_DIR:-}" + +SAVE_CHECKPOINTS="${SAVE_CHECKPOINTS:-}" +SOLVER_MAX_ITER="${SOLVER_MAX_ITER:-40000}" +#Link input data paths +if [ "$DATA_DIR" != "$DATA_ROOT_TARGET" ]; then + mkdir -p $DATA_ROOT_TARGET + ln -s $DATA_DIR/annotations $DATA_ROOT_TARGET/annotations + ln -s $DATA_DIR/train2017 $DATA_ROOT_TARGET/train2017 + ln -s $DATA_DIR/test2017 $DATA_ROOT_TARGET/test2017 + ln -s $DATA_DIR/val2017 $DATA_ROOT_TARGET/val2017 + echo $DATA_ROOT_TARGET + ls -lah $DATA_ROOT_TARGET +fi + +# Runs benchmark and reports time to convergence +pushd pytorch # Single GPU training time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \ - SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" SOLVER.BASE_LR 0.0025 + SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" \ + SOLVER.MAX_ITER $SOLVER_MAX_ITER SOLVER.BASE_LR 0.0025 SAVE_CHECKPOINTS $SAVE_CHECKPOINTS OUTPUT_DIR $OUTPUT_DIR popd From 648c70a92cae2b85c2de407d2549e358bb36c33d Mon Sep 17 00:00:00 2001 From: David Jurado Date: Tue, 3 Aug 2021 12:02:32 -0500 Subject: [PATCH 04/14] specify other parameter files location --- object_detection/README.md | 8 ++++++++ object_detection/mlcube/workspace/parameters.yaml | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/object_detection/README.md b/object_detection/README.md index e303bce26..4768a9161 100644 --- a/object_detection/README.md +++ b/object_detection/README.md @@ -30,7 +30,15 @@ The COCO dataset will be downloaded and extracted. Sizes of the dataset in each | Total | (After all tasks) | All | ~41.7 GB | ### Tasks execution + +Parameters are defined at these files: + +* MLCube user parameters: mlcube/workspace/parameters.yaml +* Project user parameters: pytorch/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml +* Project default parameters: pytorch/maskrcnn_benchmark/config/defaults.py + ``` + # Download COCO dataset. 
Default path = /workspace/data python mlcube_cli.py run --task download_data --platform docker diff --git a/object_detection/mlcube/workspace/parameters.yaml b/object_detection/mlcube/workspace/parameters.yaml index 473864770..11d6035bf 100644 --- a/object_detection/mlcube/workspace/parameters.yaml +++ b/object_detection/mlcube/workspace/parameters.yaml @@ -1,2 +1,5 @@ SAVE_CHECKPOINTS: "True" # Instead of False use empty value -SOLVER_MAX_ITER: "100" \ No newline at end of file +SOLVER_MAX_ITER: "100" + +# Other user parameters are defined at ../../pytorch/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml +# Default parameters are defined at ../../pytorch/maskrcnn_benchmark/config/defaults.py \ No newline at end of file From 16f75fdaacd71009d7391f86583479502dcf4b45 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 6 Aug 2021 10:07:12 -0500 Subject: [PATCH 05/14] Add support to override parameters at command line --- object_detection/mlcube/mlcube_cli.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/object_detection/mlcube/mlcube_cli.py b/object_detection/mlcube/mlcube_cli.py index 5df6a4bff..8c4a6643a 100644 --- a/object_detection/mlcube/mlcube_cli.py +++ b/object_detection/mlcube/mlcube_cli.py @@ -33,6 +33,21 @@ def load_config(mlcube_config_path: str, user_config_path: str) -> typing.Dict: return mlcube_config_data +def override_extra_parameters(ctx, mlcube_config_data, task): + """Get extra paramters from context and override them on mlcube config data dict""" + for input_param in ctx.args: + input_key, input_value = input_param.split("=") + input_key = input_key.replace("--", "") + # Replace main container parameters + for key in mlcube_config_data["container"]: + if key==input_key: + mlcube_config_data["container"][key]=input_value + # Replace io paths in current task + for io in mlcube_config_data["tasks"][task]["io"]: + if io["name"]==input_key: + io["default"]=input_value + + @click.group(name='mlcube') def cli(): pass @@ -45,7 +60,8 @@ def cli(): @click.option('--task', required=False, type=str, help='MLCube task name to run, default is `main`.') @click.option('--workspace', required=False, type=str, help='Workspace path, default is `workspace` within ' 'MLCube folder') -def run(mlcube: str, platform: str, task: str, workspace: str): +@click.pass_context +def run(ctx, mlcube: str, platform: str, task: str, workspace: str): mlcube_root = os.path.abspath(mlcube or os.getcwd()) if os.path.isfile(mlcube_root): mlcube_root = os.path.dirname(mlcube_root) @@ -61,10 +77,10 @@ def run(mlcube: str, platform: str, task: str, workspace: str): os.path.join(str(mlcube_root), 'mlcube.yaml'), os.path.join(os.path.expanduser("~"), '.mlcube.yaml') ) - + override_extra_parameters(ctx, mlcube_config_data, task) docker_runner = DockerRun(mlcube_config_data, root=mlcube_root, workspace=workspace, task=task) docker_runner.run() if __name__ == "__main__": - cli() \ No newline at end of file + cli() From efca6e008aa057cce169e1b251588fba441c271a Mon Sep 17 00:00:00 2001 From: David Jurado Date: Tue, 7 Sep 2021 20:47:04 -0500 Subject: [PATCH 06/14] Update to MLCube config v2.0 --- object_detection/README.md | 33 +++++----- object_detection/mlcube/mlcube.yaml | 17 +++--- object_detection/mlcube/mlcube_cli.py | 86 --------------------------- 3 files changed, 26 insertions(+), 110 deletions(-) delete mode 100644 object_detection/mlcube/mlcube_cli.py diff --git a/object_detection/README.md b/object_detection/README.md index 4768a9161..4ddb4f37c 100644 --- 
a/object_detection/README.md +++ b/object_detection/README.md @@ -1,16 +1,15 @@ -## Current implementation +# Benchmark execution with MLCube -We'll be updating this section as we merge MLCube PRs and make new MLCube releases. +## Project setup -### Project setup -```Python +```bash # Create Python environment virtualenv -p python3 ./env && source ./env/bin/activate # Install MLCube and MLCube docker runner from GitHub repository (normally, users will just run `pip install mlcube mlcube_docker`) -git clone https://github.com/sergey-serebryakov/mlbox.git && cd mlbox && git checkout feature/configV2 -cd ./runners/mlcube_docker && export PYTHONPATH=$(pwd) -cd ../../ && pip install -r mlcube/requirements.txt && pip install omegaconf && cd ../ +git clone https://github.com/mlcommons/mlcube && cd mlcube/mlcube +python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd .. +cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../.. # Fetch the Object Detection workload git clone https://github.com/mlcommons/training && cd ./training @@ -18,8 +17,7 @@ git fetch origin pull/501/head:feature/object_detection && git checkout feature/ cd ./object_detection/mlcube ``` -### Dataset - +## Dataset The COCO dataset will be downloaded and extracted. Sizes of the dataset in each step: @@ -29,7 +27,7 @@ The COCO dataset will be downloaded and extracted. Sizes of the dataset in each | Extract (Uncompressed dataset) | download_data | Jpg/Json files | ~21.2 GB | | Total | (After all tasks) | All | ~41.7 GB | -### Tasks execution +## Tasks execution Parameters are defined at these files: @@ -37,11 +35,16 @@ Parameters are defined at these files: * Project user parameters: pytorch/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml * Project default parameters: pytorch/maskrcnn_benchmark/config/defaults.py -``` - +```bash # Download COCO dataset. Default path = /workspace/data -python mlcube_cli.py run --task download_data --platform docker +mlcube run --task download_data # Run benchmark. Default paths = ./workspace/data -python mlcube_cli.py run --task train --platform docker -``` \ No newline at end of file +mlcube run --task train +``` + +By default MLCube images use pull-type installation, so they should be available on docker hub. If not, try this: + +```bash +mlcube run ... -Pdocker.build_strategy=auto +``` diff --git a/object_detection/mlcube/mlcube.yaml b/object_detection/mlcube/mlcube.yaml index a921adfec..288bc7d15 100644 --- a/object_detection/mlcube/mlcube.yaml +++ b/object_detection/mlcube/mlcube.yaml @@ -7,7 +7,7 @@ platform: # Edit this according to your system specs accelerator_count: 1 -container: +docker: # Image name. image: mlcommons/object_detection:0.0.1 # Docker build context relative to $MLCUBE_ROOT. Default is `build`. @@ -18,15 +18,14 @@ container: tasks: # Download LibriSpeech dataset download_data: - io: + parameters: # Directory for uncompressed datasets. Total size is ~ 65G. - - {name: data_dir, type: directory, io: output, default: $WORKSPACE/data} + outputs: {data_dir: data/} train: # Train Object Detection model - io: - # see Download::data_dir, it takes the processes subfolder - - {name: data_dir, type: directory, io: input, default: $WORKSPACE/data} + parameters: + # data_dir: see Download::data_dir, it takes the processes subfolder + # parameters_file: Yaml file with training parameters. 
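+      # (Editorial assumption: under the MLCube config v2.0 format, these
+      # relative paths resolve against the workspace directory, replacing the
+      # explicit $WORKSPACE prefixes used previously.)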
+ inputs: {data_dir: data/, parameters_file: {type: file, default: parameters.yaml}} # Model output folder - - {name: output_dir, type: directory, io: output, default: $WORKSPACE/output} - # Yaml file with training parameters. - - {name: parameters_file, type: file, io: input, default: $WORKSPACE/parameters.yaml} \ No newline at end of file + outputs: {output_dir: output/} \ No newline at end of file diff --git a/object_detection/mlcube/mlcube_cli.py b/object_detection/mlcube/mlcube_cli.py deleted file mode 100644 index 8c4a6643a..000000000 --- a/object_detection/mlcube/mlcube_cli.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -This requires the MLCube 2.0 configuration -""" -import os -import yaml -import click -import typing -from mlcube_docker.docker_run import DockerRun - - -def load_config(mlcube_config_path: str, user_config_path: str) -> typing.Dict: - """Returns dictionary containing MLCube configuration""" - # Load mlcube config data - try: - with open(mlcube_config_path) as stream: - mlcube_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) - except IOError as exc: - # If file doesn't exist throw the exception: - # OSError: {PATH_TO}/mnist/mlcube.yaml: No such file or directory - raise IOError("%s: %s" % (mlcube_config_path, exc.strerror)) - - # Load user config data if file exists - if os.path.isfile(user_config_path): - with open(user_config_path) as stream: - user_config_data = yaml.load(stream.read(), Loader=yaml.SafeLoader) - else: - return mlcube_config_data - - # Merge config data - tmp = mlcube_config_data['container'] - mlcube_config_data['container'] = user_config_data['container'] - mlcube_config_data['container'].update(tmp) - return mlcube_config_data - - -def override_extra_parameters(ctx, mlcube_config_data, task): - """Get extra paramters from context and override them on mlcube config data dict""" - for input_param in ctx.args: - input_key, input_value = input_param.split("=") - input_key = input_key.replace("--", "") - # Replace main container parameters - for key in mlcube_config_data["container"]: - if key==input_key: - mlcube_config_data["container"][key]=input_value - # Replace io paths in current task - for io in mlcube_config_data["tasks"][task]["io"]: - if io["name"]==input_key: - io["default"]=input_value - - -@click.group(name='mlcube') -def cli(): - pass - - -@cli.command(name='run', help='Run MLCube ML task.', - context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) -@click.option('--mlcube', required=False, type=str, help='Path to MLCube directory, default is current.') -@click.option('--platform', required=False, type=str, help='Platform to run MLCube, default is docker/podman.') -@click.option('--task', required=False, type=str, help='MLCube task name to run, default is `main`.') -@click.option('--workspace', required=False, type=str, help='Workspace path, default is `workspace` within ' - 'MLCube folder') -@click.pass_context -def run(ctx, mlcube: str, platform: str, task: str, workspace: str): - mlcube_root = os.path.abspath(mlcube or os.getcwd()) - if os.path.isfile(mlcube_root): - mlcube_root = os.path.dirname(mlcube_root) - - platform = platform or 'docker' - if platform != 'docker': - raise ValueError(f"Only `docker` platform is supported") - - task = task or 'main' - workspace = workspace or os.path.join(mlcube_root, 'workspace') - - mlcube_config_data = load_config( - os.path.join(str(mlcube_root), 'mlcube.yaml'), - os.path.join(os.path.expanduser("~"), '.mlcube.yaml') - ) - override_extra_parameters(ctx, 
mlcube_config_data, task) - docker_runner = DockerRun(mlcube_config_data, root=mlcube_root, workspace=workspace, task=task) - docker_runner.run() - - -if __name__ == "__main__": - cli() From 9395cfd931491aa722a810179e153e436f85a387 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 22 Jul 2022 10:56:11 -0500 Subject: [PATCH 07/14] Update readme --- object_detection/README.md | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/object_detection/README.md b/object_detection/README.md index 4ddb4f37c..e8ac3b0a2 100644 --- a/object_detection/README.md +++ b/object_detection/README.md @@ -1,15 +1,13 @@ # Benchmark execution with MLCube -## Project setup +## Current implementation -```bash -# Create Python environment -virtualenv -p python3 ./env && source ./env/bin/activate +We'll be updating this section as we merge MLCube PRs and make new MLCube releases. -# Install MLCube and MLCube docker runner from GitHub repository (normally, users will just run `pip install mlcube mlcube_docker`) -git clone https://github.com/mlcommons/mlcube && cd mlcube/mlcube -python setup.py bdist_wheel && pip install --force-reinstall ./dist/mlcube-* && cd .. -cd ./runners/mlcube_docker && python setup.py bdist_wheel && pip install --force-reinstall --no-deps ./dist/mlcube_docker-* && cd ../../.. +### Project setup +```Python +# Create Python environment and install MLCube Docker runner +virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker # Fetch the Object Detection workload git clone https://github.com/mlcommons/training && cd ./training @@ -17,7 +15,8 @@ git fetch origin pull/501/head:feature/object_detection && git checkout feature/ cd ./object_detection/mlcube ``` -## Dataset +### Dataset + The COCO dataset will be downloaded and extracted. Sizes of the dataset in each step: @@ -27,7 +26,7 @@ The COCO dataset will be downloaded and extracted. Sizes of the dataset in each | Extract (Uncompressed dataset) | download_data | Jpg/Json files | ~21.2 GB | | Total | (After all tasks) | All | ~41.7 GB | -## Tasks execution +### Tasks execution Parameters are defined at these files: @@ -35,16 +34,19 @@ Parameters are defined at these files: * Project user parameters: pytorch/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml * Project default parameters: pytorch/maskrcnn_benchmark/config/defaults.py -```bash +``` + # Download COCO dataset. Default path = /workspace/data -mlcube run --task download_data +python mlcube_cli.py run --task download_data --platform docker # Run benchmark. Default paths = ./workspace/data -mlcube run --task train +python mlcube_cli.py run --task train --platform docker ``` -By default MLCube images use pull-type installation, so they should be available on docker hub. If not, try this: +Parameters defined at **mculbe/mlcube.yaml** could be overridden using: `--param=input` + +We are targeting pull-type installation, so MLCube images should be available on docker hub. If not, try this: ```bash -mlcube run ... -Pdocker.build_strategy=auto -``` +mlcube run ... 
-Pdocker.build_strategy=always +``` \ No newline at end of file From 3a9fe5f8e97b2cc45f9fae7edf362b5ee5e76365 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 22 Mar 2024 15:56:13 -0500 Subject: [PATCH 08/14] Add MD5 verification --- object_detection/Dockerfile.mlcube | 5 +++-- object_detection/download_dataset.sh | 25 +++++++++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/object_detection/Dockerfile.mlcube b/object_detection/Dockerfile.mlcube index 5fe9dca29..43bd303a2 100644 --- a/object_detection/Dockerfile.mlcube +++ b/object_detection/Dockerfile.mlcube @@ -1,10 +1,11 @@ FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub # install basics RUN apt-get update -y \ - && apt-get install -y apt-utils curl unzip \ + && apt-get install -y apt-utils curl unzip pv \ libglib2.0-0=2.56.1-2ubuntu1 \ libsm6=2:1.2.2-1 \ libxext6=2:1.3.3-1 \ @@ -26,7 +27,7 @@ WORKDIR /workspace COPY . . -RUN chmod +x ./run_and_time.sh ./download_dataset.sh ./install.sh +RUN chmod +x ./run_and_time.sh ./download_dataset.sh ENV PYTHONPATH /workspace/pytorch diff --git a/object_detection/download_dataset.sh b/object_detection/download_dataset.sh index fcd857443..b1c0ded29 100755 --- a/object_detection/download_dataset.sh +++ b/object_detection/download_dataset.sh @@ -29,11 +29,24 @@ echo "Extracting train2017.zip:" n_files=`unzip -l train2017.zip| grep .jpg | wc -l` unzip train2017.zip | { I=-1; while read; do printf "Progress: $((++I*100/$n_files))%%\r"; done; echo ""; } -# TBD: MD5 verification -# $md5sum *.zip *.tgz -#f4bbac642086de4f52a3fdda2de5fa2c annotations_trainval2017.zip -#cced6f7f71b7629ddf16f17bbcfab6b2 train2017.zip -#442b8da7639aecaf257c1dceb8ba8c80 val2017.zip -#2d2b9d2283adb5e3b8d25eec88e65064 coco_annotations_minival.tgz +# MD5 verification +echo "Running MD5 verification ... 
this might take a while" +checkMD5 () { + if [ $(pv -f $1| md5sum | cut -d' ' -f1) = $2 ]; + then + echo "$1 MD5 is valid" + else + echo "*ERROR* $1 MD5 is NOT valid" + fi +} + +echo "validating annotations_trainval2017.zip:" +checkMD5 "annotations_trainval2017.zip" "f4bbac642086de4f52a3fdda2de5fa2c" +echo "validating coco_annotations_minival.tgz:" +checkMD5 "coco_annotations_minival.tgz" "2d2b9d2283adb5e3b8d25eec88e65064" +echo "validating val2017.zip:" +checkMD5 "val2017.zip" "442b8da7639aecaf257c1dceb8ba8c80" +echo "validating train2017.zip:" +checkMD5 "train2017.zip" "cced6f7f71b7629ddf16f17bbcfab6b2" popd From ef001bb64771ba1725c143c4dc1be13b59fca2c9 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 22 Mar 2024 16:01:02 -0500 Subject: [PATCH 09/14] Fix SAVE_CHECKPOINTS flag --- object_detection/mlcube/workspace/parameters.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/object_detection/mlcube/workspace/parameters.yaml b/object_detection/mlcube/workspace/parameters.yaml index 11d6035bf..f90a6c9fd 100644 --- a/object_detection/mlcube/workspace/parameters.yaml +++ b/object_detection/mlcube/workspace/parameters.yaml @@ -1,4 +1,4 @@ -SAVE_CHECKPOINTS: "True" # Instead of False use empty value +SAVE_CHECKPOINTS: "" # Instead of False use empty value SOLVER_MAX_ITER: "100" # Other user parameters are defined at ../../pytorch/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml From 711ecaa440aabc9c8dceef821efb289ac90b6c09 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 22 Mar 2024 16:13:15 -0500 Subject: [PATCH 10/14] Fix MLCube readme --- object_detection/README_OLD.md | 95 ------------------------------- object_detection/mlcube/README.md | 6 +- 2 files changed, 3 insertions(+), 98 deletions(-) delete mode 100644 object_detection/README_OLD.md diff --git a/object_detection/README_OLD.md b/object_detection/README_OLD.md deleted file mode 100644 index 6895735de..000000000 --- a/object_detection/README_OLD.md +++ /dev/null @@ -1,95 +0,0 @@ -# 1. Problem -Object detection and segmentation. Metrics are mask and box mAP. - -# 2. Directions - -### Steps to configure machine - -1. Checkout the MLPerf repository -``` -mkdir -p mlperf -cd mlperf -git clone https://github.com/mlperf/training.git -``` -2. Install CUDA and Docker -``` -source training/install_cuda_docker.sh -``` -3. Build the docker image for the object detection task -``` -cd training/object_detection/ -nvidia-docker build . -t mlperf/object_detection -``` - -4. Run docker container and install code -``` -nvidia-docker run -v .:/workspace -t -i --rm --ipc=host mlperf/object_detection \ - "cd mlperf/training/object_detection && ./install.sh" -``` -Now exit the docker container (Ctrl-D) to get back to your host. - -### Steps to download data -``` -# From training/object_detection/ -source download_dataset.sh -``` - -### Steps to run benchmark. -``` -nvidia-docker run -v .:/workspace -t -i --rm --ipc=host mlperf/object_detection \ - "cd mlperf/training/object_detection && ./run_and_time.sh" -``` - -# 3. Dataset/Environment -### Publication/Attribution -Microsoft COCO: Common Objects in Context - -### Data preprocessing -Only horizontal flips are allowed. - -### Training and test data separation -As provided by MS-COCO (2017 version). - -### Training data order -Randomly. - -### Test data order -Any order. - -# 4. Model -### Publication/Attribution -He, Kaiming, et al. "Mask r-cnn." Computer Vision (ICCV), 2017 IEEE International Conference on. -IEEE, 2017. - -We use a version of Mask R-CNN with a ResNet50 backbone. 
- -### List of layers -Running the timing script will display a list of layers. - -### Weight and bias initialization -The ResNet50 base must be loaded from the provided weights. They may be quantized. - -### Loss function -Multi-task loss (classification, box, mask). Described in the Mask R-CNN paper. - -Classification: Smooth L1 loss - -Box: Log loss for true class. - -Mask: per-pixel sigmoid, average binary cross-entropy loss. - -### Optimizer -Momentum SGD. Weight decay of 0.0001, momentum of 0.9. - -# 5. Quality -### Quality metric -As Mask R-CNN can provide both boxes and masks, we evaluate on both box and mask mAP. - -### Quality target -Box mAP of 0.377, mask mAP of 0.339 - -### Evaluation frequency -Once per epoch, 118k. - -### Evaluation thoroughness -Evaluate over the entire validation set. Use the official COCO API to compute mAP. diff --git a/object_detection/mlcube/README.md b/object_detection/mlcube/README.md index e8ac3b0a2..f1e17ba11 100644 --- a/object_detection/mlcube/README.md +++ b/object_detection/mlcube/README.md @@ -37,10 +37,10 @@ Parameters are defined at these files: ``` # Download COCO dataset. Default path = /workspace/data -python mlcube_cli.py run --task download_data --platform docker +mlcube run --task download_data --platform docker # Run benchmark. Default paths = ./workspace/data -python mlcube_cli.py run --task train --platform docker +mlcube run --task train --platform docker ``` Parameters defined at **mculbe/mlcube.yaml** could be overridden using: `--param=input` @@ -49,4 +49,4 @@ We are targeting pull-type installation, so MLCube images should be available on ```bash mlcube run ... -Pdocker.build_strategy=always -``` \ No newline at end of file +``` From 0db618f6f2cba014845f656093212d06fca48893 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 19 Apr 2024 14:19:54 -0500 Subject: [PATCH 11/14] Add demo tasks --- object_detection/Dockerfile.mlcube | 4 +-- object_detection/download_dataset.sh | 13 +++++++- object_detection/download_demo.sh | 39 ++++++++++++++++++++++++ object_detection/mlcube/.gitignore | 2 ++ object_detection/mlcube/README.md | 31 +++++++++++++++----- object_detection/mlcube/mlcube.yaml | 15 +++++++++- object_detection/run_and_time.sh | 36 ++++++++++++++++------- object_detection/run_demo.sh | 44 ++++++++++++++++++++++++++++ 8 files changed, 161 insertions(+), 23 deletions(-) create mode 100755 object_detection/download_demo.sh create mode 100644 object_detection/mlcube/.gitignore create mode 100755 object_detection/run_demo.sh diff --git a/object_detection/Dockerfile.mlcube b/object_detection/Dockerfile.mlcube index 43bd303a2..5b802e3f9 100644 --- a/object_detection/Dockerfile.mlcube +++ b/object_detection/Dockerfile.mlcube @@ -29,6 +29,4 @@ COPY . . 
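 # `COPY . .` honors the .dockerignore added earlier in this series, keeping
 # the mlcube/ directory (and the workspace data inside it) out of the image.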
RUN chmod +x ./run_and_time.sh ./download_dataset.sh -ENV PYTHONPATH /workspace/pytorch - -ENTRYPOINT ["python", "/workspace/mlcube.py"] \ No newline at end of file +ENV PYTHONPATH /workspace/pytorch \ No newline at end of file diff --git a/object_detection/download_dataset.sh b/object_detection/download_dataset.sh index b1c0ded29..8203c2c92 100755 --- a/object_detection/download_dataset.sh +++ b/object_detection/download_dataset.sh @@ -1,7 +1,18 @@ #!/bin/bash +: "${DATA_ROOT_DIR:=./pytorch/datasets/coco}" + +while [ $# -gt 0 ]; do + case "$1" in + --data_dir=*) + DATA_ROOT_DIR="${1#*=}" + ;; + *) ;; + esac + shift +done + # Get COCO 2017 data sets -DATA_ROOT_DIR="${DATA_ROOT_DIR:-./pytorch/datasets/coco}" echo "Downloaading to folder: $DATA_ROOT_DIR" mkdir -p $DATA_ROOT_DIR pushd $DATA_ROOT_DIR diff --git a/object_detection/download_demo.sh b/object_detection/download_demo.sh new file mode 100755 index 000000000..f2d9896f9 --- /dev/null +++ b/object_detection/download_demo.sh @@ -0,0 +1,39 @@ +#!/bin/bash +: "${DATA_ROOT_DIR:=./pytorch/datasets/coco}" + +while [ $# -gt 0 ]; do + case "$1" in + --data_dir=*) + DATA_ROOT_DIR="${1#*=}" + ;; + *) ;; + esac + shift +done + + +echo "Downloaading demo to folder: $DATA_ROOT_DIR" +mkdir -p $DATA_ROOT_DIR +pushd $DATA_ROOT_DIR + +echo "Downloading annotations_trainval2017.zip:" +curl -O https://storage.googleapis.com/mlperf_training_demo/object_detection/demo_data.zip +echo "Extracting demo_data.zip:" +unzip -o -q demo_data.zip +echo "Done!" + +# MD5 verification +echo "Running MD5 verification ..." +checkMD5 () { + if [ $(pv -f $1| md5sum | cut -d' ' -f1) = $2 ]; + then + echo "$1 MD5 is valid" + else + echo "*ERROR* $1 MD5 is NOT valid" + fi +} + +echo "validating demo_data.zip:" +checkMD5 "demo_data.zip" "1b50202a21b0d8c3235d0a6f39b6f40c" + +popd diff --git a/object_detection/mlcube/.gitignore b/object_detection/mlcube/.gitignore new file mode 100644 index 000000000..98541c90f --- /dev/null +++ b/object_detection/mlcube/.gitignore @@ -0,0 +1,2 @@ +workspace/ +!workspace/parameter.yaml \ No newline at end of file diff --git a/object_detection/mlcube/README.md b/object_detection/mlcube/README.md index f1e17ba11..8d77a2925 100644 --- a/object_detection/mlcube/README.md +++ b/object_detection/mlcube/README.md @@ -1,9 +1,5 @@ # Benchmark execution with MLCube -## Current implementation - -We'll be updating this section as we merge MLCube PRs and make new MLCube releases. - ### Project setup ```Python # Create Python environment and install MLCube Docker runner @@ -34,15 +30,34 @@ Parameters are defined at these files: * Project user parameters: pytorch/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml * Project default parameters: pytorch/maskrcnn_benchmark/config/defaults.py -``` - +```bash # Download COCO dataset. Default path = /workspace/data -mlcube run --task download_data --platform docker +mlcube run --task=download_data -Pdocker.build_strategy=always # Run benchmark. Default paths = ./workspace/data -mlcube run --task train --platform docker +mlcube run --task=train -Pdocker.build_strategy=always +``` + +### Demo execution + +These tasks will use a demo dataset (39M) to execute a faster training workload for a quick demo (~12 min): + +```bash +# Download subsampled dataset. Default path = /workspace/demo +mlcube run --task=download_data -Pdocker.build_strategy=always + +# Run benchmark. 
Default paths = ./workspace/demo and ./workspace/demo_output +mlcube run --task=demo -Pdocker.build_strategy=always +``` + +It's also possible to execute the two tasks in one single instruction: + +```bash +mlcube run --task=download_demo,demo -Pdocker.build_strategy=always ``` +### Aditonal options + Parameters defined at **mculbe/mlcube.yaml** could be overridden using: `--param=input` We are targeting pull-type installation, so MLCube images should be available on docker hub. If not, try this: diff --git a/object_detection/mlcube/mlcube.yaml b/object_detection/mlcube/mlcube.yaml index 288bc7d15..b0c693d58 100644 --- a/object_detection/mlcube/mlcube.yaml +++ b/object_detection/mlcube/mlcube.yaml @@ -14,18 +14,31 @@ docker: build_context: "../" # Docker file name within docker build context, default is `Dockerfile`. build_file: "Dockerfile.mlcube" + gpu_args: "--gpus=all --shm-size=1g" tasks: # Download LibriSpeech dataset download_data: + entrypoint: ./download_dataset.sh -a parameters: # Directory for uncompressed datasets. Total size is ~ 65G. outputs: {data_dir: data/} train: # Train Object Detection model + entrypoint: ./run_and_time.sh -a parameters: # data_dir: see Download::data_dir, it takes the processes subfolder # parameters_file: Yaml file with training parameters. inputs: {data_dir: data/, parameters_file: {type: file, default: parameters.yaml}} # Model output folder - outputs: {output_dir: output/} \ No newline at end of file + outputs: {output_dir: output/} + download_demo: + entrypoint: ./download_demo.sh -a + parameters: + # Directory for uncompressed datasets. Total size is ~ 65G. + outputs: {data_dir: demo/} + demo: + entrypoint: ./run_demo.sh -a + parameters: + inputs: {data_dir: demo/, parameters_file: {type: file, default: parameters.yaml}} + outputs: {output_dir: demo_output/} \ No newline at end of file diff --git a/object_detection/run_and_time.sh b/object_detection/run_and_time.sh index f7709d19e..69cd434c6 100755 --- a/object_detection/run_and_time.sh +++ b/object_detection/run_and_time.sh @@ -1,11 +1,29 @@ #!/bin/bash +# Runs benchmark and reports time to convergence +pushd pytorch +python setup.py clean build develop --user + +: "${DATA_DIR:=pytorch/datasets/coco}" +: "${OUTPUT_DIR:=pytorch/output}" +while [ $# -gt 0 ]; do + case "$1" in + --data_dir=*) + DATA_DIR="${1#*=}" + ;; + --output_dir=*) + OUTPUT_DIR="${1#*=}" + ;; + *) ;; + esac + shift +done -DATA_DIR="${DATA_DIR:-pytorch/datasets/coco}" -DATA_ROOT_TARGET="pytorch/datasets/coco" -OUTPUT_DIR="${OUTPUT_DIR:-}" +echo "DATA_DIR" +echo $DATA_DIR -SAVE_CHECKPOINTS="${SAVE_CHECKPOINTS:-}" +DATA_ROOT_TARGET="datasets/coco" +SAVE_CHECKPOINTS="${SAVE_CHECKPOINTS:-False}" SOLVER_MAX_ITER="${SOLVER_MAX_ITER:-40000}" #Link input data paths @@ -15,15 +33,13 @@ if [ "$DATA_DIR" != "$DATA_ROOT_TARGET" ]; then ln -s $DATA_DIR/train2017 $DATA_ROOT_TARGET/train2017 ln -s $DATA_DIR/test2017 $DATA_ROOT_TARGET/test2017 ln -s $DATA_DIR/val2017 $DATA_ROOT_TARGET/val2017 - echo $DATA_ROOT_TARGET - ls -lah $DATA_ROOT_TARGET fi -# Runs benchmark and reports time to convergence -pushd pytorch +pwd + # Single GPU training time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \ - SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" \ - SOLVER.MAX_ITER $SOLVER_MAX_ITER SOLVER.BASE_LR 0.0025 SAVE_CHECKPOINTS $SAVE_CHECKPOINTS OUTPUT_DIR $OUTPUT_DIR + SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 \ + SOLVER.STEPS "(480000, 
640000)" SOLVER.BASE_LR 0.0025 OUTPUT_DIR $OUTPUT_DIR popd diff --git a/object_detection/run_demo.sh b/object_detection/run_demo.sh new file mode 100755 index 000000000..072895ca6 --- /dev/null +++ b/object_detection/run_demo.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Runs benchmark and reports time to convergence +pushd pytorch +python setup.py clean build develop --user + +: "${DATA_DIR:=pytorch/datasets/coco}" +: "${OUTPUT_DIR:=pytorch/output}" + +while [ $# -gt 0 ]; do + case "$1" in + --data_dir=*) + DATA_DIR="${1#*=}" + ;; + --output_dir=*) + OUTPUT_DIR="${1#*=}" + ;; + *) ;; + esac + shift +done + +echo "DATA_DIR" +echo $DATA_DIR + +DATA_ROOT_TARGET="datasets/coco" +SAVE_CHECKPOINTS="${SAVE_CHECKPOINTS:-False}" +SOLVER_MAX_ITER="${SOLVER_MAX_ITER:-40000}" + +#Link input data paths +if [ "$DATA_DIR" != "$DATA_ROOT_TARGET" ]; then + mkdir -p $DATA_ROOT_TARGET + ln -s $DATA_DIR/annotations $DATA_ROOT_TARGET/annotations + ln -s $DATA_DIR/train2017 $DATA_ROOT_TARGET/train2017 + ln -s $DATA_DIR/val2017 $DATA_ROOT_TARGET/val2017 +fi + +pwd + +# Single GPU training +time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \ + SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720 \ + SOLVER.STEPS "(480, 640)" SOLVER.BASE_LR 0.0025 OUTPUT_DIR $OUTPUT_DIR + +popd From 573f7823eb0ad66a9545902c1d6f101b5486f7d9 Mon Sep 17 00:00:00 2001 From: David Jurado Date: Fri, 26 Apr 2024 10:59:42 -0500 Subject: [PATCH 12/14] update scripts --- object_detection/README.md | 27 ++++++++++- object_detection/download_dataset.sh | 38 ++++++++++------ object_detection/download_demo.sh | 14 +++--- object_detection/mlcube.py | 67 ---------------------------- object_detection/run_and_time.sh | 16 +++---- object_detection/run_demo.sh | 14 +++--- 6 files changed, 71 insertions(+), 105 deletions(-) delete mode 100644 object_detection/mlcube.py diff --git a/object_detection/README.md b/object_detection/README.md index 2761dad4d..2754eec03 100644 --- a/object_detection/README.md +++ b/object_detection/README.md @@ -1,4 +1,5 @@ # 1. Problem + Object detection and segmentation. Metrics are mask and box mAP. # 2. Directions @@ -6,63 +7,81 @@ Object detection and segmentation. Metrics are mask and box mAP. ### Steps to configure machine 1. Checkout the MLPerf repository + ``` mkdir -p mlperf cd mlperf git clone https://github.com/mlperf/training.git ``` + 2. Install CUDA and Docker + ``` source training/install_cuda_docker.sh ``` + 3. Build the docker image for the object detection task + ``` cd training/object_detection/ nvidia-docker build . -t mlperf/object_detection ``` ### Steps to download data + ``` # From training/object_detection/ source download_dataset.sh ``` -### Steps to run benchmark. +### Steps to run benchmark + ``` nvidia-docker run -v .:/workspace -t -i --rm --ipc=host mlperf/object_detection \ "cd mlperf/training/object_detection && ./run_and_time.sh" ``` # 3. Dataset/Environment + ### Publication/Attribution + Microsoft COCO: Common Objects in Context ### Data preprocessing + Only horizontal flips are allowed. ### Training and test data separation + As provided by MS-COCO (2017 version). ### Training data order + Randomly. ### Test data order + Any order. # 4. Model + ### Publication/Attribution + He, Kaiming, et al. "Mask r-cnn." Computer Vision (ICCV), 2017 IEEE International Conference on. IEEE, 2017. We use a version of Mask R-CNN with a ResNet-50 backbone. ### List of layers + Running the timing script will display a list of layers. 
 ### Weight and bias initialization
+
 The ResNet-50 base must be loaded from the provided weights. They may be quantized.
 
 ### Loss function
+
 Multi-task loss (classification, box, mask). Described in the Mask R-CNN paper.
 
 Classification: Smooth L1 loss
@@ -72,17 +91,23 @@ Box: Log loss for true class.
 Mask: per-pixel sigmoid, average binary cross-entropy loss.
 
 ### Optimizer
+
 Momentum SGD. Weight decay of 0.0001, momentum of 0.9.
 
 # 5. Quality
+
 ### Quality metric
+
 As Mask R-CNN can provide both boxes and masks, we evaluate on both box and mask mAP.
 
 ### Quality target
+
 Box mAP of 0.377, mask mAP of 0.339
 
 ### Evaluation frequency
+
 Once per epoch, 118k.
 
 ### Evaluation thoroughness
+
 Evaluate over the entire validation set. Use the official COCO API to compute mAP.
diff --git a/object_detection/download_dataset.sh b/object_detection/download_dataset.sh
index 8203c2c92..8b5c68d71 100755
--- a/object_detection/download_dataset.sh
+++ b/object_detection/download_dataset.sh
@@ -11,7 +11,6 @@ while [ $# -gt 0 ]; do
   shift
 done
 
-
 # Get COCO 2017 data sets
 echo "Downloading to folder: $DATA_ROOT_DIR"
 mkdir -p $DATA_ROOT_DIR
@@ -25,30 +24,41 @@ tar -xzf coco_annotations_minival.tgz &>/dev/null
 echo "Downloading annotations_trainval2017.zip:"
 curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip
 echo "Extracting annotations_trainval2017.zip:"
-n_files=`unzip -l annotations_trainval2017.zip| grep .json | wc -l`
-unzip annotations_trainval2017.zip | { I=-1; while read; do printf "Progress: $((++I*100/$n_files))%%\r"; done; echo ""; }
+n_files=$(unzip -l annotations_trainval2017.zip | grep .json | wc -l)
+unzip annotations_trainval2017.zip | {
+	I=-1
+	while read; do printf "Progress: $((++I * 100 / $n_files))%%\r"; done
+	echo ""
+}
 
 echo "Downloading val2017.zip:"
 curl -O http://images.cocodataset.org/zips/val2017.zip
 echo "Extracting val2017.zip:"
-n_files=`unzip -l val2017.zip| grep .jpg | wc -l`
-unzip val2017.zip | { I=-1; while read; do printf "Progress: $((++I*100/$n_files))%%\r"; done; echo ""; }
+n_files=$(unzip -l val2017.zip | grep .jpg | wc -l)
+unzip val2017.zip | {
+	I=-1
+	while read; do printf "Progress: $((++I * 100 / $n_files))%%\r"; done
+	echo ""
+}
 
 echo "Downloading train2017.zip:"
 curl -O http://images.cocodataset.org/zips/train2017.zip
 echo "Extracting train2017.zip:"
-n_files=`unzip -l train2017.zip| grep .jpg | wc -l`
-unzip train2017.zip | { I=-1; while read; do printf "Progress: $((++I*100/$n_files))%%\r"; done; echo ""; }
+n_files=$(unzip -l train2017.zip | grep .jpg | wc -l)
+unzip train2017.zip | {
+	I=-1
+	while read; do printf "Progress: $((++I * 100 / $n_files))%%\r"; done
+	echo ""
+}
 
 # MD5 verification
 echo "Running MD5 verification ... this might take a while"
-checkMD5 () {
-  if [ $(pv -f $1| md5sum | cut -d' ' -f1) = $2 ];
-  then
-    echo "$1 MD5 is valid"
-  else
-    echo "*ERROR* $1 MD5 is NOT valid"
-  fi
+checkMD5() {
+	if [ $(pv -f $1 | md5sum | cut -d' ' -f1) = $2 ]; then
+		echo "$1 MD5 is valid"
+	else
+		echo "*ERROR* $1 MD5 is NOT valid"
+	fi
 }
 
 echo "validating annotations_trainval2017.zip:"
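The reformatted extraction block above is a small progress meter: it first counts the matching entries in the archive listing, then consumes `unzip`'s one-line-per-file output, rewriting a percentage in place with a carriage return. A self-contained sketch of the same idiom, with `example.zip` as a hypothetical archive name:

```bash
#!/bin/bash
# Hypothetical archive; stands in for val2017.zip, train2017.zip, etc.
archive="example.zip"

# Count the .jpg entries in the listing to establish the total.
n_files=$(unzip -l "$archive" | grep -c '\.jpg')

# unzip emits one line per extracted file; each line read advances the counter.
unzip "$archive" | {
  I=-1
  while read -r _; do
    printf "Progress: %d%%\r" $(( (++I) * 100 / n_files ))
  done
  echo ""
}
```

Piping into a brace group keeps the whole loop in a single subshell, so `I` persists across iterations even though the pipeline itself forks.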
diff --git a/object_detection/download_demo.sh b/object_detection/download_demo.sh
index f2d9896f9..b2266604d 100755
--- a/object_detection/download_demo.sh
+++ b/object_detection/download_demo.sh
@@ -11,7 +11,6 @@ while [ $# -gt 0 ]; do
   shift
 done
 
-
 echo "Downloading demo to folder: $DATA_ROOT_DIR"
 mkdir -p $DATA_ROOT_DIR
 pushd $DATA_ROOT_DIR
@@ -24,13 +23,12 @@ echo "Done!"
 
 # MD5 verification
 echo "Running MD5 verification ..."
-checkMD5 () {
-  if [ $(pv -f $1| md5sum | cut -d' ' -f1) = $2 ];
-  then
-    echo "$1 MD5 is valid"
-  else
-    echo "*ERROR* $1 MD5 is NOT valid"
-  fi
+checkMD5() {
+	if [ $(pv -f $1 | md5sum | cut -d' ' -f1) = $2 ]; then
+		echo "$1 MD5 is valid"
+	else
+		echo "*ERROR* $1 MD5 is NOT valid"
+	fi
 }
 
 echo "validating demo_data.zip:"
diff --git a/object_detection/mlcube.py b/object_detection/mlcube.py
deleted file mode 100644
index b3f8c2d03..000000000
--- a/object_detection/mlcube.py
+++ /dev/null
@@ -1,67 +0,0 @@
-"""MLCube handler file"""
-import os
-import yaml
-import typer
-import shutil
-import subprocess
-from pathlib import Path
-
-
-app = typer.Typer()
-
-class DownloadDataTask(object):
-    """Download task Class
-    It defines the environment variables:
-        DATA_ROOT_DIR: Directory path to download the dataset
-    Then executes the download script"""
-    @staticmethod
-    def run(data_dir: str) -> None:
-
-        env = os.environ.copy()
-        env.update({
-            'DATA_ROOT_DIR': data_dir,
-        })
-
-        process = subprocess.Popen("./download_dataset.sh", cwd=".", env=env)
-        process.wait()
-
-class TrainTask(object):
-    """Preprocess dataset task Class
-    It defines the environment variables:
-        DATA_DIR: Dataset directory path
-        OUTPUT_DIR: Directory path where model will be saved
-        All other parameters defined in parameters_file
-    Then executes the benchmark script"""
-    @staticmethod
-    def run(data_dir: str, output_dir: str, parameters_file: str) -> None:
-
-        process = subprocess.Popen("./install.sh", cwd=".")
-        process.wait()
-
-        with open(parameters_file, 'r') as stream:
-            parameters = yaml.safe_load(stream)
-
-        env = os.environ.copy()
-        env.update({
-            'DATA_DIR': data_dir,
-            'OUTPUT_DIR': output_dir,
-        })
-
-        if parameters is not None:
-            env.update(parameters)
-
-        process = subprocess.Popen("./run_and_time.sh", cwd=".", env=env)
-        process.wait()
-
-@app.command("download_data")
-def download_data(data_dir: str = typer.Option(..., '--data_dir')):
-    DownloadDataTask.run(data_dir)
-
-@app.command("train")
-def train(data_dir: str = typer.Option(..., '--data_dir'),
-          output_dir: str = typer.Option(..., '--output_dir'),
-          parameters_file: str = typer.Option(..., '--parameters_file')):
-    TrainTask.run(data_dir, output_dir, parameters_file)
-
-if __name__ == '__main__':
-    app()
\ No newline at end of file
diff --git a/object_detection/run_and_time.sh b/object_detection/run_and_time.sh
index 69cd434c6..e26bc9125 100755
--- a/object_detection/run_and_time.sh
+++ b/object_detection/run_and_time.sh
@@ -28,18 +28,18 @@ SOLVER_MAX_ITER="${SOLVER_MAX_ITER:-40000}"
 
 #Link input data paths
 if [ "$DATA_DIR" != "$DATA_ROOT_TARGET" ]; then
-  mkdir -p $DATA_ROOT_TARGET
-  ln -s $DATA_DIR/annotations $DATA_ROOT_TARGET/annotations
-  ln -s $DATA_DIR/train2017 $DATA_ROOT_TARGET/train2017
-  ln -s $DATA_DIR/test2017 $DATA_ROOT_TARGET/test2017
-  ln -s $DATA_DIR/val2017 $DATA_ROOT_TARGET/val2017
+	mkdir -p $DATA_ROOT_TARGET
+	ln -s $DATA_DIR/annotations $DATA_ROOT_TARGET/annotations
+	ln -s $DATA_DIR/train2017 $DATA_ROOT_TARGET/train2017
+	ln -s $DATA_DIR/test2017 $DATA_ROOT_TARGET/test2017
+	ln -s $DATA_DIR/val2017 $DATA_ROOT_TARGET/val2017
 fi
 
 pwd
 
 # Single GPU training
 time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \
-  SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 \
-  SOLVER.STEPS "(480000, 640000)" SOLVER.BASE_LR 0.0025 OUTPUT_DIR $OUTPUT_DIR
-
+  SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 \
+  SOLVER.STEPS "(480000, 640000)" SOLVER.BASE_LR 0.0025 OUTPUT_DIR $OUTPUT_DIR
+
 popd
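Both run scripts drive `tools/train_mlperf.py` the same way: everything after `--config-file` is a flat list of `KEY VALUE` pairs merged over the YAML config, following the yacs-style override convention of the maskrcnn_benchmark code base. For example, a hypothetical shortened run from inside `pytorch/` could override the schedule like this (the keys all appear in the diffs above; the values are made up for illustration):

```bash
# Hypothetical smoke-test invocation; not part of the patch series.
time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \
  SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 \
  SOLVER.MAX_ITER 1000 SOLVER.STEPS "(600, 800)" \
  SOLVER.BASE_LR 0.0025 OUTPUT_DIR /tmp/output
```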
diff --git a/object_detection/run_demo.sh b/object_detection/run_demo.sh
index 072895ca6..efb1c0c98 100755
--- a/object_detection/run_demo.sh
+++ b/object_detection/run_demo.sh
@@ -28,17 +28,17 @@ SOLVER_MAX_ITER="${SOLVER_MAX_ITER:-40000}"
 
 #Link input data paths
 if [ "$DATA_DIR" != "$DATA_ROOT_TARGET" ]; then
-  mkdir -p $DATA_ROOT_TARGET
-  ln -s $DATA_DIR/annotations $DATA_ROOT_TARGET/annotations
-  ln -s $DATA_DIR/train2017 $DATA_ROOT_TARGET/train2017
-  ln -s $DATA_DIR/val2017 $DATA_ROOT_TARGET/val2017
+	mkdir -p $DATA_ROOT_TARGET
+	ln -s $DATA_DIR/annotations $DATA_ROOT_TARGET/annotations
+	ln -s $DATA_DIR/train2017 $DATA_ROOT_TARGET/train2017
+	ln -s $DATA_DIR/val2017 $DATA_ROOT_TARGET/val2017
 fi
 
 pwd
 
 # Single GPU training
 time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \
-  SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720 \
-  SOLVER.STEPS "(480, 640)" SOLVER.BASE_LR 0.0025 OUTPUT_DIR $OUTPUT_DIR
-
+  SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720 \
+  SOLVER.STEPS "(480, 640)" SOLVER.BASE_LR 0.0025 OUTPUT_DIR $OUTPUT_DIR
+
 popd

From 9f9ab233714cba9f8f570cdcb3e283f8a1a82dea Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 26 Jul 2024 09:43:41 -0500
Subject: [PATCH 13/14] update demo download link

---
 object_detection/download_demo.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/object_detection/download_demo.sh b/object_detection/download_demo.sh
index b2266604d..4a0fc10ac 100755
--- a/object_detection/download_demo.sh
+++ b/object_detection/download_demo.sh
@@ -16,9 +16,10 @@ mkdir -p $DATA_ROOT_DIR
 pushd $DATA_ROOT_DIR
 
-echo "Downloading demo_data.zip:"
-curl -O https://storage.googleapis.com/mlperf_training_demo/object_detection/demo_data.zip
-echo "Extracting demo_data.zip:"
-unzip -o -q demo_data.zip
+echo "Downloading object_detection.zip:"
+curl -O https://mlcube.mlcommons-storage.org/minibenchmarks/object_detection.zip
+echo "Extracting object_detection.zip:"
+unzip -o -q object_detection.zip
+rm object_detection.zip
 echo "Done!"
 
 # MD5 verification
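For reference, what the `checkMD5` helper automates can be reproduced by hand. The following sketch mirrors its `pv | md5sum | cut` pipeline against the new download URL, using the digest the script itself carries (whether that digest matches the hosted archive is the script's claim, not verified here):

```bash
# Manual equivalent of download_demo.sh's MD5 check.
curl -O https://mlcube.mlcommons-storage.org/minibenchmarks/object_detection.zip

# pv streams the file (showing progress) into md5sum;
# the first space-separated field of md5sum's output is the digest.
actual=$(pv -f object_detection.zip | md5sum | cut -d' ' -f1)
expected="1b50202a21b0d8c3235d0a6f39b6f40c"
if [ "$actual" = "$expected" ]; then
  echo "object_detection.zip MD5 is valid"
else
  echo "*ERROR* object_detection.zip MD5 is NOT valid"
fi
```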
From 489ed0fc80bc3fde3daf0250042712cf000dfc0c Mon Sep 17 00:00:00 2001
From: David Jurado
Date: Fri, 15 Nov 2024 10:18:03 -0500
Subject: [PATCH 14/14] Fix dependencies

---
 object_detection/Dockerfile.mlcube       | 14 ++---
 object_detection/download_demo.sh        |  4 ++--
 object_detection/mlcube_requirements.txt | 77 ++++++++++++++++++++++++
 3 files changed, 83 insertions(+), 12 deletions(-)
 create mode 100644 object_detection/mlcube_requirements.txt

diff --git a/object_detection/Dockerfile.mlcube b/object_detection/Dockerfile.mlcube
index 5b802e3f9..009f22751 100644
--- a/object_detection/Dockerfile.mlcube
+++ b/object_detection/Dockerfile.mlcube
@@ -1,4 +1,4 @@
-FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel
+FROM pytorch/pytorch:1.7.0-cuda11.0-cudnn8-devel
 
 RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
 RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
@@ -6,20 +6,14 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 
 # install basics
 RUN apt-get update -y \
  && apt-get install -y apt-utils curl unzip pv \
+    libgl1-mesa-glx \
     libglib2.0-0=2.56.1-2ubuntu1 \
     libsm6=2:1.2.2-1 \
     libxext6=2:1.3.3-1 \
     libxrender-dev=1:0.9.10-1
 
-RUN pip install ninja==1.8.2.post2 \
-    yacs==0.1.5 \
-    cython==0.29.5 \
-    matplotlib==3.0.2 \
-    opencv-python==4.0.0.21 \
-    mlperf_compliance==0.0.10 \
-    torchvision==0.2.2 \
-    pycocotools==2.0.2 \
-    typer==0.3.2
+COPY mlcube_requirements.txt /root/mlcube_requirements.txt
+RUN pip install -r /root/mlcube_requirements.txt
 
 RUN pip install --no-cache-dir https://github.com/mlperf/logging/archive/9ea0afa.zip
 
diff --git a/object_detection/download_demo.sh b/object_detection/download_demo.sh
index 4a0fc10ac..86cd1964b 100755
--- a/object_detection/download_demo.sh
+++ b/object_detection/download_demo.sh
@@ -19,7 +19,6 @@ echo "Downloading object_detection.zip:"
 curl -O https://mlcube.mlcommons-storage.org/minibenchmarks/object_detection.zip
 echo "Extracting object_detection.zip:"
 unzip -o -q object_detection.zip
-rm object_detection.zip
 echo "Done!"
# MD5 verification @@ -33,6 +32,7 @@ checkMD5() { } echo "validating demo_data.zip:" -checkMD5 "demo_data.zip" "1b50202a21b0d8c3235d0a6f39b6f40c" +checkMD5 "object_detection.zip" "1b50202a21b0d8c3235d0a6f39b6f40c" +rm object_detection.zip popd diff --git a/object_detection/mlcube_requirements.txt b/object_detection/mlcube_requirements.txt new file mode 100644 index 000000000..1fe413916 --- /dev/null +++ b/object_detection/mlcube_requirements.txt @@ -0,0 +1,77 @@ +backcall==0.2.0 +beautifulsoup4==4.9.3 +certifi==2020.6.20 +cffi==1.14.0 +chardet==3.0.4 +click==8.1.7 +conda==4.9.1 +conda-build==3.20.5 +conda-package-handling==1.7.0 +contourpy==1.1.1 +cryptography==2.9.2 +cycler==0.12.1 +Cython==3.0.11 +dataclasses==0.6 +decorator==4.4.2 +dnspython==2.0.0 +filelock==3.0.12 +fonttools==4.54.1 +future==0.18.2 +glob2==0.7 +idna==2.9 +importlib-resources==6.4.5 +ipython==7.18.1 +ipython-genutils==0.2.0 +jedi==0.17.2 +Jinja2==2.11.2 +kiwisolver==1.4.7 +libarchive-c==2.9 +markdown-it-py==3.0.0 +MarkupSafe==1.1.1 +matplotlib==3.7.5 +mdurl==0.1.2 +mkl-fft==1.2.0 +mkl-random==1.1.1 +mkl-service==2.3.0 +mlperf-compliance==0.0.10 +ninja==1.11.1.1 +numpy==1.19.5 +olefile==0.46 +opencv-python==4.10.0.84 +packaging==24.2 +parso==0.7.0 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==8.0.0 +pkginfo==1.6.0 +prompt-toolkit==3.0.8 +psutil==5.7.2 +ptyprocess==0.6.0 +pycocotools==2.0.7 +pycosat==0.6.3 +pycparser==2.20 +pygments==2.18.0 +pyOpenSSL==19.1.0 +pyparsing==3.1.4 +PySocks==1.7.1 +python-dateutil==2.9.0.post0 +python-etcd==0.4.5 +pytz==2020.1 +PyYAML==5.3.1 +requests==2.23.0 +rich==13.9.4 +ruamel-yaml==0.15.87 +shellingham==1.5.4 +six==1.14.0 +soupsieve==2.0.1 +torch==1.7.0 +torchelastic==0.2.1 +torchvision==0.8.0 +tqdm==4.46.0 +traitlets==5.0.5 +typer==0.13.0 +typing-extensions==3.7.4.3 +urllib3==1.25.8 +wcwidth==0.2.5 +yacs==0.1.8 +zipp==3.20.2
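Taken together, the series leaves two ways to run each workload: through the MLCube tasks defined in `mlcube/mlcube.yaml`, or by calling the shell scripts directly. As a closing sketch, a manual demo run outside MLCube might look like the following (the paths are placeholders, and this assumes the dependencies from `mlcube_requirements.txt` are already installed):

```bash
# Fetch the subsampled demo dataset; --data_dir overrides the script's default.
./download_demo.sh --data_dir=/tmp/demo_data

# Train on it: run_demo.sh symlinks the data under pytorch/datasets/coco and
# writes its output to the requested directory.
./run_demo.sh --data_dir=/tmp/demo_data --output_dir=/tmp/demo_output
```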