Add MLCube support for Object Detection Benchmark #501

Open · wants to merge 15 commits into base: master
2 changes: 2 additions & 0 deletions object_detection/.dockerignore
@@ -0,0 +1,2 @@
# Do not add MLCube's workspace directory.
mlcube/
2 changes: 2 additions & 0 deletions object_detection/.gitignore
@@ -0,0 +1,2 @@
mlcube/workspace/data
mlcube/workspace/output
26 changes: 26 additions & 0 deletions object_detection/Dockerfile.mlcube
@@ -0,0 +1,26 @@
FROM pytorch/pytorch:1.7.0-cuda11.0-cudnn8-devel

RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub

# install basics
RUN apt-get update -y \
&& apt-get install -y apt-utils curl unzip pv \
libgl1-mesa-glx \
libglib2.0-0=2.56.1-2ubuntu1 \
libsm6=2:1.2.2-1 \
libxext6=2:1.3.3-1 \
libxrender-dev=1:0.9.10-1

COPY mlcube_requirements.txt /root/mlcube_requirements.txt
RUN pip install -r /root/mlcube_requirements.txt

RUN pip install --no-cache-dir https://github.com/mlperf/logging/archive/9ea0afa.zip

WORKDIR /workspace

COPY . .

RUN chmod +x ./run_and_time.sh ./download_dataset.sh

ENV PYTHONPATH /workspace/pytorch
27 changes: 26 additions & 1 deletion object_detection/README.md
@@ -1,68 +1,87 @@
# 1. Problem

Object detection and segmentation. Metrics are mask and box mAP.

# 2. Directions

### Steps to configure machine

1. Checkout the MLPerf repository

```
mkdir -p mlperf
cd mlperf
git clone https://github.com/mlperf/training.git
```

2. Install CUDA and Docker

```
source training/install_cuda_docker.sh
```

3. Build the docker image for the object detection task

```
cd training/object_detection/
nvidia-docker build . -t mlperf/object_detection
```

### Steps to download data

```
# From training/object_detection/
source download_dataset.sh
```

### Steps to run benchmark

```
nvidia-docker run -v "$(pwd)":/workspace -t -i --rm --ipc=host mlperf/object_detection \
"cd mlperf/training/object_detection && ./run_and_time.sh"
```

# 3. Dataset/Environment

### Publication/Attribution

Microsoft COCO: Common Objects in Context

### Data preprocessing

Only horizontal flips are allowed.

### Training and test data separation

As provided by MS-COCO (2017 version).

### Training data order

Randomly.

### Test data order

Any order.

# 4. Model

### Publication/Attribution

He, Kaiming, et al. "Mask r-cnn." Computer Vision (ICCV), 2017 IEEE International Conference on.
IEEE, 2017.

We use a version of Mask R-CNN with a ResNet-50 backbone.

### List of layers

Running the timing script will display a list of layers.

### Weight and bias initialization

The ResNet-50 base must be loaded from the provided weights. They may be quantized.

### Loss function

Multi-task loss (classification, box, mask). Described in the Mask R-CNN paper.

Classification: Smooth L1 loss
@@ -72,17 +91,23 @@
Box: Log loss for true class.
Mask: per-pixel sigmoid, average binary cross-entropy loss.
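
Putting the three terms above together, the per-RoI multi-task objective (a sketch following the Mask R-CNN paper's notation, where `m` is the predicted mask resolution) is:

```latex
L = L_{cls} + L_{box} + L_{mask},
\qquad
L_{mask} = -\frac{1}{m^{2}} \sum_{1 \le i,j \le m}
\Big[\, y_{ij} \log \hat{y}_{ij} + (1 - y_{ij}) \log\big(1 - \hat{y}_{ij}\big) \Big]
```

Here `\hat{y}_{ij}` is the per-pixel sigmoid output at pixel `(i, j)` of the mask predicted for the ground-truth class, so `L_{mask}` is the average binary cross-entropy described above.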

### Optimizer

Momentum SGD. Weight decay of 0.0001, momentum of 0.9.

# 5. Quality

### Quality metric

As Mask R-CNN can provide both boxes and masks, we evaluate on both box and mask mAP.

### Quality target

Box mAP of 0.377, mask mAP of 0.339.

### Evaluation frequency

Once per epoch (every 118k training samples).

### Evaluation thoroughness

Evaluate over the entire validation set. Use the official COCO API to compute mAP.
77 changes: 62 additions & 15 deletions object_detection/download_dataset.sh
@@ -1,26 +1,73 @@
#!/bin/bash
: "${DATA_ROOT_DIR:=./pytorch/datasets/coco}"

while [ $# -gt 0 ]; do
    case "$1" in
    --data_dir=*)
        DATA_ROOT_DIR="${1#*=}"
        ;;
    *) ;;
    esac
    shift
done
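
The flag-parsing loop above relies on the `"${1#*=}"` parameter expansion, which strips everything up to and including the first `=` in the argument. A standalone sketch of the same pattern (function name is illustrative, not part of the script):

```shell
#!/usr/bin/env bash
# Sketch of the --data_dir=VALUE parsing used above: "${1#*=}" removes the
# shortest prefix matching "*=", leaving only the value after the '='.
parse_data_dir() {
    local data_dir="./pytorch/datasets/coco"   # default, as in the script
    while [ $# -gt 0 ]; do
        case "$1" in
        --data_dir=*)
            data_dir="${1#*=}"
            ;;
        *) ;;                                  # ignore unknown flags
        esac
        shift
    done
    echo "$data_dir"
}

parse_data_dir --data_dir=/tmp/coco   # prints: /tmp/coco
```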

# Get COCO 2017 data sets
echo "Downloading to folder: $DATA_ROOT_DIR"
mkdir -p "$DATA_ROOT_DIR"
pushd "$DATA_ROOT_DIR"

echo "Downloading coco_annotations_minival.tgz:"
curl -O https://dl.fbaipublicfiles.com/detectron/coco/coco_annotations_minival.tgz
echo "Extracting coco_annotations_minival.tgz:"
tar -xzf coco_annotations_minival.tgz &>/dev/null

echo "Downloading annotations_trainval2017.zip:"
curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip
echo "Extracting annotations_trainval2017.zip:"
n_files=$(unzip -l annotations_trainval2017.zip | grep .json | wc -l)
unzip annotations_trainval2017.zip | {
    I=-1
    while read -r _; do printf "Progress: $((++I * 100 / n_files))%%\r"; done
    echo ""
}

echo "Downloading val2017.zip:"
curl -O http://images.cocodataset.org/zips/val2017.zip
unzip val2017.zip
echo "Extracting val2017.zip:"
n_files=$(unzip -l val2017.zip | grep .jpg | wc -l)
unzip val2017.zip | {
    I=-1
    while read -r _; do printf "Progress: $((++I * 100 / n_files))%%\r"; done
    echo ""
}

echo "Downloading train2017.zip:"
curl -O http://images.cocodataset.org/zips/train2017.zip
echo "Extracting train2017.zip:"
n_files=$(unzip -l train2017.zip | grep .jpg | wc -l)
unzip train2017.zip | {
    I=-1
    while read -r _; do printf "Progress: $((++I * 100 / n_files))%%\r"; done
    echo ""
}
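
The piped-progress idiom used for each archive above works because `unzip` prints one line per extracted file, so counting lines read from the pipe approximates progress. A minimal standalone sketch (function name and input are illustrative):

```shell
#!/usr/bin/env bash
# Sketch of the progress meter above: each line read from the pipe advances
# a counter, and "\r" rewrites the percentage in place on the same line.
show_progress() {
    n_files=$1
    I=0
    while read -r _; do
        printf "Progress: %d%%\r" "$(( I * 100 / n_files ))"
        I=$(( I + 1 ))
    done
    echo ""
}

# Simulate an archive listing with 4 entries:
printf 'file1\nfile2\nfile3\nfile4\n' | show_progress 4
```

Note the count is approximate: `unzip` also emits header/summary lines, so the percentage can drift slightly, which is harmless for a cosmetic meter.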

# MD5 verification
echo "Running MD5 verification ... this might take a while"
checkMD5() {
    if [ "$(pv -f "$1" | md5sum | cut -d' ' -f1)" = "$2" ]; then
        echo "$1 MD5 is valid"
    else
        echo "*ERROR* $1 MD5 is NOT valid"
    fi
}
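
The helper above pipes the file through `pv` purely to show a progress bar while `md5sum` hashes it. A portable sketch of the same check, minus `pv` so it runs anywhere (the demo file and digest below are illustrative, not part of the benchmark):

```shell
#!/usr/bin/env bash
# Sketch of the checkMD5 helper above without the pv progress bar:
# compare a file's md5sum digest against an expected hex string.
checkMD5() {
    if [ "$(md5sum "$1" | cut -d' ' -f1)" = "$2" ]; then
        echo "$1 MD5 is valid"
    else
        echo "*ERROR* $1 MD5 is NOT valid"
    fi
}

# Demo against a file with a known digest ("hello\n"):
printf 'hello\n' > /tmp/mlcube_md5_demo.txt
checkMD5 /tmp/mlcube_md5_demo.txt "b1946ac92492d2347c6235b4d2611184"
# -> /tmp/mlcube_md5_demo.txt MD5 is valid
```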

echo "validating annotations_trainval2017.zip:"
checkMD5 "annotations_trainval2017.zip" "f4bbac642086de4f52a3fdda2de5fa2c"
echo "validating coco_annotations_minival.tgz:"
checkMD5 "coco_annotations_minival.tgz" "2d2b9d2283adb5e3b8d25eec88e65064"
echo "validating val2017.zip:"
checkMD5 "val2017.zip" "442b8da7639aecaf257c1dceb8ba8c80"
echo "validating train2017.zip:"
checkMD5 "train2017.zip" "cced6f7f71b7629ddf16f17bbcfab6b2"

popd
38 changes: 38 additions & 0 deletions object_detection/download_demo.sh
@@ -0,0 +1,38 @@
#!/bin/bash
: "${DATA_ROOT_DIR:=./pytorch/datasets/coco}"

while [ $# -gt 0 ]; do
    case "$1" in
    --data_dir=*)
        DATA_ROOT_DIR="${1#*=}"
        ;;
    *) ;;
    esac
    shift
done

echo "Downloading demo to folder: $DATA_ROOT_DIR"
mkdir -p "$DATA_ROOT_DIR"
pushd "$DATA_ROOT_DIR"

echo "Downloading object_detection.zip:"
curl -O https://mlcube.mlcommons-storage.org/minibenchmarks/object_detection.zip
echo "Extracting object_detection.zip:"
unzip -o -q object_detection.zip
echo "Done!"

# MD5 verification
echo "Running MD5 verification ..."
checkMD5() {
    if [ "$(pv -f "$1" | md5sum | cut -d' ' -f1)" = "$2" ]; then
        echo "$1 MD5 is valid"
    else
        echo "*ERROR* $1 MD5 is NOT valid"
    fi
}

echo "validating object_detection.zip:"
checkMD5 "object_detection.zip" "1b50202a21b0d8c3235d0a6f39b6f40c"
rm object_detection.zip

popd
2 changes: 2 additions & 0 deletions object_detection/mlcube/.gitignore
@@ -0,0 +1,2 @@
workspace/
!workspace/parameters.yaml
67 changes: 67 additions & 0 deletions object_detection/mlcube/README.md
@@ -0,0 +1,67 @@
# Benchmark execution with MLCube

### Project setup
```bash
# Create Python environment and install MLCube Docker runner
virtualenv -p python3 ./env && source ./env/bin/activate && pip install mlcube-docker

# Fetch the Object Detection workload
git clone https://github.com/mlcommons/training && cd ./training
git fetch origin pull/501/head:feature/object_detection && git checkout feature/object_detection
cd ./object_detection/mlcube
```

### Dataset


The COCO dataset will be downloaded and extracted. Sizes of the dataset in each step:

| Dataset Step | MLCube Task | Format | Size |
|--------------------------------|-------------------|----------------|----------|
| Download (Compressed dataset) | download_data | Tar/Zip files | ~20.5 GB |
| Extract (Uncompressed dataset) | download_data | Jpg/Json files | ~21.2 GB |
| Total | (After all tasks) | All | ~41.7 GB |

### Tasks execution

Parameters are defined in these files:

* MLCube user parameters: mlcube/workspace/parameters.yaml
* Project user parameters: pytorch/configs/e2e_mask_rcnn_R_50_FPN_1x.yaml
* Project default parameters: pytorch/maskrcnn_benchmark/config/defaults.py

```bash
# Download COCO dataset. Default path = ./workspace/data
mlcube run --task=download_data -Pdocker.build_strategy=always

# Run benchmark. Default paths = ./workspace/data
mlcube run --task=train -Pdocker.build_strategy=always
```

### Demo execution

These tasks use a small demo dataset (~39 MB) to run a shortened training workload for a quick demo (~12 min):

```bash
# Download subsampled dataset. Default path = ./workspace/demo
mlcube run --task=download_demo -Pdocker.build_strategy=always

# Run benchmark. Default paths = ./workspace/demo and ./workspace/demo_output
mlcube run --task=demo -Pdocker.build_strategy=always
```

It's also possible to execute both tasks in a single instruction:

```bash
mlcube run --task=download_demo,demo -Pdocker.build_strategy=always
```

### Additional options

Parameters defined in **mlcube/mlcube.yaml** can be overridden on the command line, e.g. `--param=value`.

We are targeting pull-type installation, so MLCube images should be available on Docker Hub. If not, try this:

```bash
mlcube run ... -Pdocker.build_strategy=always
```
44 changes: 44 additions & 0 deletions object_detection/mlcube/mlcube.yaml
@@ -0,0 +1,44 @@
name: Object Detection
description: MLCommons Object Detection Training Reference Benchmark
authors:
- {name: "MLCommons Best Practices Working Group"}

platform:
# Edit this according to your system specs
accelerator_count: 1

docker:
# Image name.
image: mlcommons/object_detection:0.0.1
# Docker build context relative to $MLCUBE_ROOT. Default is `build`.
build_context: "../"
# Docker file name within docker build context, default is `Dockerfile`.
build_file: "Dockerfile.mlcube"
gpu_args: "--gpus=all --shm-size=1g"

tasks:
# Download COCO dataset
download_data:
entrypoint: ./download_dataset.sh -a
parameters:
# Directory for the downloaded and extracted dataset. Total size is ~41.7 GB.
outputs: {data_dir: data/}
train:
# Train Object Detection model
entrypoint: ./run_and_time.sh -a
parameters:
# data_dir: see download_data's data_dir; training uses the processed subfolder.
# parameters_file: Yaml file with training parameters.
inputs: {data_dir: data/, parameters_file: {type: file, default: parameters.yaml}}
# Model output folder
outputs: {output_dir: output/}
download_demo:
entrypoint: ./download_demo.sh -a
parameters:
# Directory for the uncompressed demo dataset. Total size is ~39 MB.
outputs: {data_dir: demo/}
demo:
entrypoint: ./run_demo.sh -a
parameters:
inputs: {data_dir: demo/, parameters_file: {type: file, default: parameters.yaml}}
outputs: {output_dir: demo_output/}