# Copyright (c) 2018-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.09-py3
################################################################################
# DALI box iou builder image
################################################################################
FROM ${FROM_IMAGE_NAME} AS dali-box-iou-builder
WORKDIR /workspace/csrc
COPY csrc/dali_box_iou ./dali_box_iou
# Build CUDA box iou (written as a DALI extension)
RUN cd dali_box_iou && \
mkdir build && \
cd build && \
cmake .. && \
make
################################################################################
################################################################################
# DALI proposal matcher builder image
################################################################################
FROM ${FROM_IMAGE_NAME} AS dali-proposal-matcher-builder
WORKDIR /workspace/csrc
COPY csrc/dali_proposal_matcher ./dali_proposal_matcher
# Build CUDA proposal matcher (written as a DALI extension)
RUN cd dali_proposal_matcher && \
mkdir build && \
cd build && \
cmake .. && \
make
################################################################################
################################################################################
# RetinaNet
################################################################################
FROM ${FROM_IMAGE_NAME}
# Build args
ARG MAX_JOBS=4
ARG TORCH_CUDNN_V8_API_ENABLED=1
# Set env vars
ENV MAX_JOBS=${MAX_JOBS}
ENV TORCH_CUDNN_V8_API_ENABLED=${TORCH_CUDNN_V8_API_ENABLED}
# Install dependencies for system configuration logger
RUN apt-get update && \
apt-get install -y --no-install-recommends infiniband-diags pciutils && \
rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /workspace/ssd
# Copy code
COPY . .
# Remove the container's pycocotools in favor of the nvidia optimized version (installed from requirements.txt)
RUN pip uninstall -y pycocotools
# Install python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy DALI box iou and proposal matcher
COPY --from=dali-box-iou-builder /workspace/csrc/dali_box_iou/build/lib_box_iou.so /usr/local/lib/lib_box_iou.so
COPY --from=dali-proposal-matcher-builder /workspace/csrc/dali_proposal_matcher/build/lib_proposal_matcher.so /usr/local/lib/lib_proposal_matcher.so
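# Note: at runtime these shared objects are presumably loaded as custom DALI plugins,
# e.g. via nvidia.dali.plugin_manager.load_library("/usr/local/lib/lib_box_iou.so")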
################################################################################
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
RetinaNet in Torchvision
This folder includes source code from:
* https://github.com/pytorch/vision, licensed under the BSD 3-Clause License.
# RetinaNet
## Model Description
RetinaNet is an object detection model built on a Feature Pyramid Network (FPN) and the Focal Loss function, proposed by the Facebook AI Research team in 2017. It was designed to close the accuracy gap between one-stage detectors and two-stage detectors (such as Faster R-CNN), which is largely caused by the extreme foreground-background class imbalance encountered during training.
Compared with earlier detection algorithms, RetinaNet offers both high detection accuracy and fast inference. It performs particularly well on small objects and has been used successfully in a wide range of vision tasks, such as generic object detection and pedestrian detection.
## Model Architecture
The RetinaNet network consists of two main parts: a feature-extraction backbone and detection heads.
1. Feature extraction: RetinaNet uses a ResNet backbone. Feature maps taken from different depths of the ResNet are combined into a feature pyramid whose levels represent objects at different scales and are used to detect targets of different sizes.
2. Detection heads: each pyramid level feeds two parallel branches, one predicting class confidences and one predicting bounding-box offsets. Each branch is a small stack of convolutional layers producing a fixed number of predictions per anchor. The classification branch is trained with the Focal Loss to handle the extreme imbalance between positive and negative samples, while the regression branch is trained with a Smooth L1 loss on the box coordinates.
During inference, RetinaNet produces detections at every pyramid level and then applies non-maximum suppression (NMS) to remove overlapping boxes, keeping only the highest-confidence detections.
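For reference, a minimal sigmoid focal-loss sketch is shown below with the usual `alpha=0.25`, `gamma=2.0` defaults; it is for illustration only, not this repository's exact implementation:
```python
# Minimal focal-loss sketch (illustrative; not the benchmark's exact code).
import torch
import torch.nn.functional as F

def focal_loss(logits: torch.Tensor, targets: torch.Tensor,
               alpha: float = 0.25, gamma: float = 2.0) -> torch.Tensor:
    # logits and targets have the same shape; targets are 0/1 anchor labels.
    p = torch.sigmoid(logits)
    ce = F.binary_cross_entropy_with_logits(logits, targets, reduction="none")
    p_t = p * targets + (1 - p) * (1 - targets)              # prob of the true class
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    loss = alpha_t * (1 - p_t) ** gamma * ce                 # down-weights easy examples
    # RetinaNet normalizes the summed loss by the number of foreground anchors.
    return loss.sum()
```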
## Target Accuracy
34.0% mAP
## MLPerf Reference Code Version
Version: v2.1
Original code location: https://github.com/mlcommons/training_results_v2.1/tree/master/NVIDIA/benchmarks/bert/implementations/pytorch
## Dataset
Training uses the Open Images dataset, a large-scale image dataset released by Google in 2016. It contains more than 9 million annotated images, each labeled with bounding boxes and class labels for multiple objects, and supports a variety of computer-vision tasks such as object detection, object recognition, and scene understanding.
### Download dataset
The data comes from [OpenImages-v6](https://storage.googleapis.com/openimages/web/index.html). Download it as follows; the dataset is about 352 GB:
```bash
cd ./scripts
pip3 install fiftyone
./download_openimages_mlperf.sh -d <DATAPATH>
```
### Download the pretrained backbone
The network uses a ResNeXt50_32x4d backbone pretrained on ImageNet, which can be fetched with:
```bash
./download_backbone.sh
```
## Training
### Environment setup
A training docker image can be pulled from the [光源](https://www.sourcefind.cn/#/service-details) platform:
* Training image:
Install the python dependencies:
```bash
pip3 install -r requirement.txt
python3 setup.py install
# Note: this model requires a specific apex version; the corresponding whl package is provided in this directory
pip3 install apex-0.1-cp37-cp37m-linux_x86_64.whl
cd ./cocoapi-0.7.0/PythonAPI; python3 setup.py install
```
### Training
Training command (a single node with 8 accelerator cards is used as an example):
```bash
nohup bash sbatch.sh >& ssd_bs16_epoch6.log &
# Training output is written to ssd_bs16_epoch6.log
# Note: change the DATASET_DIR parameter in dcu.sh to point at your input data location
```
## Performance and Accuracy
Tests use the input data described above on Z100L accelerator cards. Results for a single node with 8 cards:
| Test platform | Accuracy | Throughput (samples/s) |
| :-----------: | :------: | :--------------------: |
| Z100L | 34% mAP | 7.9726 |
## Previous Versions
* https://developer.hpccube.com/codes/modelzoo/mlperf_retinanet
## References
* https://mlcommons.org/en/
* https://github.com/mlcommons
<h1 align="center">Single Shot Detector (SSD)</h1>
- [Summary](#summary)
- [Running the benchmark](#running-the-benchmark)
- [Requirements](#requirements)
- [Building the docker image](#building-the-docker-image)
- [Download dataset](#download-dataset)
- [Download the pretrained backbone](#download-the-pretrained-backbone)
- [Training on NVIDIA DGX-A100 (single node) with SLURM](#training-on-nvidia-dgx-a100-single-node-with-slurm)
- [Training on NVIDIA DGX-A100 (multi node) with SLURM](#training-on-nvidia-dgx-a100-multi-node-with-slurm)
- [Training on NVIDIA DGX-A100 (single node) with docker](#training-on-nvidia-dgx-a100-single-node-with-docker)
- [Hyperparameter settings](#hyperparameter-settings)
- [Dataset/Environment](#datasetenvironment)
- [Publication/Attribution](#publicationattribution)
- [The MLPerf Subset](#the-mlperf-subset)
- [Model](#model)
- [Backbone](#backbone)
- [Weight and bias initialization](#weight-and-bias-initialization)
- [Input augmentations](#input-augmentations)
- [Publication/Attribution](#publicationattribution-1)
- [Quality](#quality)
- [Quality metric](#quality-metric)
- [Quality target](#quality-target)
- [Evaluation frequency](#evaluation-frequency)
- [Evaluation thoroughness](#evaluation-thoroughness)
# Summary
Single Shot MultiBox Detector (SSD) is an object detection network. For an
input image, the network outputs a set of bounding boxes around the detected
objects, along with their classes. For example:
![](https://upload.wikimedia.org/wikipedia/commons/3/38/Detected-with-YOLO--Schreibtisch-mit-Objekten.jpg)
SSD is a one-stage detector: both localization and classification are done in a
single pass of the network. This allows for faster inference than region
proposal network (RPN) based networks, making it better suited to real-time
applications such as automotive systems and to low-power devices like mobile phones. This is
why it is sometimes referred to as a "single shot" detector.
# Running the benchmark
The benchmark is intended to run on NVIDIA GPUs. It is tested on A100, but other GPUs should work too (some optimization flags might not be available on all GPU generations).
## Requirements
The recommended way to run the benchmark is within docker containers. You need to set up your
machine with:
1. [PyTorch 22.09-py3 NGC container](https://ngc.nvidia.com/registry/nvidia-pytorch)
2. [Docker](https://docs.docker.com/engine/install/)
3. [NVIDIA container runtime](https://github.com/NVIDIA/nvidia-docker)
4. Slurm with [Pyxis](https://github.com/NVIDIA/pyxis) and [Enroot](https://github.com/NVIDIA/enroot) (multi-node)
## Building the docker image
Once the above requirements have been met, you can build the benchmark docker image with:
```bash
docker build --pull -t <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch .
docker push <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch
```
## Download dataset
The benchmark uses a subset of [OpenImages-v6](https://storage.googleapis.com/openimages/web/index.html).
To download the subset:
```bash
pip install fiftyone
cd ./public-scripts
./download_openimages_mlperf.sh -d <DATAPATH>
```
The script will download the benchmark subset with metadata and labels, then
convert the labels to [COCO](https://cocodataset.org/#home) format. The
downloaded dataset size is 352GB and the expected folder structure after
running the script is:
```
<DATAPATH>
└───info.json
└───train
│ └─── data
│ │ 000002b66c9c498e.jpg
│ │ 000002b97e5471a0.jpg
│ │ ...
│ └─── metadata
│ │ classes.csv
│ │ hierarchy.json
│ │ image_ids.csv
│ └─── labels
│ detections.csv
│ openimages-mlperf.json
└───validation
└─── data
│ 0001eeaf4aed83f9.jpg
│ 0004886b7d043cfd.jpg
│ ...
└─── metadata
│ classes.csv
│ hierarchy.json
│ image_ids.csv
└─── labels
detections.csv
openimages-mlperf.json
```
Read more about the mlperf subset [here](#the-mlperf-subset).
## Download the pretrained backbone
The benchmark uses a ResNeXt50_32x4d backbone pretrained on ImageNet. The
weights are downloaded from PyTorch hub.
By default, the code will automatically download the weights to
`$TORCH_HOME/hub` (default is `~/.cache/torch/hub`) and save them for later use.
Alternatively, you can manually download the weights with:
```bash
bash ./public-scripts/download_backbone.sh
```
Then use the downloaded file with `--pretrained <PATH TO WEIGHTS>`.
## Training on NVIDIA DGX-A100 (single node) with SLURM
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-A100
single node reference are in the `config_DGXA100_001x08x032.sh` script.
Steps required to launch single node training on NVIDIA DGX-A100:
```bash
source config_DGXA100_001x08x032.sh
CONT="<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch" DATADIR="<path/to/dir/containing/openimages/dir>" LOGDIR="<path/to/output/dir>" BACKBONE_DIR="<$(pwd) or path/to/pretrained/ckpt>" sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
## Training on NVIDIA DGX-A100 (multi node) with SLURM
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-A100
multi node reference are in the `config_DGXA100_*.sh` scripts.
Steps required to launch multi node training on NVIDIA DGX-A100:
```bash
source <MULT_NODE_CONFIG>
CONT="<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch" DATADIR="<path/to/dir/containing/openimages/dir>" LOGDIR="<path/to/output/dir>" BACKBONE_DIR="<$(pwd) or path/to/pretrained/ckpt>" sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
## Training on NVIDIA DGX-A100 (single node) with docker
When generating results for the official v2.0 submission with one node, the
benchmark was launched on a cluster managed by a SLURM scheduler. The
instructions in [Training on NVIDIA DGX-A100 (single node) with SLURM](#training-on-nvidia-dgx-a100-single-node-with-slurm) explain how that is done.
However, to make it easier to run this benchmark in a wider set of machine
environments, we provide here an alternate set of launch instructions
that use nvidia-docker. Note that performance or functionality may
vary from the tested SLURM instructions.
Launch configuration and system-specific hyperparameters for the NVIDIA DGX-A100
single node reference are in the `config_DGXA100_001x08x032.sh` script.
To launch single node training on NVIDIA DGX-A100 with docker, start
training with:
```bash
source config_DGXA100_001x08x032.sh
CONT="<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch" DATADIR="<path/to/dir/containing/openimages/dir>" LOGDIR="<path/to/output/dir>" BACKBONE_DIR="<$(pwd) or path/to/pretrained/ckpt>" ./run_with_docker.sh
```
Alternatively, you can launch an interactive docker session:
```bash
docker run --rm -it \
--gpus=all \
--ipc=host \
-v <DATADIR>:/datasets/open-images-v6-mlperf \
<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch bash
```
Then launch the training command manually with:
```bash
source config_DGXA100_001x08x032.sh
torchrun --standalone --nproc_per_node=${DGXNGPU} --no_python ./run_and_time.sh
```
You can read more about torchrun [here](https://pytorch.org/docs/stable/elastic/run.html).
## Hyperparameter settings
Hyperparameters are recorded in the `config_*.sh` files for each configuration
and in `run_and_time.sh`.
# Dataset/Environment
## Publication/Attribution
[Google Open Images Dataset V6](https://storage.googleapis.com/openimages/web/index.html)
## The MLPerf Subset
The MLPerf subset includes only 264 classes of the total 601 available in the
full dataset:
| Dataset | # classes | # train images | # validation images | Size |
|-------------------|-----------|----------------|---------------------|-------|
| OpenImages Full | 601 | 1,743,042 | 41,620 | 534GB |
| OpenImages MLPerf | 264 | 1,170,301 | 24,781 | 352GB |
These are the lowest level classes (no child classes) in the dataset
[semantic hierarchy tree](https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html)
with at least 1000 samples.
The list of used classes can be viewed
[here](https://github.com/mlcommons/training/blob/master/single_stage_detector/scripts/download_openimages_mlperf.sh).
# Model
This network takes an 800x800 input image from
[OpenImages-v6](https://storage.googleapis.com/openimages/web/index.html)
with 264 categories, and computes a set of bounding boxes and category predictions.
Other detector models use multiple stages, first proposing regions of interest
that might contain objects, then iterating over the regions of interest to try
to categorize each object. SSD does both of these in one stage, leading to
lower-latency and higher-performance inference.
## Backbone
The backbone is based on ResNeXt50_32x4d as described in Section 3 of
[this paper](https://arxiv.org/pdf/1611.05431.pdf). Using the
same notation as Table 1 of the paper the backbone looks like:
| stage | # stacked blocks | shape of a residual block |
| :--------: | :--------------: | :------------------------: |
| conv1 | | 7x7, 64, stride 2 |
| | | 3x3 max pool, stride 2 |
| conv2_x | 3 | 1x1, 128 |
| | | 3x3, 128, groups=32 |
| | | 1x1, 256 |
| conv3_x | 4 | 1x1, 256 |
| | | 3x3, 256, groups=32 |
| | | 1x1, 512 |
| conv4_x | 6 | 1x1, 512 |
| | | 3x3, 512, groups=32 |
| | | 1x1, 1024 |
| conv5_x | 3 | 1x1, 1024 |
| | | 3x3, 1024, groups=32 |
| | | 1x1, 2048 |
Input images are 800x800 RGB. They are fed to a 7x7 stride 2 convolution with
64 output channels, then through a 3x3 stride 2 max-pool layer.
The rest of the backbone is built from "building blocks": each block stacks a
1x1 convolution, a 3x3 grouped convolution, and a 1x1 convolution, with a
"short-cut" residual connection around the block. All convolutions in the backbone are followed by batch-norm
and ReLU.
The backbone is initialized with the pretrained weights from the corresponding
layers of the ResNeXt50_32x4d implementation from the [Torchvision model
zoo](https://download.pytorch.org/models/-7cdf4587.pth), described in
detail [here](https://pytorch.org/hub/pytorch_vision_resnext/). It is
a ResNeXt50_32x4d network trained on 224x224 ImageNet to achieve a Top-1
error rate of 22.38 and a Top-5 error rate of 6.30.
Of the five convolution stages, only the last three are trained.
The weights of the first two stages are frozen
([code](https://github.com/mlcommons/training/blob/master/single_stage_detector/ssd/model/backbone_utils.py#L94-L101)).
In addition, all batch norm layers in the backbone are frozen
([code](https://github.com/mlcommons/training/blob/master/single_stage_detector/ssd/model/backbone_utils.py#L52)).
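As a hedged sketch (not necessarily the code used by this benchmark), a backbone with these properties can be built with torchvision's `resnet_fpn_backbone` helper, which uses `FrozenBatchNorm2d` by default; its signature differs between torchvision versions, so treat the arguments below as illustrative:
```python
# Illustrative: ResNeXt50_32x4d + FPN with frozen BN and only the last
# three stages trainable (conv1 and conv2_x stay frozen).
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

backbone = resnet_fpn_backbone(
    "resnext50_32x4d",    # ImageNet-pretrained ResNeXt50_32x4d
    pretrained=True,
    trainable_layers=3,   # train conv3_x, conv4_x, conv5_x
)
```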
## Weight and bias initialization
1. The ResNeXt50_32x4d backbone is initialized with the pretrained weights
from [Torchvision model zoo](https://download.pytorch.org/models/-7cdf4587.pth).
2. The classification head weights are initialized using normal distribution
with `mean=0` and `std=0.01`. The biases are initialized with zeros, except
for the classification convolution which is initialized with
`constant=-4.59511985013459`
([code](https://github.com/mlcommons/training/blob/master/single_stage_detector/ssd/model/retinanet.py#L85-L90)).
3. The regression head weights are initialized using normal distribution
with `mean=0` and `std=0.01`. The biases are initialized with zeros
([code](https://github.com/mlcommons/training/blob/master/single_stage_detector/ssd/model/retinanet.py#L171-L177)).
4. The FPN network weights are initialized with uniform Kaiming (also known as
He initialization) using `negative slope=1`. The biases are initialized
with zeros
([code](https://github.com/mlcommons/training/blob/master/single_stage_detector/ssd/model/feature_pyramid_network.py#L90-L91)).
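The classification-bias constant `-4.59511985013459` is simply `-log((1 - pi) / pi)` for a prior foreground probability `pi = 0.01`. The sketch below (with hypothetical helper and argument names) illustrates initializations 2 and 3 above:
```python
# Illustrative only: normal(0, 0.01) weights and zero biases for the head
# convs, then a prior-probability bias on the final classification conv.
import math
import torch.nn as nn

def init_retinanet_heads(cls_head: nn.Module, reg_head: nn.Module,
                         cls_logits: nn.Conv2d, prior_prob: float = 0.01) -> None:
    for module in list(cls_head.modules()) + list(reg_head.modules()):
        if isinstance(module, nn.Conv2d):
            nn.init.normal_(module.weight, mean=0.0, std=0.01)
            nn.init.zeros_(module.bias)
    # -log((1 - 0.01) / 0.01) == -4.59511985013459
    nn.init.constant_(cls_logits.bias, -math.log((1.0 - prior_prob) / prior_prob))
```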
## Input augmentations
The input images are assumed to be sRGB with values in range 0.0 through 1.0.
The input pipeline does the following:
1. Random horizontal flip of both the image and its ground-truth bounding boxes
with a probability of 50%.
2. Normalize the colors to a mean of (0.485, 0.456, 0.406) and standard
deviation (0.229, 0.224, 0.225).
3. Resize image to 800x800 using bilinear interpolation.
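A rough torchvision-only approximation of this pipeline is sketched below for illustration; the benchmark's own transforms also flip the ground-truth boxes together with the image, which this sketch does not do:
```python
# Illustrative image-only version of the input pipeline above.
import torchvision.transforms as T

transform = T.Compose([
    T.RandomHorizontalFlip(p=0.5),   # step 1 (image only in this sketch)
    T.ToTensor(),                    # sRGB values scaled to [0.0, 1.0]
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),  # step 2
    T.Resize((800, 800)),            # step 3, bilinear by default
])
```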
## Publication/Attribution
Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
Cheng-Yang Fu, Alexander C. Berg. [SSD: Single Shot MultiBox
Detector](https://arxiv.org/abs/1512.02325). In the _Proceedings of the
European Conference on Computer Vision_, (ECCV-14):21-37, 2016.
Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. [Deep Residual Learning for
Image Recognition](https://arxiv.org/abs/1512.03385). In the _Proceedings of
the Conference on Computer Vision and Pattern Recognition_, (CVPR):770-778, 2016.
Jonathan Huang, Vivek Rathod, Chen Sun, Menglong Zhu, Anoop Korattikara,
Alireza Fathi, Ian Fischer, Zbigniew Wojna, Yang Song, Sergio Guadarrama, Kevin
Murphy. [Speed/accuracy trade-offs for modern convolutional object
detectors](https://arxiv.org/abs/1611.10012). In the _Proceedings of the
Conference on Computer Vision and Pattern Recognition_, (CVPR):3296-3305, 2017.
Krasin I., Duerig T., Alldrin N., Ferrari V., Abu-El-Haija S., Kuznetsova A.,
Rom H., Uijlings J., Popov S., Kamali S., Malloci M., Pont-Tuset J., Veit A.,
Belongie S., Gomes V., Gupta A., Sun C., Chechik G., Cai D., Feng Z.,
Narayanan D., Murphy K.
[OpenImages](https://storage.googleapis.com/openimages/web/index.html): A public
dataset for large-scale multi-label and multi-class image classification, 2017.
Saining Xie, Ross Girshick, Piotr Dollár, Zhuowen Tu, Kaiming He.
[Aggregated Residual Transformations for Deep Neural Networks](https://arxiv.org/abs/1611.05431)
Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He, Piotr Dollár.
[Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002)
Torchvision pretrained [ResNeXt50_32x4d](https://pytorch.org/vision/0.12/models.html#id25) on ImageNet
Torchvision [RetinaNet](https://pytorch.org/vision/0.12/models.html#id65)
# Quality
## Quality metric
The metric is COCO box mAP (averaged over IoU thresholds 0.5:0.95), computed over the
OpenImages-MLPerf validation subset.
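For illustration, the same metric can be computed with pycocotools; the file paths below are placeholders, and detections are assumed to be dumped in COCO results format:
```python
# Sketch of computing COCO box mAP (IoU 0.50:0.95) with pycocotools.
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO("validation/labels/openimages-mlperf.json")   # ground truth
coco_dt = coco_gt.loadRes("predictions.json")                # model detections

evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
evaluator.evaluate()
evaluator.accumulate()
evaluator.summarize()
map_50_95 = evaluator.stats[0]   # mAP averaged over IoU 0.50:0.95
```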
## Quality target
mAP of 0.34
## Evaluation frequency
Every epoch, starting with the first one.
## Evaluation thoroughness
All the images in the OpenImages-MLPerf validation subset.
## Steps to launch training
### NVIDIA DGX A100 (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
multi node submission are in the `config_DGXA100_160x08x001.sh` script.
Steps required to launch multi node training on NVIDIA DGX A100
1. Build the docker container and push to a docker registry
```
docker build --pull -t <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch .
docker push <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch
```
2. Launch the training
```
source config_DGXA100_160x08x001.sh
CONT="<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
## Steps to launch training
### NVIDIA DGX A100 (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
multi node submission are in the `config_DGXA100_256x08x001.sh` script.
Steps required to launch multi node training on NVIDIA DGX A100
1. Build the docker container and push to a docker registry
```
docker build --pull -t <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch .
docker push <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch
```
2. Launch the training
```
source config_DGXA100_256x08x001.sh
CONT="<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
## Steps to launch training
### NVIDIA DGX A100 (multi node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
multi node submission are in the `config_DGXA100_008x08x004.sh` script.
Steps required to launch multi node training on NVIDIA DGX A100
1. Build the docker container and push to a docker registry
```
docker build --pull -t <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch .
docker push <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch
```
2. Launch the training
```
source config_DGXA100_008x08x004.sh
CONT="<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
## Steps to launch training
### NVIDIA DGX A100 (single node)
Launch configuration and system-specific hyperparameters for the NVIDIA DGX A100
single node submission are in the `config_DGXA100_001x08x032.sh` script.
Steps required to launch single node training on NVIDIA DGX A100
1. Build the docker container and push to a docker registry
```
docker build --pull -t <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch .
docker push <DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch
```
2. Launch the training
```
source config_DGXA100_001x08x032.sh
CONT="<DOCKER_REGISTRY>/mlperf-nvidia:single_stage_detector-pytorch" DATADIR=<path/to/data/dir> LOGDIR=<path/to/output/dir> sbatch -N $DGXNNODES -t $WALLTIME run.sub
```
# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
import multiprocessing as mp # TODO(ahmadki): pytorch mp ?
from concurrent.futures import ProcessPoolExecutor # TODO(ahmadki): pytorch futures ?
# from loky import get_reusable_executor
# from mpi4py.futures import MPIPoolExecutor
# A general class for async executing of functions
class AsyncExecutor(object):
def __init__(self, max_workers=1, mp_context="spawn"):
self.max_workers = max_workers
self.tasks = OrderedDict() # a dict of {tags: futures}
self.mp_context = mp.get_context(mp_context)
# mp.set_start_method(mp_context)
self.pool = ProcessPoolExecutor(max_workers=self.max_workers, mp_context=self.mp_context)
# self.pool = MPIPoolExecutor(max_workers, main=False)
# self.pool = get_reusable_executor(max_workers=max_workers, timeout=None)
def __del__(self):
self.cancel(tag=None)
self.pool.shutdown(wait=False)
####################
# Executor functions
####################
# submit given function and its arguments for async execution
def submit(self, tag, fn, *args, **kwargs):
self.tasks[tag] = self.pool.submit(fn, *args, **kwargs)
    def shutdown(self, wait=True):
        self.pool.shutdown(wait=wait)
#############################
# functions on future objects
#############################
def cancel(self, tag=None):
if tag:
return self.tasks[tag].cancel()
else:
return {tag: self.tasks[tag].cancel() for tag in self.tasks.keys()}
def cancelled(self, tag=None):
if tag:
return self.tasks[tag].cancelled()
else:
return {tag: self.tasks[tag].cancelled() for tag in self.tasks.keys()}
def running(self, tag=None):
if tag:
return self.tasks[tag].running()
else:
return {tag: self.tasks[tag].running() for tag in self.tasks.keys()}
def done(self, tag=None):
if tag:
return self.tasks[tag].done()
else:
return {tag: self.tasks[tag].done() for tag in self.tasks.keys()}
def result(self, tag=None, timeout=None):
if tag:
return self.tasks[tag].result(timeout=timeout)
else:
return {tag: self.tasks[tag].result(timeout=timeout) for tag in self.tasks.keys()}
def exception(self, tag=None, timeout=None):
if tag:
return self.tasks[tag].exception(timeout=timeout)
else:
return {tag: self.tasks[tag].exception(timeout=timeout) for tag in self.tasks.keys()}
def add_done_callback(self, tag=None, fn=None):
if tag:
return self.tasks[tag].add_done_callback(fn=fn)
else:
return {tag: self.tasks[tag].add_done_callback(fn=fn) for tag in self.tasks.keys()}
######################
# Management functions
######################
# return result of a task and deletes it if successful
# if blocking is true, wait timeout for the task to complete
# if timeout is None, wait indefinitely
def dequeue_if_done(self, tag, blocking=False, timeout=None):
if self.done(tag=tag):
result = self.result(tag=tag)
del self.tasks[tag]
elif blocking:
result = self.result(tag=tag, timeout=timeout)
del self.tasks[tag]
else:
result = None
return result
# return the result of last (LIFO) task or first task (FIFO) and delete it if successful
# if blocking is true, wait timeout for the task to complete
# if timeout is None, wait indefinitely
def pop_if_done(self, last=True, blocking=False, timeout=None):
if len(self.tasks)==0:
return None
        tag = next(reversed(self.tasks.keys())) if last else next(iter(self.tasks.keys()))
return {tag: self.dequeue_if_done(tag=tag, blocking=blocking, timeout=timeout)}
# pop the result of all done tasks
def pop_all_done(self):
if len(self.tasks)==0:
return None
done_tasks = {}
tags = list(self.tasks.keys()) # make a copy of tags because we might mutate self.tasks
for tag in tags:
result = self.dequeue_if_done(tag, blocking=False)
if result:
done_tasks[tag] = result
return done_tasks
# return list of tags
def tags(self):
return self.tasks.keys()
async_executor = AsyncExecutor(max_workers=1)
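# Illustrative usage of the module-level executor above (hypothetical tag and
# worker function; the callable must be picklable for the "spawn" context):
#
#   def slow_square(x):
#       return x * x
#
#   async_executor.submit("square-3", slow_square, 3)
#   ...  # do other work while the task runs in a worker process
#   print(async_executor.dequeue_if_done("square-3", blocking=True))  # -> 9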