## Contributing
The best way to contribute to MLCommons is to get involved with one of our many project communities. You can find more information about getting involved with MLCommons [here](https://mlcommons.org/en/get-involved/#getting-started).
Generally we encourage people to become an MLCommons member if they wish to contribute to MLCommons projects, but outside pull requests are very welcome too.
To get started contributing code, you or your organization needs to sign the MLCommons CLA found at the [MLC policies page](https://mlcommons.org/en/policies/). Once you or your organization has signed the corporate CLA, please fill out this [CLA sign-up form](https://forms.gle/Ew1KkBVpyeJDuRw67) to get your specific GitHub handle authorized so that you can start contributing code under the proper license.
MLCommons project work is tracked with issue trackers and pull requests. Modify the project in your own fork and issue a pull request once you want other developers to take a look at what you have done and discuss the proposed changes; a typical fork-based workflow is sketched below. Ensure that the cla-bot and other checks pass for your pull requests.
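A minimal sketch of that fork-based workflow, assuming the contribution targets the mlcommons/inference repository (the handle, branch, and commit message are placeholders):
```
# Fork the repository on GitHub first, then:
git clone https://github.com/<your-github-handle>/inference.git
cd inference
git checkout -b my-change          # work on a topic branch
# ... edit files ...
git commit -am "Describe the change"
git push origin my-change
# Finally, open a pull request against mlcommons/inference on GitHub.
```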
vars = {
# Pull in chromium build files and tools for multi-platform build support.
'chromium_git': 'https://chromium.googlesource.com/chromium/src',
'mlpth_root': 'src',
}
deps = {
'{mlpth_root}/build': {
'url': '{chromium_git}/build@e3ed5e43c305b353b49e08ac69e7f4d1c2d88ad2'
},
'{mlpth_root}/buildtools': {
'url': '{chromium_git}/buildtools@106e9fce3799633f42b45ca8bbe9e84e1e235603'
},
'{mlpth_root}/tools/clang': {
'url': '{chromium_git}/tools/clang.git@3114fbc11f9644c54dd0a4cdbfa867bac50ff983',
},
'{mlpth_root}/third_party/pybind': {
'url': 'https://github.com/pybind/pybind11.git@v2.2',
},
}
recursedeps = [
'{mlpth_root}/buildtools',
]
hooks = [
# Pull clang-format binaries using checked-in hashes.
{
'name': 'clang_format_win',
'pattern': '.',
'condition': 'host_os == "win"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=win32',
'--no_auth',
'--bucket', 'chromium-clang-format',
'-s', '{mlpth_root}/buildtools/win/clang-format.exe.sha1',
],
},
{
'name': 'clang_format_mac',
'pattern': '.',
'condition': 'host_os == "mac"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=darwin',
'--no_auth',
'--bucket', 'chromium-clang-format',
'-s', '{mlpth_root}/buildtools/mac/clang-format.sha1',
],
},
{
'name': 'clang_format_linux',
'pattern': '.',
'condition': 'host_os == "linux"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=linux*',
'--no_auth',
'--bucket', 'chromium-clang-format',
'-s', '{mlpth_root}/buildtools/linux64/clang-format.sha1',
],
},
# Pull GN using checked-in hashes.
{
'name': 'gn_win',
'pattern': '.',
'condition': 'host_os == "win"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=win32',
'--no_auth',
'--bucket', 'chromium-gn',
'-s', '{mlpth_root}/buildtools/win/gn.exe.sha1',
],
},
{
'name': 'gn_mac',
'pattern': '.',
'condition': 'host_os == "mac"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=darwin',
'--no_auth',
'--bucket', 'chromium-gn',
'-s', '{mlpth_root}/buildtools/mac/gn.sha1',
],
},
{
'name': 'gn_linux',
'pattern': '.',
'condition': 'host_os == "linux"',
'action': [ 'download_from_google_storage',
'--no_resume',
'--platform=linux*',
'--no_auth',
'--bucket', 'chromium-gn',
'-s', '{mlpth_root}/buildtools/linux64/gn.sha1',
],
},
# Pull sysroots.
{
'name': 'sysroot_arm',
'pattern': '.',
'condition': '(checkout_linux and checkout_arm)',
'action': ['python', '{mlpth_root}/build/linux/sysroot_scripts/install-sysroot.py',
'--arch=arm'],
},
{
'name': 'sysroot_arm64',
'pattern': '.',
'condition': '(checkout_linux and checkout_arm64)',
'action': ['python', '{mlpth_root}/build/linux/sysroot_scripts/install-sysroot.py',
'--arch=arm64'],
},
{
'name': 'sysroot_x86',
'pattern': '.',
'condition': '(checkout_linux and (checkout_x86 or checkout_x64))',
'action': ['python', '{mlpth_root}/build/linux/sysroot_scripts/install-sysroot.py',
'--arch=x86'],
},
{
'name': 'sysroot_mips',
'pattern': '.',
'condition': '(checkout_linux and checkout_mips)',
'action': ['python', '{mlpth_root}/build/linux/sysroot_scripts/install-sysroot.py',
'--arch=mips'],
},
{
'name': 'sysroot_x64',
'pattern': '.',
'condition': 'checkout_linux and checkout_x64',
'action': ['python', '{mlpth_root}/build/linux/sysroot_scripts/install-sysroot.py',
'--arch=x64'],
},
{
# Update the Windows toolchain if necessary.
'name': 'win_toolchain',
'pattern': '.',
'condition': 'checkout_win',
'action': ['python', '{mlpth_root}/build/vs_toolchain.py', 'update'],
},
{
'name': 'fuchsia_sdk',
'pattern': '.',
'condition': 'checkout_fuchsia',
'action': [
'python',
'{mlpth_root}/build/fuchsia/update_sdk.py',
],
},
{
# Note: On Win, this should run after win_toolchain, as it may use it.
'name': 'clang',
'pattern': '.',
# clang not supported on aix
'condition': 'host_os != "aix"',
'action': ['python', '{mlpth_root}/tools/clang/scripts/update.py'],
},
]
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
### Retinanet-pytorch inference
##### 1. Modify the configuration
Configure the following paths (adjust them to your actual setup):
Dataset: the open-images-v6-mlperf validation set
e.g. /public/opendas/DL_DATA/open-images-v6-mlperf/validation
Open-source model weights file: resnext50_32x4d_fpn.pth
Edit these in the launch script /retinanet_infer_pytorch/vision/classification_and_detection/retinanet_acc.sh, for example as in the sketch below.
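A rough sketch of the kind of path edits expected in retinanet_acc.sh; the variable names here are illustrative assumptions, not necessarily the script's actual names:
```
# Illustrative only -- point the script at your local dataset and checkpoint
DATASET_PATH=/public/opendas/DL_DATA/open-images-v6-mlperf/validation   # open-images-v6-mlperf validation set
MODEL_PATH=/path/to/resnext50_32x4d_fpn.pth                             # open-source RetinaNet weights
```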
##### 2. Run inference
```
cd /retinanet_infer_pytorch/vision/classification_and_detection/
chmod +x retinanet_acc.sh
bash retinanet_acc.sh
```
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# End to End MLPerf Submission example\n",
"\n",
"This is following the [General MLPerf Submission Rules](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc).\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Get the MLPerf Inference Benchmark Suite source code\n",
"\n",
"You run this notebook from the root of the 'mlcommons/inference' repo that you cloned with\n",
"```\n",
"git clone --recurse-submodules https://github.com/mlcommons/inference.git --depth 1\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Build loadgen"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# build loadgen\n",
"!pip install pybind11\n",
"!cd loadgen; CFLAGS=\"-std=c++14 -O3\" python setup.py develop"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"!cd vision/classification_and_detection; python setup.py develop"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Set Working Directory"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%cd vision/classification_and_detection"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download data\n",
"\n",
"For this example, the ImageNet and/or COCO validation data should already be on the host system. See the [MLPerf Image Classification task](https://github.com/mlcommons/inference/tree/master/vision/classification_and_detection#datasets) for more details on obtaining this. For the following step each validation dataset is stored in /workspace/data/. You should change this to the location in your setup."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"mkdir data\n",
"ln -s /workspace/data/imagenet2012 data/\n",
"ln -s /workspace/data/coco data/"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download models"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"mkdir models\n",
"\n",
"# resnet50\n",
"wget -q https://zenodo.org/record/2535873/files/resnet50_v1.pb -O models/resnet50_v1.pb \n",
"wget -q https://zenodo.org/record/2592612/files/resnet50_v1.onnx -O models/resnet50_v1.onnx\n",
"\n",
"# ssd-mobilenet\n",
"wget -q http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_coco_2018_01_28.tar.gz -O models/ssd_mobilenet_v1_coco_2018_01_28.tar.gz\n",
"tar zxvf ./models/ssd_mobilenet_v1_coco_2018_01_28.tar.gz -C ./models; mv models/ssd_mobilenet_v1_coco_2018_01_28/frozen_inference_graph.pb ./models/ssd_mobilenet_v1_coco_2018_01_28.pb\n",
"wget -q https://zenodo.org/record/3163026/files/ssd_mobilenet_v1_coco_2018_01_28.onnx -O models/ssd_mobilenet_v1_coco_2018_01_28.onnx \n",
"\n",
"# ssd-resnet34\n",
"wget -q https://zenodo.org/record/3345892/files/tf_ssd_resnet34_22.1.zip -O models/tf_ssd_resnet34_22.1.zip\n",
"unzip ./models/tf_ssd_resnet34_22.1.zip -d ./models; mv models/tf_ssd_resnet34_22.1/resnet34_tf.22.1.pb ./models\n",
"wget -q https://zenodo.org/record/3228411/files/resnet34-ssd1200.onnx -O models/resnet34-ssd1200.onnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run benchmarks using the reference implementation\n",
"\n",
"Lets prepare a submission for ResNet-50 on a cloud datacenter server with a NVIDIA T4 GPU using TensorFlow. \n",
"\n",
"The following script will run those combinations and prepare a submission directory, following the general submission rules documented [here](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import os\n",
"logger = logging.getLogger()\n",
"logger.setLevel(logging.CRITICAL)\n",
"\n",
"os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' \n",
"os.environ['CUDA_VISIBLE_DEVICES'] = \"0\"\n",
"\n",
"# final results go here\n",
"ORG = \"mlperf-org\"\n",
"DIVISION = \"closed\"\n",
"SUBMISSION_ROOT = \"/tmp/mlperf-submission\"\n",
"SUBMISSION_DIR = os.path.join(SUBMISSION_ROOT, DIVISION, ORG)\n",
"os.environ['SUBMISSION_ROOT'] = SUBMISSION_ROOT\n",
"os.environ['SUBMISSION_DIR'] = SUBMISSION_DIR\n",
"os.makedirs(SUBMISSION_DIR, exist_ok=True)\n",
"os.makedirs(os.path.join(SUBMISSION_DIR, \"measurements\"), exist_ok=True)\n",
"os.makedirs(os.path.join(SUBMISSION_DIR, \"code\"), exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"====== resnet50/SingleStream =====\n",
"TestScenario.SingleStream qps=7322.28, mean=0.0078, time=6.828, acc=76.456%, queries=50000, tiles=50.0:0.0077,80.0:0.0078,90.0:0.0078,95.0:0.0079,99.0:0.0131,99.9:0.0135\n",
"accuracy=76.456%, good=38228, total=50000\n",
"TestScenario.SingleStream qps=125.88, mean=0.0079, time=600.138, queries=75546, tiles=50.0:0.0079,80.0:0.0080,90.0:0.0080,95.0:0.0081,99.0:0.0081,99.9:0.0082\n",
"====== resnet50/Server =====\n",
"TestScenario.Server qps=7528.79, mean=0.0832, time=6.641, acc=76.456%, queries=50000, tiles=50.0:0.0809,80.0:0.0922,90.0:0.0932,95.0:0.0941,99.0:0.0963,99.9:0.1022\n",
"accuracy=76.456%, good=38228, total=50000\n",
"TestScenario.Server qps=128.84, mean=116.7138, time=2098.285, queries=270336, tiles=50.0:115.9511,80.0:185.2868,90.0:209.0362,95.0:220.8464,99.0:230.0520,99.9:231.5965\n",
"====== resnet50/Offline =====\n",
"TestScenario.Offline qps=2008.52, mean=0.3050, time=3.112, acc=76.456%, queries=6250, tiles=50.0:0.3017,80.0:0.3416,90.0:0.3465,95.0:0.3525,99.0:0.3646,99.9:1.2464\n",
"accuracy=76.456%, good=38228, total=50000\n",
"TestScenario.Offline qps=285.33, mean=1157.2775, time=2313.086, queries=660000, tiles=50.0:1157.2701,80.0:1850.5871,90.0:2081.7068,95.0:2197.3040,99.0:2289.7431,99.9:2310.5646\n",
"====== resnet50/MultiStream =====\n",
"TestScenario.MultiStream qps=1891.35, mean=0.0879, time=3.357, acc=76.447%, queries=6350, tiles=50.0:0.1002,80.0:0.1265,90.0:0.1311,95.0:0.1321,99.0:0.1356,99.9:0.1422\n",
"accuracy=76.456%, good=38228, total=50000\n",
"TestScenario.MultiStream qps=266.63, mean=0.0904, time=40555.550, queries=10813440, tiles=50.0:0.1050,80.0:0.1289,90.0:0.1369,95.0:0.1376,99.0:0.1386,99.9:0.1399\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:main:Namespace(accuracy=True, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=0.0005, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/accuracy', outputs=['ArgMax:0'], profile='resnet50-tf', qps=145, samples_per_query=40, scenario='SingleStream', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=419.7sec\n",
"INFO:main:starting TestScenario.SingleStream\n",
"INFO:main:Namespace(accuracy=False, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=0.0005, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/performance/run_1', outputs=['ArgMax:0'], profile='resnet50-tf', qps=145, samples_per_query=40, scenario='SingleStream', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=1.1sec\n",
"INFO:main:starting TestScenario.SingleStream\n",
"INFO:main:Namespace(accuracy=True, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=None, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/accuracy', outputs=['ArgMax:0'], profile='resnet50-tf', qps=145, samples_per_query=40, scenario='Server', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=1.1sec\n",
"INFO:main:starting TestScenario.Server\n",
"INFO:main:Namespace(accuracy=False, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=None, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/performance/run_1', outputs=['ArgMax:0'], profile='resnet50-tf', qps=145, samples_per_query=40, scenario='Server', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=1.1sec\n",
"INFO:main:starting TestScenario.Server\n",
"INFO:main:Namespace(accuracy=True, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=None, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/accuracy', outputs=['ArgMax:0'], profile='resnet50-tf', qps=1000, samples_per_query=40, scenario='Offline', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=1.1sec\n",
"INFO:main:starting TestScenario.Offline\n",
"INFO:main:Namespace(accuracy=False, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=None, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/performance/run_1', outputs=['ArgMax:0'], profile='resnet50-tf', qps=1000, samples_per_query=40, scenario='Offline', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=1.1sec\n",
"INFO:main:starting TestScenario.Offline\n",
"INFO:main:Namespace(accuracy=True, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=None, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/accuracy', outputs=['ArgMax:0'], profile='resnet50-tf', qps=145, samples_per_query=40, scenario='MultiStream', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=1.1sec\n",
"INFO:main:starting TestScenario.MultiStream\n",
"INFO:main:Namespace(accuracy=False, backend='tensorflow', cache=0, count=None, data_format=None, dataset='imagenet', dataset_list=None, dataset_path='/workspace/inference/vision/classification_and_detection/data/imagenet2012', find_peak_performance=False, inputs=['input_tensor:0'], max_batchsize=8, max_latency=None, mlperf_conf='../../mlperf.conf', model='/workspace/inference/vision/classification_and_detection/models/resnet50_v1.pb', model_name='resnet50', output='/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/performance/run_1', outputs=['ArgMax:0'], profile='resnet50-tf', qps=145, samples_per_query=40, scenario='MultiStream', threads=2, time=None, user_conf='user.conf')\n",
"INFO:imagenet:loaded 50000 images, cache=0, took=1.1sec\n",
"INFO:main:starting TestScenario.MultiStream\n"
]
}
],
"source": [
"%%bash\n",
"\n",
"# where to find stuff\n",
"export DATA_ROOT=`pwd`/data\n",
"export MODEL_DIR=`pwd`/models\n",
"\n",
"# options for official runs\n",
"gopt=\"--max-batchsize 8 --samples-per-query 40 --threads 2 --qps 145\"\n",
"\n",
"\n",
"function one_run {\n",
" # args: mode count framework device model ...\n",
" scenario=$1; shift\n",
" count=$1; shift\n",
" framework=$1\n",
" device=$2\n",
" model=$3\n",
" system_id=$framework-$device\n",
" echo \"====== $model/$scenario =====\"\n",
"\n",
" case $model in \n",
" resnet50) \n",
" cmd=\"tools/accuracy-imagenet.py --imagenet-val-file $DATA_ROOT/imagenet2012/val_map.txt\"\n",
" offical_name=\"resnet\";;\n",
" ssd-mobilenet) \n",
" cmd=\"tools/accuracy-coco.py --coco-dir $DATA_ROOT/coco\"\n",
" offical_name=\"ssd-small\";;\n",
" ssd-resnet34) \n",
" cmd=\"tools/accuracy-coco.py --coco-dir $DATA_ROOT/coco\"\n",
" offical_name=\"ssd-large\";;\n",
" esac\n",
" output_dir=$SUBMISSION_DIR/results/$system_id/$offical_name\n",
" \n",
" # accuracy run\n",
" ./run_local.sh $@ --scenario $scenario --accuracy --output $output_dir/$scenario/accuracy\n",
" python $cmd --mlperf-accuracy-file $output_dir/$scenario/accuracy/mlperf_log_accuracy.json \\\n",
" > $output_dir/$scenario/accuracy/accuracy.txt\n",
" cat $output_dir/$scenario/accuracy/accuracy.txt\n",
"\n",
" # performance run\n",
" cnt=0\n",
" while [ $cnt -lt $count ]; do\n",
" let cnt=cnt+1\n",
" ./run_local.sh $@ --scenario $scenario --output $output_dir/$scenario/performance/run_$cnt\n",
" done\n",
" \n",
" # setup the measurements directory\n",
" mdir=$SUBMISSION_DIR/measurements/$system_id/$offical_name/$scenario\n",
" mkdir -p $mdir\n",
" cp ../../mlperf.conf $mdir\n",
"\n",
" # reference app uses command line instead of user.conf\n",
" echo \"# empty\" > $mdir/user.conf\n",
" touch $mdir/README.md\n",
" impid=\"reference\"\n",
" cat > $mdir/$system_id\"_\"$impid\"_\"$scenario\".json\" <<EOF\n",
" {\n",
" \"input_data_types\": \"fp32\",\n",
" \"retraining\": \"none\",\n",
" \"starting_weights_filename\": \"https://zenodo.org/record/2535873/files/resnet50_v1.pb\",\n",
" \"weight_data_types\": \"fp32\",\n",
" \"weight_transformations\": \"none\"\n",
" }\n",
"EOF\n",
"}\n",
"\n",
"function one_model {\n",
" # args: framework device model ...\n",
" one_run SingleStream 1 $@ --max-latency 0.0005\n",
" one_run Server 1 $@\n",
" one_run Offline 1 $@ --qps 1000\n",
" one_run MultiStream 1 $@\n",
"}\n",
"\n",
"\n",
"# run image classifier benchmarks \n",
"export DATA_DIR=$DATA_ROOT/imagenet2012\n",
"one_model tf gpu resnet50 $gopt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There might be large trace files in the submission directory - we can delete them."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"!find {SUBMISSION_DIR}/ -name mlperf_log_trace.json -delete"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Complete submission directory\n",
"\n",
"Add the required meta data to the submission."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"#\n",
"# setup systems directory\n",
"#\n",
"if [ ! -d ${SUBMISSION_DIR}/systems ]; then\n",
" mkdir ${SUBMISSION_DIR}/systems\n",
"fi\n",
"\n",
"cat > ${SUBMISSION_DIR}/systems/tf-gpu.json <<EOF\n",
"{\n",
" \"division\": \"closed\",\n",
" \"status\": \"available\",\n",
" \"submitter\": \"mlperf-org\",\n",
" \"system_name\": \"tf-gpu\",\n",
" \"system_type\": \"datacenter\",\n",
" \n",
" \"number_of_nodes\": 1,\n",
" \"host_memory_capacity\": \"32GB\",\n",
" \"host_processor_core_count\": 1,\n",
" \"host_processor_frequency\": \"3.50GHz\",\n",
" \"host_processor_model_name\": \"Intel(R) Xeon(R) CPU E5-1620 v3 @ 3.50GHz\",\n",
" \"host_processors_per_node\": 1,\n",
" \"host_storage_capacity\": \"512GB\",\n",
" \"host_storage_type\": \"SSD\",\n",
" \n",
" \"accelerator_frequency\": \"-\",\n",
" \"accelerator_host_interconnect\": \"-\",\n",
" \"accelerator_interconnect\": \"-\",\n",
" \"accelerator_interconnect_topology\": \"-\",\n",
" \"accelerator_memory_capacity\": \"16GB\",\n",
" \"accelerator_memory_configuration\": \"none\",\n",
" \"accelerator_model_name\": \"T4\",\n",
" \"accelerator_on-chip_memories\": \"-\",\n",
" \"accelerators_per_node\": 1,\n",
"\n",
" \"framework\": \"v1.14.0-rc1-22-gaf24dc9\",\n",
" \"operating_system\": \"ubuntu-18.04\",\n",
" \"other_software_stack\": \"cuda-11.2\",\n",
" \"sw_notes\": \"\"\n",
"}\n",
"EOF"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"\n",
"#\n",
"# setup code directory\n",
"#\n",
"dir=${SUBMISSION_DIR}/code/resnet/reference\n",
"mkdir -p $dir\n",
"echo \"git clone https://github.com/mlcommons/inference.git\" > $dir/VERSION.txt\n",
"git rev-parse HEAD >> $dir/VERSION.txt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What's in the submission directory now ?\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/tmp/mlperf-submission/closed/mlperf-org/systems/tf-gpu.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Offline/user.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Offline/mlperf.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Offline/README.md\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Offline/tf-gpu_reference_Offline.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/SingleStream/user.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/SingleStream/mlperf.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/SingleStream/tf-gpu_reference_SingleStream.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/SingleStream/README.md\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/MultiStream/user.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/MultiStream/mlperf.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/MultiStream/tf-gpu_reference_MultiStream.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/MultiStream/README.md\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Server/user.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Server/mlperf.conf\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Server/README.md\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/measurements/tf-gpu/resnet/Server/tf-gpu_reference_Server.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/performance/run_1/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/performance/run_1/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/performance/run_1/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/performance/run_1/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/accuracy/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/accuracy/accuracy.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/accuracy/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/accuracy/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Offline/accuracy/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/performance/run_1/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/performance/run_1/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/performance/run_1/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/performance/run_1/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/accuracy/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/accuracy/accuracy.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/accuracy/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/accuracy/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/SingleStream/accuracy/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/performance/run_1/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/performance/run_1/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/performance/run_1/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/performance/run_1/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/accuracy/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/accuracy/accuracy.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/accuracy/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/accuracy/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/MultiStream/accuracy/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/performance/run_1/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/performance/run_1/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/performance/run_1/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/performance/run_1/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/accuracy/results.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/accuracy/accuracy.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/accuracy/mlperf_log_summary.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/accuracy/mlperf_log_detail.txt\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/results/tf-gpu/resnet/Server/accuracy/mlperf_log_accuracy.json\r\n",
"/tmp/mlperf-submission/closed/mlperf-org/code/resnet/reference/VERSION.txt\r\n"
]
}
],
"source": [
"!find {SUBMISSION_ROOT}/ -type f"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we look at some files:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-- SingleStream Accuracy\n",
"accuracy=76.456%, good=38228, total=50000\n",
"\n",
"-- SingleStream Summary\n",
"================================================\n",
"MLPerf Results Summary\n",
"================================================\n",
"SUT name : PySUT\n",
"Scenario : SingleStream\n",
"Mode : PerformanceOnly\n",
"90th percentile latency (ns) : 8030958\n",
"Result is : VALID\n",
" Min duration satisfied : Yes\n",
" Min queries satisfied : Yes\n",
"\n",
"-- Server Summary\n",
"================================================\n",
"MLPerf Results Summary\n",
"================================================\n",
"SUT name : PySUT\n",
"Scenario : Server\n",
"Mode : PerformanceOnly\n",
"Scheduled samples per second : 144.87\n",
"Result is : INVALID\n",
" Performance constraints satisfied : NO\n",
" Min duration satisfied : Yes\n"
]
}
],
"source": [
"!echo \"-- SingleStream Accuracy\"; head {SUBMISSION_DIR}/results/tf-gpu/resnet/SingleStream/accuracy/accuracy.txt\n",
"!echo \"\\n-- SingleStream Summary\"; head {SUBMISSION_DIR}/results/tf-gpu/resnet/SingleStream/performance/run_1/mlperf_log_summary.txt\n",
"!echo \"\\n-- Server Summary\"; head {SUBMISSION_DIR}/results/tf-gpu/resnet/Server/performance/run_1/mlperf_log_summary.txt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run the submission checker\n",
"\n",
"Finally, run the submission checker tool that does some sanity checking on your submission.\n",
"We run it at the end and attach the output to the submission."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!python ../../tools/submission/submission-checker.py --input {SUBMISSION_ROOT} > {SUBMISSION_DIR}/submission-checker.log 2>&1 \n",
"!cat {SUBMISSION_DIR}/submission-checker.log"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
## Submission Rules
The MLPerf inference submission rules are spread between the [MLCommons policies](https://github.com/mlcommons/policies/blob/master/submission_rules.adoc) and the [MLCommons Inference policies](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc) documents. Further, the rules related to power submissions are given [here](https://github.com/mlcommons/inference_policies/blob/master/power_measurement.adoc). The points below summarize the official rules and serve as a checklist for the submitter; please see the original rules for any clarification.
## Hardware requirements
1. MLCommons inference results can be submitted on any hardware; past results range from a Raspberry Pi to high-end inference servers.
2. A closed-division submission in the datacenter category needs **ECC RAM** and must also have the **networking** capabilities detailed [here](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#networking-from-the-v30-round).
3. Power submissions need an [approved power analyzer](https://github.com/mlcommons/inference_policies/blob/master/power_measurement.adoc#74-which-power-analyzers-aka-meters-are-supported).
## Things to Know
1. A closed submission needs performance and accuracy runs for all the required scenarios (per the edge/datacenter category), with accuracy within 99% (or 99.9%, where required) of the reference model, as given in the respective task READMEs. Further, the model weights must not be altered except for quantization. If any of these constraints are not met, the submission cannot go into the closed division but can still be submitted to the open division.
2. Reference models are mostly fp32, and the reference implementations are provided only as a guide; they are not optimized for performance and are not meant to be used directly by submitters.
3. The calibration document is due **one week** before the submission deadline.
4. A power submission needs a power analyzer approved by SPEC Power and a signed EULA to get access to the SPEC PTDaemon.
5. To submit under the `available` category, your submission system must be available (in whole or in parts, and either publicly or to customers) and the software used must be either open source or an **official or beta release** as of the submission deadline. Submissions using a nightly release, for example, cannot be submitted under the available category.
### Is there an automatic way to run the MLPerf inference benchmarks?
MLPerf inference submissions are expected to be run on various hardware and supported software stacks. Therefore, MLCommons provides only reference implementations to guide submitters in creating optimal implementations for their specific software and hardware configurations. Additionally, all implementations used for MLPerf inference submissions are available in the MLCommons [Inference results](https://github.com/orgs/mlcommons/repositories?q=inference_results_v+sort%3Aname) repositories (under `closed/<submitter>/code` directory), offering further guidance for submitters developing their own implementations.
### Expected time to do benchmark runs
1. A closed submission in the datacenter category needs Offline and Server scenario runs, with a minimum of ten minutes for each.
2. A closed submission in the edge category needs SingleStream, MultiStream (only for ResNet-50 and RetinaNet), and Offline scenario runs, with a minimum of ten minutes for each.
3. Further, two (three for ResNet-50) compliance runs are needed for the closed division, each taking at least ten minutes per scenario.
4. The SingleStream, MultiStream, and Server scenarios use early stopping and so can always finish in around ten minutes.
5. The Offline scenario needs a minimum of 24,576 input queries to be processed; this can take hours for low-throughput models like 3dunet, LLMs, etc.
6. The open division has no accuracy constraints and no required compliance runs, and can be submitted for any single scenario. There is no constraint on the model used, except that the model accuracy must be validated on the accuracy dataset used in the corresponding MLPerf inference task [or must be preapproved](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#412-relaxed-constraints-for-the-open-division).
7. A power submission needs an extra ranging run to determine the peak current usage, and this often doubles the overall experiment run time. If this overhead is too much, the ranging run can be reduced to a 5-minute run using mechanisms like [this](https://github.com/mlcommons/cm4mlops/blob/main/script/benchmark-program-mlperf/customize.py#L18).
## Validity of the submission
1. The [MLCommons Inference submission checker](https://github.com/mlcommons/inference/blob/master/tools/submission/submission_checker.py) is provided to ensure that all submissions pass the required checks (see the example invocation below).
2. In the unlikely event that the submission checker itself fails on your submission, please raise a GitHub issue [here](https://github.com/mlcommons/inference/issues).
3. Any submission passing the submission checker is eligible for the review discussions, but submitters are still required to answer any queries and fix any issues reported by other submitters.
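For reference, a minimal invocation of the checker from the root of the inference repository looks something like the following (the log file name is illustrative):
```
python3 tools/submission/submission_checker.py --input <path to your submission root> 2>&1 | tee submission-checker.log
```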
### Reviewing other submissions
1. Ensure that the `system_desc_id.json` file has meaningful entries; `submission_checker` only checks for the existence of the fields.
2. For power submissions, `power settings` and `analyzer table` files are to be submitted, and even though the submission checker checks for the existence of these files, the content of [these files](https://github.com/mlcommons/inference_policies/blob/master/power_measurement.adoc#64-power-management-settings) must be checked manually for validity.
3. README files in the submission directory must be checked to make sure that the instructions are reproducible.
4. For closed datacenter submissions, the [ECC RAM and networking requirements](https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#constraints-for-the-closed-division) must be verified.
5. The submission checker might report warnings, and some of these warnings can warrant an answer from the submitter.
## Changes from MLCommons Inference 4.0
1. One new benchmark in the datacenter category: Mixtral-8x7B. No changes in the edge category.
2. For power submissions, there is no code change.
## Reference implementation for the automotive 3D detection benchmark
## Dataset and model checkpoints
Contact MLCommons support for access to the Waymo Open Dataset along with the model checkpoints for the reference implementation. You will need to accept a license agreement and will be given directions to download the data. Place the kitti_format folder under a directory named waymo (see the layout sketch below). There are four checkpoints in total: two for PyTorch and two for ONNX.
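Given the docker mount and the `--dataset-path` used below, the expected directory layout is roughly the following (the parent directory name is up to you):
```
<path to waymo dataset>/
└── waymo/
    └── kitti_format/    # downloaded Waymo data in KITTI format goes here
```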
## Running with docker
Build the container and mount the inference repo and Waymo dataset directory.
```
docker build -t auto_inference -f dockerfile.gpu .
docker run --gpus=all -it -v <directory to inference repo>/inference/:/inference -v <path to waymo dataset>/waymo:/waymo --rm auto_inference
```
### Run with GPU
```
cd /inference/automotive/3d-object-detection
python main.py --dataset waymo --dataset-path /waymo/kitti_format/ --lidar-path <checkpoint_path>/pp_ep36.pth --segmentor-path <checkpoint_path>/best_deeplabv3plus_resnet50_waymo_os16.pth --mlperf_conf /inference/mlperf.conf
```
### Run with CPU and ONNX
```
python main.py --dataset waymo --dataset-path /waymo/kitti_format/ --lidar-path <checkpoint_path>/pp.onnx --segmentor-path <checkpoint_path>/deeplabv3+.onnx --mlperf_conf /inference/mlperf.conf
```
### Run the accuracy checker
```
python accuracy_waymo.py --mlperf-accuracy-file <path to accuracy file>/mlperf_log_accuracy.json --waymo-dir /waymo/kitti_format/
```
"""
Tool to calculate accuracy for loadgen accuracy output found in mlperf_log_accuracy.json
We assume that loadgen's query index corresponds to the sample order
in the Waymo validation set.
"""
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import json
import os
import numpy as np
from waymo import Waymo
from tools.evaluate import do_eval
# pylint: disable=missing-docstring
CLASSES = Waymo.CLASSES
LABEL2CLASSES = {v: k for k, v in CLASSES.items()}
def get_args():
"""Parse commandline."""
parser = argparse.ArgumentParser()
parser.add_argument(
"--mlperf-accuracy-file",
required=True,
help="path to mlperf_log_accuracy.json")
parser.add_argument(
"--waymo-dir",
required=True,
help="waymo dataset directory")
parser.add_argument(
"--verbose",
action="store_true",
help="verbose messages")
parser.add_argument(
"--output-file",
default="openimages-results.json",
help="path to output file")
parser.add_argument(
"--use-inv-map",
action="store_true",
help="use inverse label map")
args = parser.parse_args()
return args
def main():
args = get_args()
with open(args.mlperf_accuracy_file, "r") as f:
results = json.load(f)
detections = {}
image_ids = set()
seen = set()
no_results = 0
val_dataset = Waymo(
data_root=args.waymo_dir,
split='val',
painted=True,
cam_sync=False)
for j in results:
idx = j['qsl_idx']
# de-dupe in case loadgen sends the same image multiple times
if idx in seen:
continue
seen.add(idx)
# reconstruct from mlperf accuracy log
# what is written by the benchmark is an array of float32's, 14 per detection:
# dimensions (3), location (3), rotation_y, bbox (4), label, score, image_idx
data = np.frombuffer(bytes.fromhex(j['data']), np.float32)
for i in range(0, len(data), 14):
dimension = [float(x) for x in data[i:i + 3]]
location = [float(x) for x in data[i + 3:i + 6]]
rotation_y = float(data[i + 6])
bbox = [float(x) for x in data[i + 7:i + 11]]
label = int(data[i + 11])
score = float(data[i + 12])
image_idx = int(data[i + 13])
if image_idx not in detections:
detections[image_idx] = {
'name': [],
'dimensions': [],
'location': [],
'rotation_y': [],
'bbox': [],
'score': []
}
if dimension[0] > 0:
detections[image_idx]['name'].append(LABEL2CLASSES[label])
detections[image_idx]['dimensions'].append(dimension)
detections[image_idx]['location'].append(location)
detections[image_idx]['rotation_y'].append(rotation_y)
detections[image_idx]['bbox'].append(bbox)
detections[image_idx]['score'].append(score)
image_ids.add(image_idx)
with open(args.output_file, "w") as fp:
json.dump(detections, fp, sort_keys=True, indent=4)
format_results = {}
for key in detections.keys():
format_results[key] = {k: np.array(v)
for k, v in detections[key].items()}
map_stats = do_eval(
format_results,
val_dataset.data_infos,
CLASSES,
cam_sync=False)
map_stats['Total'] = np.mean(list(map_stats.values()))
print(map_stats)
if args.verbose:
print("found {} results".format(len(results)))
print("found {} images".format(len(image_ids)))
print("found {} images with no results".format(no_results))
print("ignored {} dupes".format(len(results) - len(seen)))
if __name__ == "__main__":
main()
"""
abstract backend class
"""
class Backend:
def __init__(self):
self.inputs = []
self.outputs = []
def version(self):
raise NotImplementedError("Backend:version")
def name(self):
raise NotImplementedError("Backend:name")
def load(self, model_path, inputs=None, outputs=None):
raise NotImplementedError("Backend:load")
def predict(self, feed):
raise NotImplementedError("Backend:predict")
import torch
import backend
class BackendDebug(backend.Backend):
def __init__(self, image_size=[3, 1024, 1024], **kwargs):
super(BackendDebug, self).__init__()
self.image_size = image_size
def version(self):
return torch.__version__
def name(self):
return "debug-SUT"
def image_format(self):
return "NCHW"
def load(self):
return self
def predict(self, prompts):
images = []
return images
from typing import Optional, List, Union
import os
import torch
import logging
import backend
from collections import namedtuple
from model.painter import Painter
from model.pointpillars import PointPillars
import numpy as np
from tools.process import keep_bbox_from_image_range
from waymo import Waymo
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("backend-pytorch")
def change_calib_device(calib, cuda):
result = {}
if cuda:
device = 'cuda'
else:
device = 'cpu'
result['R0_rect'] = calib['R0_rect'].to(device=device, dtype=torch.float)
for i in range(5):
result['P' + str(i)] = calib['P' + str(i)
].to(device=device, dtype=torch.float)
result['Tr_velo_to_cam_' +
str(i)] = calib['Tr_velo_to_cam_' +
str(i)].to(device=device, dtype=torch.float)
return result
class BackendDeploy(backend.Backend):
def __init__(
self,
segmentor_path,
lidar_detector_path,
data_path
):
super(BackendDeploy, self).__init__()
self.segmentor_path = segmentor_path
self.lidar_detector_path = lidar_detector_path
# self.segmentation_classes = 18
self.detection_classes = 3
self.data_root = data_path
CLASSES = Waymo.CLASSES
self.LABEL2CLASSES = {v: k for k, v in CLASSES.items()}
def version(self):
return torch.__version__
def name(self):
return "python-SUT"
def load(self):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
PaintArgs = namedtuple(
'PaintArgs', [
'training_path', 'model_path', 'cam_sync'])
painting_args = PaintArgs(
os.path.join(
self.data_root,
'training'),
self.segmentor_path,
False)
self.painter = Painter(painting_args)
self.segmentor = self.painter.model
model = PointPillars(
nclasses=self.detection_classes,
painted=True).to(
device=device)
model.eval()
checkpoint = torch.load(self.lidar_detector_path)
model.load_state_dict(checkpoint["model_state_dict"])
self.lidar_detector = model
return self
def predict(self, inputs):
dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids = [
], [], [], [], [], [], []
with torch.inference_mode():
device = torch.device(
"cuda:0" if torch.cuda.is_available() else "cpu")
model_input = inputs[0]
batched_pts = model_input['pts']
scores_from_cam = []
for i in range(len(model_input['images'])):
segmentation_score = self.segmentor(
model_input['images'][i].to(device))[0]
scores_from_cam.append(
self.painter.get_score(segmentation_score).cpu())
points = self.painter.augment_lidar_class_scores_both(
scores_from_cam, batched_pts, model_input['calib_info'])
batch_results = self.lidar_detector(
batched_pts=[points.to(device=device)], mode='val')
for j, result in enumerate(batch_results):
format_result = {
'class': [],
'truncated': [],
'occluded': [],
'alpha': [],
'bbox': [],
'dimensions': [],
'location': [],
'rotation_y': [],
'score': [],
'idx': -1
}
calib_info = model_input['calib_info']
image_info = model_input['image_info']
idx = model_input['image_info']['image_idx']
format_result['idx'] = idx
calib_info = change_calib_device(calib_info, False)
result_filter = keep_bbox_from_image_range(
result, calib_info, 5, image_info, False)
lidar_bboxes = result_filter['lidar_bboxes']
labels, scores = result_filter['labels'], result_filter['scores']
bboxes2d, camera_bboxes = result_filter['bboxes2d'], result_filter['camera_bboxes']
for lidar_bbox, label, score, bbox2d, camera_bbox in \
zip(lidar_bboxes, labels, scores, bboxes2d, camera_bboxes):
format_result['class'].append(label.item())
format_result['truncated'].append(0.0)
format_result['occluded'].append(0)
alpha = camera_bbox[6] - \
np.arctan2(camera_bbox[0], camera_bbox[2])
format_result['alpha'].append(alpha.item())
format_result['bbox'].append(bbox2d.tolist())
format_result['dimensions'].append(camera_bbox[3:6])
format_result['location'].append(camera_bbox[:3])
format_result['rotation_y'].append(camera_bbox[6].item())
format_result['score'].append(score.item())
if len(format_result['dimensions']) > 0:
format_result['dimensions'] = torch.stack(
format_result['dimensions'])
format_result['location'] = torch.stack(
format_result['location'])
dimensions.append(format_result['dimensions'])
locations.append(format_result['location'])
rotation_y.append(format_result['rotation_y'])
class_labels.append(format_result['class'])
class_scores.append(format_result['score'])
box2d.append(format_result['bbox'])
ids.append(format_result['idx'])
return dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids
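
# Usage sketch (illustrative, not part of the benchmark harness): a minimal,
# hedged example of how main.py drives this backend. The checkpoint and dataset
# paths below are hypothetical placeholders; the sample dict normally comes from
# waymo.Waymo via dataset.preprocess.
if __name__ == "__main__":
    sut = BackendDeploy(
        segmentor_path="/path/to/deeplabv3plus.pth",      # hypothetical path
        lidar_detector_path="/path/to/pointpillars.pth",  # hypothetical path
        data_path="/path/to/waymo_data_root",             # hypothetical path
    )
    sut.load()
    log.info("backend %s loaded, framework version %s", sut.name(), sut.version())
    # predict() expects a one-element list holding a dict with 'pts', 'images',
    # 'calib_info' and 'image_info' (see waymo.Waymo / dataset.preprocess):
    # dims, locs, rot_y, box2d, labels, scores, ids = sut.predict([sample])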
from typing import Optional, List, Union
import os
import torch
import logging
import backend
from collections import namedtuple
from model.painter import Painter
from model.pointpillars_core import PointPillarsPre, PointPillarsPos
import numpy as np
from tools.process import keep_bbox_from_image_range
from waymo import Waymo
import onnxruntime as ort
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("backend-onnx")
def change_calib_device(calib, cuda):
    result = {}
    device = 'cuda' if cuda else 'cpu'
    result['R0_rect'] = calib['R0_rect'].to(device=device, dtype=torch.float)
    for i in range(5):
        result['P' + str(i)] = calib['P' + str(i)].to(device=device, dtype=torch.float)
        result['Tr_velo_to_cam_' + str(i)] = calib[
            'Tr_velo_to_cam_' + str(i)].to(device=device, dtype=torch.float)
    return result
def to_numpy(tensor):
return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
class BackendOnnx(backend.Backend):
def __init__(
self,
segmentor_path,
lidar_detector_path,
data_path
):
super(BackendOnnx, self).__init__()
self.segmentor_path = segmentor_path
self.lidar_detector_path = lidar_detector_path
# self.segmentation_classes = 18
self.detection_classes = 3
self.data_root = data_path
CLASSES = Waymo.CLASSES
self.LABEL2CLASSES = {v: k for k, v in CLASSES.items()}
def version(self):
return torch.__version__
def name(self):
return "python-SUT"
def load(self):
device = torch.device("cpu")
PaintArgs = namedtuple(
'PaintArgs', [
'training_path', 'model_path', 'cam_sync'])
painting_args = PaintArgs(
os.path.join(
self.data_root,
'training'),
self.segmentor_path,
False)
self.painter = Painter(painting_args, onnx=True)
self.segmentor = self.painter.model
model_pre = PointPillarsPre()
model_post = PointPillarsPos(self.detection_classes)
model_pre.eval()
model_post.eval()
ort_sess = ort.InferenceSession(self.lidar_detector_path)
self.lidar_detector = ort_sess
self.model_pre = model_pre
self.model_post = model_post
return self
def predict(self, inputs):
        dimensions, locations, rotation_y, box2d = [], [], [], []
        class_labels, class_scores, ids = [], [], []
with torch.inference_mode():
model_input = inputs[0]
batched_pts = model_input['pts']
scores_from_cam = []
for i in range(len(model_input['images'])):
input_image_name = self.segmentor.get_inputs()[0].name
input_data = {
input_image_name: to_numpy(
model_input['images'][i])}
segmentation_score = self.segmentor.run(None, input_data)
segmentation_score = [
torch.from_numpy(item) for item in segmentation_score]
scores_from_cam.append(
self.painter.get_score(
segmentation_score[0].squeeze(0)).cpu())
points = self.painter.augment_lidar_class_scores_both(
scores_from_cam, batched_pts, model_input['calib_info'])
pillars, coors_batch, npoints_per_pillar = self.model_pre(batched_pts=[
points])
input_pillars_name = self.lidar_detector.get_inputs()[0].name
input_coors_batch_name = self.lidar_detector.get_inputs()[1].name
input_npoints_per_pillar_name = self.lidar_detector.get_inputs()[
2].name
input_data = {input_pillars_name: to_numpy(pillars),
input_coors_batch_name: to_numpy(coors_batch),
input_npoints_per_pillar_name: to_numpy(npoints_per_pillar)}
result = self.lidar_detector.run(None, input_data)
result = [torch.from_numpy(item) for item in result]
batch_results = self.model_post(result)
for j, result in enumerate(batch_results):
format_result = {
'class': [],
'truncated': [],
'occluded': [],
'alpha': [],
'bbox': [],
'dimensions': [],
'location': [],
'rotation_y': [],
'score': [],
'idx': -1
}
calib_info = model_input['calib_info']
image_info = model_input['image_info']
idx = model_input['image_info']['image_idx']
format_result['idx'] = idx
calib_info = change_calib_device(calib_info, False)
result_filter = keep_bbox_from_image_range(
result, calib_info, 5, image_info, False)
lidar_bboxes = result_filter['lidar_bboxes']
labels, scores = result_filter['labels'], result_filter['scores']
bboxes2d, camera_bboxes = result_filter['bboxes2d'], result_filter['camera_bboxes']
for lidar_bbox, label, score, bbox2d, camera_bbox in \
zip(lidar_bboxes, labels, scores, bboxes2d, camera_bboxes):
format_result['class'].append(label.item())
format_result['truncated'].append(0.0)
format_result['occluded'].append(0)
alpha = camera_bbox[6] - \
np.arctan2(camera_bbox[0], camera_bbox[2])
format_result['alpha'].append(alpha.item())
format_result['bbox'].append(bbox2d.tolist())
format_result['dimensions'].append(camera_bbox[3:6])
format_result['location'].append(camera_bbox[:3])
format_result['rotation_y'].append(camera_bbox[6].item())
format_result['score'].append(score.item())
if len(format_result['dimensions']) > 0:
format_result['dimensions'] = torch.stack(
format_result['dimensions'])
format_result['location'] = torch.stack(
format_result['location'])
dimensions.append(format_result['dimensions'])
locations.append(format_result['location'])
rotation_y.append(format_result['rotation_y'])
class_labels.append(format_result['class'])
class_scores.append(format_result['score'])
box2d.append(format_result['bbox'])
ids.append(format_result['idx'])
return dimensions, locations, rotation_y, box2d, class_labels, class_scores, ids
"""
dataset related classes and methods
"""
# pylint: disable=unused-argument,missing-docstring
import logging
import sys
import time
import numpy as np
import torch
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("dataset")
class Dataset:
def __init__(self):
self.items_inmemory = {}
def preprocess(self, use_cache=True):
raise NotImplementedError("Dataset:preprocess")
def get_item_count(self):
raise NotImplementedError("Dataset:get_item_count")
def get_list(self):
raise NotImplementedError("Dataset:get_list")
def load_query_samples(self, sample_list):
raise NotImplementedError("Dataset:load_query_samples")
def unload_query_samples(self, sample_list):
raise NotImplementedError("Dataset:unload_query_samples")
def get_samples(self, id_list):
raise NotImplementedError("Dataset:get_samples")
def get_item(self, id):
raise NotImplementedError("Dataset:get_item")
def preprocess(list_data):
batched_pts_list, batched_gt_bboxes_list = [], []
batched_labels_list, batched_names_list = [], []
batched_difficulty_list = []
batched_img_list, batched_calib_list = [], []
batched_images = []
for data_dict in list_data:
pts, gt_bboxes_3d = data_dict['pts'], data_dict['gt_bboxes_3d']
gt_labels, gt_names = data_dict['gt_labels'], data_dict['gt_names']
difficulty = data_dict['difficulty']
image_info, calib_info = data_dict['image_info'], data_dict['calib_info']
batched_pts_list.append(torch.from_numpy(pts))
batched_gt_bboxes_list.append(torch.from_numpy(gt_bboxes_3d))
batched_labels_list.append(torch.from_numpy(gt_labels))
batched_names_list.append(gt_names) # List(str)
batched_difficulty_list.append(torch.from_numpy(difficulty))
batched_img_list.append(image_info)
batched_calib_list.append(calib_info)
batched_images.append(data_dict['images'])
rt_data_dict = dict(
batched_pts=batched_pts_list,
batched_gt_bboxes=batched_gt_bboxes_list,
batched_labels=batched_labels_list,
batched_names=batched_names_list,
batched_difficulty=batched_difficulty_list,
batched_img_info=batched_img_list,
batched_calib_info=batched_calib_list,
batched_images=batched_images
)
return rt_data_dict
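
# Collation sketch (illustrative only): preprocess() simply converts each numpy
# field to a tensor and gathers per-sample lists; real samples come from
# waymo.Waymo and also carry images and calibration for the five cameras. The
# field values and shapes below are hypothetical placeholders.
def _preprocess_example():
    sample = {
        'pts': np.random.rand(100, 11).astype(np.float32),  # painted points: 5 lidar values + 6 class scores
        'gt_bboxes_3d': np.zeros((2, 7), dtype=np.float32),
        'gt_labels': np.array([0, 2]),
        'gt_names': ['Pedestrian', 'Vehicle'],
        'difficulty': np.array([0, 0]),
        'image_info': {'image_idx': '000000'},
        'calib_info': {},
        'images': [],
    }
    batch = preprocess([sample])
    # each entry is a per-sample list, e.g. batch['batched_pts'][0] is a tensor
    assert isinstance(batch['batched_pts'][0], torch.Tensor)
    return batch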
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.08-py3
FROM ${FROM_IMAGE_NAME}
ENV DEBIAN_FRONTEND=noninteractive
# apt dependencies
RUN apt-get update
RUN apt-get install -y ffmpeg libsm6 libxext6
# install LDM
COPY . /diffusion
RUN cd /diffusion && \
pip install --no-cache-dir -r requirements.txt
# install loadgen
RUN cd /tmp && \
git clone --recursive https://github.com/mlcommons/inference && \
cd inference/loadgen && \
pip install pybind11 && \
CFLAGS="-std=c++14" python setup.py install && \
rm -rf mlperf
ARG FROM_IMAGE_NAME=pytorch/pytorch:2.2.2-cuda11.8-cudnn8-devel
FROM ${FROM_IMAGE_NAME}
ENV DEBIAN_FRONTEND=noninteractive
# apt dependencies
RUN apt-get update
RUN apt-get install -y ffmpeg libsm6 libxext6 git
# install LDM
COPY . /diffusion
RUN cd /diffusion && \
pip install --no-cache-dir -r requirements.txt
# install loadgen
RUN cd /tmp && \
git clone --recursive https://github.com/mlcommons/inference && \
cd inference/loadgen && \
pip install pybind11 && \
CFLAGS="-std=c++14" python setup.py install && \
rm -rf mlperf
RUN pip install tqdm==4.65.0
RUN pip install numba==0.60.0
RUN pip install opencv-python==4.11.0.86
RUN pip install open3d==0.19.0
RUN pip install scikit-image==0.25.0
RUN pip install ninja==1.11.1
RUN pip install shapely==2.0.6
RUN pip install tensorboard==2.18.0
RUN pip install onnxruntime==1.20.1
"""
mlperf inference benchmarking tool
"""
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import array
import collections
import json
import logging
import os
import sys
import threading
import time
from queue import Queue
import mlperf_loadgen as lg
import numpy as np
import torch
import dataset
import waymo
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("main")
NANO_SEC = 1e9
MILLI_SEC = 1000
SUPPORTED_DATASETS = {
"waymo": (
waymo.Waymo,
dataset.preprocess,
waymo.PostProcessWaymo(),
{} # "image_size": [3, 1024, 1024]},
)
}
SUPPORTED_PROFILES = {
"defaults": {
"dataset": "waymo",
"backend": "pytorch",
"model-name": "pointpainting",
},
}
SCENARIO_MAP = {
"SingleStream": lg.TestScenario.SingleStream,
"MultiStream": lg.TestScenario.MultiStream,
"Server": lg.TestScenario.Server,
"Offline": lg.TestScenario.Offline,
}
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset",
choices=SUPPORTED_DATASETS.keys(),
help="dataset")
parser.add_argument(
"--dataset-path",
required=True,
help="path to the dataset")
parser.add_argument(
"--profile", choices=SUPPORTED_PROFILES.keys(), help="standard profiles"
)
parser.add_argument(
"--scenario",
default="SingleStream",
help="mlperf benchmark scenario, one of " +
str(list(SCENARIO_MAP.keys())),
)
parser.add_argument(
"--max-batchsize",
type=int,
default=1,
help="max batch size in a single inference",
)
parser.add_argument("--threads", default=1, type=int, help="threads")
parser.add_argument(
"--accuracy",
action="store_true",
help="enable accuracy pass")
parser.add_argument(
"--find-peak-performance",
action="store_true",
help="enable finding peak performance pass",
)
parser.add_argument("--backend", help="Name of the backend")
parser.add_argument("--model-name", help="Name of the model")
parser.add_argument("--output", default="output", help="test results")
parser.add_argument("--qps", type=int, help="target qps")
parser.add_argument("--lidar-path", help="Path to model weights")
parser.add_argument("--segmentor-path", help="Path to model weights")
parser.add_argument(
"--dtype",
default="fp32",
choices=["fp32", "fp16", "bf16"],
help="dtype of the model",
)
parser.add_argument(
"--device",
default="cuda",
choices=["cuda", "cpu"],
help="device to run the benchmark",
)
# file to use mlperf rules compliant parameters
parser.add_argument(
"--mlperf_conf", default="mlperf.conf", help="mlperf rules config"
)
# file for user LoadGen settings such as target QPS
parser.add_argument(
"--user_conf",
default="user.conf",
help="user config for user LoadGen settings such as target QPS",
)
# file for LoadGen audit settings
parser.add_argument(
"--audit_conf", default="audit.config", help="config for LoadGen audit settings"
)
# below will override mlperf rules compliant settings - don't use for
# official submission
parser.add_argument("--time", type=int, help="time to scan in seconds")
parser.add_argument("--count", type=int, help="dataset items to use")
parser.add_argument("--debug", action="store_true", help="debug")
parser.add_argument(
"--performance-sample-count", type=int, help="performance sample count", default=5000
)
parser.add_argument(
"--max-latency", type=float, help="mlperf max latency in pct tile"
)
parser.add_argument(
"--samples-per-query",
default=8,
type=int,
help="mlperf multi-stream samples per query",
)
args = parser.parse_args()
    # don't use defaults in argparse; instead we start from a dict, override it with a
    # profile, and use that as the default unless overridden on the command line
defaults = SUPPORTED_PROFILES["defaults"]
if args.profile:
profile = SUPPORTED_PROFILES[args.profile]
defaults.update(profile)
for k, v in defaults.items():
kc = k.replace("-", "_")
if getattr(args, kc) is None:
setattr(args, kc, v)
if args.scenario not in SCENARIO_MAP:
parser.error("valid scanarios:" + str(list(SCENARIO_MAP.keys())))
return args
def get_backend(backend, **kwargs):
if backend == "pytorch":
from backend_deploy import BackendDeploy
backend = BackendDeploy(**kwargs)
elif backend == 'onnx':
from backend_onnx import BackendOnnx
backend = BackendOnnx(**kwargs)
elif backend == "debug":
from backend_debug import BackendDebug
backend = BackendDebug()
else:
raise ValueError("unknown backend: " + backend)
return backend
class Item:
"""An item that we queue for processing by the thread pool."""
def __init__(self, query_id, content_id, inputs, img=None):
self.query_id = query_id
self.content_id = content_id
self.img = img
self.inputs = inputs
self.start = time.time()
class RunnerBase:
def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
self.take_accuracy = False
self.ds = ds
self.model = model
self.post_process = post_proc
self.threads = threads
self.max_batchsize = max_batchsize
self.result_timing = []
def handle_tasks(self, tasks_queue):
pass
def start_run(self, result_dict, take_accuracy):
self.result_dict = result_dict
self.result_timing = []
self.take_accuracy = take_accuracy
self.post_process.start()
def run_one_item(self, qitem: Item):
# run the prediction
processed_results = []
try:
results = self.model.predict(qitem.inputs)
processed_results = self.post_process(
results, qitem.content_id, qitem.inputs, self.result_dict)
if self.take_accuracy:
self.post_process.add_results(processed_results)
self.result_timing.append(time.time() - qitem.start)
except Exception as ex: # pylint: disable=broad-except
src = [self.ds.get_item_loc(i) for i in qitem.content_id]
log.error("thread: failed on contentid=%s, %s", src, ex)
# since post_process will not run, fake empty responses
processed_results = [[]] * len(qitem.query_id)
finally:
response_array_refs = []
response = []
for idx, query_id in enumerate(qitem.query_id):
response_array = array.array("B", np.array(
processed_results[idx], np.float32).tobytes())
response_array_refs.append(response_array)
bi = response_array.buffer_info()
response.append(lg.QuerySampleResponse(query_id, bi[0], bi[1]))
lg.QuerySamplesComplete(response)
def enqueue(self, query_samples):
idx = [q.index for q in query_samples]
query_id = [q.id for q in query_samples]
if len(query_samples) < self.max_batchsize:
data, label = self.ds.get_samples(idx)
self.run_one_item(Item(query_id, idx, data, label))
else:
bs = self.max_batchsize
for i in range(0, len(idx), bs):
data, label = self.ds.get_samples(idx[i: i + bs])
self.run_one_item(
Item(query_id[i: i + bs], idx[i: i + bs], data, label)
)
def finish(self):
pass
class QueueRunner(RunnerBase):
def __init__(self, model, ds, threads, post_proc=None, max_batchsize=128):
super().__init__(model, ds, threads, post_proc, max_batchsize)
self.tasks = Queue(maxsize=threads * 4)
self.workers = []
self.result_dict = {}
for _ in range(self.threads):
worker = threading.Thread(
target=self.handle_tasks, args=(
self.tasks,))
worker.daemon = True
self.workers.append(worker)
worker.start()
def handle_tasks(self, tasks_queue):
"""Worker thread."""
while True:
qitem = tasks_queue.get()
if qitem is None:
                # None in the queue indicates the parent wants us to exit
tasks_queue.task_done()
break
self.run_one_item(qitem)
tasks_queue.task_done()
def enqueue(self, query_samples):
idx = [q.index for q in query_samples]
query_id = [q.id for q in query_samples]
if len(query_samples) < self.max_batchsize:
data, label = self.ds.get_samples(idx)
self.tasks.put(Item(query_id, idx, data, label))
else:
bs = self.max_batchsize
for i in range(0, len(idx), bs):
ie = i + bs
data, label = self.ds.get_samples(idx[i:ie])
self.tasks.put(Item(query_id[i:ie], idx[i:ie], data, label))
def finish(self):
# exit all threads
for _ in self.workers:
self.tasks.put(None)
for worker in self.workers:
worker.join()
def main():
args = get_args()
log.info(args)
# find backend
backend = get_backend(
# TODO: pass model, inference and backend arguments
args.backend,
lidar_detector_path=args.lidar_path,
segmentor_path=args.segmentor_path,
data_path=args.dataset_path
)
if args.dtype == "fp16":
dtype = torch.float16
elif args.dtype == "bf16":
dtype = torch.bfloat16
else:
dtype = torch.float32
# --count applies to accuracy mode only and can be used to limit the number of images
# for testing.
count_override = False
count = args.count
if count:
count_override = True
# load model to backend
model = backend.load()
# dataset to use
dataset_class, pre_proc, post_proc, kwargs = SUPPORTED_DATASETS[args.dataset]
ds = dataset_class(
data_root=args.dataset_path,
split='val',
painted=True,
cam_sync=False)
final_results = {
"runtime": model.name(),
"version": model.version(),
"time": int(time.time()),
"args": vars(args),
"cmdline": str(args),
}
mlperf_conf = os.path.abspath(args.mlperf_conf)
if not os.path.exists(mlperf_conf):
log.error("{} not found".format(mlperf_conf))
sys.exit(1)
user_conf = os.path.abspath(args.user_conf)
if not os.path.exists(user_conf):
log.error("{} not found".format(user_conf))
sys.exit(1)
audit_config = os.path.abspath(args.audit_conf)
if args.output:
output_dir = os.path.abspath(args.output)
os.makedirs(output_dir, exist_ok=True)
os.chdir(output_dir)
#
# make one pass over the dataset to validate accuracy
#
count = ds.get_item_count()
# warmup
# TODO: Load warmup samples, the following code is a general
# way of doing this, but might need some fixing
ds.load_query_samples([0])
    for _ in range(5):
        sample = ds.get_samples([0])
        _ = backend.predict(sample[0])
scenario = SCENARIO_MAP[args.scenario]
runner_map = {
lg.TestScenario.SingleStream: RunnerBase,
lg.TestScenario.MultiStream: QueueRunner,
lg.TestScenario.Server: QueueRunner,
lg.TestScenario.Offline: QueueRunner,
}
runner = runner_map[scenario](
model, ds, args.threads, post_proc=post_proc, max_batchsize=args.max_batchsize
)
def issue_queries(query_samples):
runner.enqueue(query_samples)
def flush_queries():
pass
log_output_settings = lg.LogOutputSettings()
log_output_settings.outdir = output_dir
log_output_settings.copy_summary_to_stdout = False
log_settings = lg.LogSettings()
log_settings.enable_trace = args.debug
log_settings.log_output = log_output_settings
settings = lg.TestSettings()
settings.FromConfig(user_conf, args.model_name, args.scenario)
settings.scenario = scenario
settings.mode = lg.TestMode.PerformanceOnly
if args.accuracy:
settings.mode = lg.TestMode.AccuracyOnly
if args.find_peak_performance:
settings.mode = lg.TestMode.FindPeakPerformance
if args.time:
# override the time we want to run
settings.min_duration_ms = args.time * MILLI_SEC
settings.max_duration_ms = args.time * MILLI_SEC
if args.qps:
qps = float(args.qps)
settings.server_target_qps = qps
settings.offline_expected_qps = qps
if count_override:
settings.min_query_count = count
settings.max_query_count = count
if args.samples_per_query:
settings.multi_stream_samples_per_query = args.samples_per_query
if args.max_latency:
settings.server_target_latency_ns = int(args.max_latency * NANO_SEC)
settings.multi_stream_expected_latency_ns = int(
args.max_latency * NANO_SEC)
performance_sample_count = (
args.performance_sample_count
if args.performance_sample_count
else min(count, 500)
)
sut = lg.ConstructSUT(issue_queries, flush_queries)
qsl = lg.ConstructQSL(
count, performance_sample_count, ds.load_query_samples, ds.unload_query_samples
)
log.info("starting {}".format(scenario))
result_dict = {"scenario": str(scenario)}
runner.start_run(result_dict, args.accuracy)
lg.StartTestWithLogSettings(sut, qsl, settings, log_settings, audit_config)
if args.accuracy:
post_proc.finalize(result_dict, ds)
final_results["accuracy_results"] = result_dict
runner.finish()
lg.DestroyQSL(qsl)
lg.DestroySUT(sut)
#
# write final results
#
if args.output:
with open("results.json", "w") as f:
json.dump(final_results, f, sort_keys=True, indent=4)
if __name__ == "__main__":
main()
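
# Example invocation (illustrative; assumes this file is saved as main.py and
# all paths are hypothetical placeholders):
#   python main.py --dataset waymo --dataset-path /data/waymo_processed \
#       --backend pytorch --model-name pointpainting \
#       --segmentor-path /models/deeplabv3plus.pth --lidar-path /models/pointpillars.pth \
#       --scenario Offline --accuracy --output output
# Passing "--backend onnx" selects BackendOnnx instead, in which case the two
# model paths should point at the exported ONNX files.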
from .anchors import Anchors, anchors2bboxes, bboxes2deltas
from .pointpillars import PointPillars
import torch
import math
from tools.process import limit_period, iou2d_nearest
class Anchors():
def __init__(self, ranges, sizes, rotations):
assert len(ranges) == len(sizes)
self.ranges = ranges
self.sizes = sizes
self.rotations = rotations
def get_anchors(self, feature_map_size, anchor_range,
anchor_size, rotations):
'''
feature_map_size: (y_l, x_l)
anchor_range: [x1, y1, z1, x2, y2, z2]
anchor_size: [w, l, h]
rotations: [0, 1.57]
return: shape=(y_l, x_l, 2, 7)
'''
device = feature_map_size.device
x_centers = torch.linspace(
anchor_range[0],
anchor_range[3],
feature_map_size[1] + 1,
device=device)
y_centers = torch.linspace(
anchor_range[1],
anchor_range[4],
feature_map_size[0] + 1,
device=device)
z_centers = torch.linspace(
anchor_range[2],
anchor_range[5],
1 + 1,
device=device)
x_shift = (x_centers[1] - x_centers[0]) / 2
y_shift = (y_centers[1] - y_centers[0]) / 2
z_shift = (z_centers[1] - z_centers[0]) / 2
x_centers = x_centers[:feature_map_size[1]] + \
x_shift # (feature_map_size[1], )
y_centers = y_centers[:feature_map_size[0]] + \
y_shift # (feature_map_size[0], )
z_centers = z_centers[:1] + z_shift # (1, )
# [feature_map_size[1], feature_map_size[0], 1, 2] * 4
meshgrids = torch.meshgrid(x_centers, y_centers, z_centers, rotations)
meshgrids = list(meshgrids)
for i in range(len(meshgrids)):
# [feature_map_size[1], feature_map_size[0], 1, 2, 1]
meshgrids[i] = meshgrids[i][..., None]
anchor_size = anchor_size[None, None, None, None, :]
repeat_shape = [
feature_map_size[1],
feature_map_size[0],
1,
len(rotations),
1]
# [feature_map_size[1], feature_map_size[0], 1, 2, 3]
anchor_size = anchor_size.repeat(repeat_shape)
meshgrids.insert(3, anchor_size)
# [1, feature_map_size[0], feature_map_size[1], 2, 7]
        anchors = torch.cat(meshgrids, dim=-1).permute(2, 1, 0, 3, 4).contiguous()
        return anchors.squeeze(0)
def get_multi_anchors(self, feature_map_size):
'''
feature_map_size: (y_l, x_l)
ranges: [[x1, y1, z1, x2, y2, z2], [x1, y1, z1, x2, y2, z2], [x1, y1, z1, x2, y2, z2]]
sizes: [[w, l, h], [w, l, h], [w, l, h]]
rotations: [0, 1.57]
return: shape=(y_l, x_l, 3, 2, 7)
'''
device = feature_map_size.device
ranges = torch.tensor(self.ranges, device=device)
sizes = torch.tensor(self.sizes, device=device)
rotations = torch.tensor(self.rotations, device=device)
multi_anchors = []
for i in range(len(ranges)):
anchors = self.get_anchors(feature_map_size=feature_map_size,
anchor_range=ranges[i],
anchor_size=sizes[i],
rotations=rotations)
multi_anchors.append(anchors[:, :, None, :, :])
multi_anchors = torch.cat(multi_anchors, dim=2)
return multi_anchors
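
# Shape-check sketch (not used by the benchmark): with the three anchor
# configurations from PointPillars.__init__ and an arbitrary (y_l, x_l) = (4, 8)
# feature map, get_multi_anchors should return a (4, 8, 3, 2, 7) tensor, as
# described in the docstring above.
def _anchor_shape_check():
    gen = Anchors(
        ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345],
                [-74.88, -74.88, 0, 74.88, 74.88, 0],
                [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188]],
        sizes=[[0.84, 0.91, 1.74], [0.84, 1.81, 1.77], [2.08, 4.73, 1.77]],
        rotations=[0, 1.57])
    anchors = gen.get_multi_anchors(torch.tensor([4, 8]))
    assert anchors.shape == (4, 8, 3, 2, 7)
    return anchors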
def anchors2bboxes(anchors, deltas):
'''
anchors: (M, 7), (x, y, z, w, l, h, theta)
deltas: (M, 7)
return: (M, 7)
'''
da = torch.sqrt(anchors[:, 3] ** 2 + anchors[:, 4] ** 2)
x = deltas[:, 0] * da + anchors[:, 0]
y = deltas[:, 1] * da + anchors[:, 1]
z = deltas[:, 2] * anchors[:, 5] + anchors[:, 2] + anchors[:, 5] / 2
w = anchors[:, 3] * torch.exp(deltas[:, 3])
l = anchors[:, 4] * torch.exp(deltas[:, 4])
h = anchors[:, 5] * torch.exp(deltas[:, 5])
z = z - h / 2
theta = anchors[:, 6] + deltas[:, 6]
bboxes = torch.stack([x, y, z, w, l, h, theta], dim=1)
return bboxes
def bboxes2deltas(bboxes, anchors):
'''
bboxes: (M, 7), (x, y, z, w, l, h, theta)
anchors: (M, 7)
return: (M, 7)
'''
da = torch.sqrt(anchors[:, 3] ** 2 + anchors[:, 4] ** 2)
dx = (bboxes[:, 0] - anchors[:, 0]) / da
dy = (bboxes[:, 1] - anchors[:, 1]) / da
zb = bboxes[:, 2] + bboxes[:, 5] / 2 # bottom center
za = anchors[:, 2] + anchors[:, 5] / 2 # bottom center
dz = (zb - za) / anchors[:, 5] # bottom center
dw = torch.log(bboxes[:, 3] / anchors[:, 3])
dl = torch.log(bboxes[:, 4] / anchors[:, 4])
dh = torch.log(bboxes[:, 5] / anchors[:, 5])
dtheta = bboxes[:, 6] - anchors[:, 6]
deltas = torch.stack([dx, dy, dz, dw, dl, dh, dtheta], dim=1)
return deltas
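
# Consistency sketch (not called anywhere in the benchmark): bboxes2deltas and
# anchors2bboxes are inverse transforms, so encoding random boxes against random
# anchors and decoding again should reproduce the original boxes.
def _delta_roundtrip_check(n=8):
    torch.manual_seed(0)
    anchors = torch.rand(n, 7) + 0.5   # keep w, l, h strictly positive
    bboxes = torch.rand(n, 7) + 0.5
    deltas = bboxes2deltas(bboxes, anchors)
    recovered = anchors2bboxes(anchors, deltas)
    assert torch.allclose(recovered, bboxes, atol=1e-5)
    return deltas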
def anchor_target(batched_anchors, batched_gt_bboxes,
batched_gt_labels, assigners, nclasses):
'''
batched_anchors: [(y_l, x_l, 3, 2, 7), (y_l, x_l, 3, 2, 7), ... ]
batched_gt_bboxes: [(n1, 7), (n2, 7), ...]
batched_gt_labels: [(n1, ), (n2, ), ...]
return:
dict = {batched_anchors_labels: (bs, n_anchors),
batched_labels_weights: (bs, n_anchors),
batched_anchors_reg: (bs, n_anchors, 7),
batched_reg_weights: (bs, n_anchors),
batched_anchors_dir: (bs, n_anchors),
batched_dir_weights: (bs, n_anchors)}
'''
assert len(batched_anchors) == len(
batched_gt_bboxes) == len(batched_gt_labels)
batch_size = len(batched_anchors)
n_assigners = len(assigners)
batched_labels, batched_label_weights = [], []
batched_bbox_reg, batched_bbox_reg_weights = [], []
batched_dir_labels, batched_dir_labels_weights = [], []
for i in range(batch_size):
anchors = batched_anchors[i]
gt_bboxes, gt_labels = batched_gt_bboxes[i], batched_gt_labels[i]
        # what do we want to compute next?
        # 1. identify positive and negative anchors -> cls
        # 2. compute the regression targets -> reg
        # 3. identify the direction targets -> dir_cls
multi_labels, multi_label_weights = [], []
multi_bbox_reg, multi_bbox_reg_weights = [], []
multi_dir_labels, multi_dir_labels_weights = [], []
d1, d2, d3, d4, d5 = anchors.size()
for j in range(n_assigners): # multi anchors
assigner = assigners[j]
pos_iou_thr, neg_iou_thr, min_iou_thr = \
assigner['pos_iou_thr'], assigner['neg_iou_thr'], assigner['min_iou_thr']
cur_anchors = anchors[:, :, j, :, :].reshape(-1, 7)
overlaps = iou2d_nearest(gt_bboxes, cur_anchors)
if overlaps.shape[0] == 0:
max_overlaps = torch.zeros_like(
cur_anchors[:, 0], dtype=cur_anchors.dtype)
max_overlaps_idx = torch.zeros_like(
cur_anchors[:, 0], dtype=torch.long)
else:
max_overlaps, max_overlaps_idx = torch.max(overlaps, dim=0)
gt_max_overlaps, _ = torch.max(overlaps, dim=1)
            assigned_gt_inds = -torch.ones_like(cur_anchors[:, 0], dtype=torch.long)
# a. negative anchors
assigned_gt_inds[max_overlaps < neg_iou_thr] = 0
# b. positive anchors
# rule 1
assigned_gt_inds[max_overlaps >=
pos_iou_thr] = max_overlaps_idx[max_overlaps >= pos_iou_thr] + 1
# rule 2
            # a gt bbox may be matched to multiple anchors, but only to the
            # anchors that share its highest iou.
            # rule 2 may modify the labels generated by rule 1
            # (use k here to avoid shadowing the batch index i of the outer loop)
            for k in range(len(gt_bboxes)):
                if gt_max_overlaps[k] >= min_iou_thr:
                    assigned_gt_inds[overlaps[k] == gt_max_overlaps[k]] = k + 1
pos_flag = assigned_gt_inds > 0
neg_flag = assigned_gt_inds == 0
# 1. anchor labels
            # use nclasses (rather than -1) as the background label, since some gt bboxes already carry label -1
assigned_gt_labels = torch.zeros_like(
cur_anchors[:, 0], dtype=torch.long) + nclasses
assigned_gt_labels[pos_flag] = gt_labels[assigned_gt_inds[pos_flag] - 1].long()
assigned_gt_labels_weights = torch.zeros_like(cur_anchors[:, 0])
assigned_gt_labels_weights[pos_flag] = 1
assigned_gt_labels_weights[neg_flag] = 1
# 2. anchor regression
assigned_gt_reg_weights = torch.zeros_like(cur_anchors[:, 0])
assigned_gt_reg_weights[pos_flag] = 1
assigned_gt_reg = torch.zeros_like(cur_anchors)
positive_anchors = cur_anchors[pos_flag]
corr_gt_bboxes = gt_bboxes[assigned_gt_inds[pos_flag] - 1]
assigned_gt_reg[pos_flag] = bboxes2deltas(
corr_gt_bboxes, positive_anchors)
# 3. anchor direction
assigned_gt_dir_weights = torch.zeros_like(cur_anchors[:, 0])
assigned_gt_dir_weights[pos_flag] = 1
assigned_gt_dir = torch.zeros_like(
cur_anchors[:, 0], dtype=torch.long)
dir_cls_targets = limit_period(
corr_gt_bboxes[:, 6].cpu(), 0, 2 * math.pi).to(corr_gt_bboxes)
dir_cls_targets = torch.floor(dir_cls_targets / math.pi).long()
assigned_gt_dir[pos_flag] = torch.clamp(
dir_cls_targets, min=0, max=1)
multi_labels.append(assigned_gt_labels.reshape(d1, d2, 1, d4))
multi_label_weights.append(
assigned_gt_labels_weights.reshape(
d1, d2, 1, d4))
multi_bbox_reg.append(assigned_gt_reg.reshape(d1, d2, 1, d4, -1))
multi_bbox_reg_weights.append(
assigned_gt_reg_weights.reshape(
d1, d2, 1, d4))
multi_dir_labels.append(assigned_gt_dir.reshape(d1, d2, 1, d4))
multi_dir_labels_weights.append(
assigned_gt_dir_weights.reshape(
d1, d2, 1, d4))
multi_labels = torch.cat(multi_labels, dim=-2).reshape(-1)
multi_label_weights = torch.cat(
multi_label_weights, dim=-2).reshape(-1)
multi_bbox_reg = torch.cat(multi_bbox_reg, dim=-3).reshape(-1, d5)
multi_bbox_reg_weights = torch.cat(
multi_bbox_reg_weights, dim=-2).reshape(-1)
multi_dir_labels = torch.cat(multi_dir_labels, dim=-2).reshape(-1)
multi_dir_labels_weights = torch.cat(
multi_dir_labels_weights, dim=-2).reshape(-1)
batched_labels.append(multi_labels)
batched_label_weights.append(multi_label_weights)
batched_bbox_reg.append(multi_bbox_reg)
batched_bbox_reg_weights.append(multi_bbox_reg_weights)
batched_dir_labels.append(multi_dir_labels)
batched_dir_labels_weights.append(multi_dir_labels_weights)
rt_dict = dict(
batched_labels=torch.stack(
batched_labels, 0), # (bs, y_l * x_l * 3 * 2)
batched_label_weights=torch.stack(
batched_label_weights, 0), # (bs, y_l * x_l * 3 * 2)
batched_bbox_reg=torch.stack(
batched_bbox_reg, 0), # (bs, y_l * x_l * 3 * 2, 7)
batched_bbox_reg_weights=torch.stack(
batched_bbox_reg_weights, 0), # (bs, y_l * x_l * 3 * 2)
batched_dir_labels=torch.stack(
batched_dir_labels, 0), # (bs, y_l * x_l * 3 * 2)
batched_dir_labels_weights=torch.stack(
batched_dir_labels_weights, 0) # (bs, y_l * x_l * 3 * 2)
)
return rt_dict
import onnxruntime as ort
import argparse
import model.segmentation as network
import os
import numpy as np
import torch
from torchvision import transforms
from PIL import Image
import copy
import sys
from tqdm import tqdm
sys.path.append('..')
def get_calib_from_file(calib_file):
"""Read in a calibration file and parse into a dictionary."""
data = {}
with open(calib_file, 'r') as f:
lines = [line for line in f.readlines() if line.strip()]
for line in lines:
key, value = line.split(':', 1)
# The only non-float values in these files are dates, which
# we don't care about anyway
try:
if key == 'R0_rect':
data['R0'] = torch.tensor([float(x)
for x in value.split()]).reshape(3, 3)
else:
data[key] = torch.tensor([float(x)
for x in value.split()]).reshape(3, 4)
except ValueError:
pass
return data
def to_numpy(tensor):
return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
class Painter:
def __init__(self, args, onnx=False):
self.root_split_path = args.training_path
self.save_path = os.path.join(args.training_path, "painted_lidar/")
self.onnx = onnx
if not os.path.exists(self.save_path):
os.mkdir(self.save_path)
self.seg_net_index = 0
self.model = None
print(f'Using Segmentation Network -- deeplabv3plus')
checkpoint_file = args.model_path
if self.onnx:
model = ort.InferenceSession(checkpoint_file)
self.input_image_name = model.get_inputs()[0].name
else:
model = network.modeling.__dict__['deeplabv3plus_resnet50'](
num_classes=19, output_stride=16)
checkpoint = torch.load(checkpoint_file)
model.load_state_dict(checkpoint["model_state"])
model.eval()
device = torch.device(
'cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
self.model = model
self.cam_sync = args.cam_sync
def get_lidar(self, idx):
lidar_file = os.path.join(
self.root_split_path, 'velodyne/' + ('%s.bin' % idx))
return torch.from_numpy(np.fromfile(
str(lidar_file), dtype=np.float32).reshape(-1, 6))
def get_image(self, idx, camera):
filename = os.path.join(self.root_split_path,
camera + ('%s.jpg' % idx))
input_image = Image.open(filename)
preprocess = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize(
mean=[
0.485, 0.456, 0.406], std=[
0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
        # create a mini-batch as expected by the model
        input_batch = input_tensor.unsqueeze(0)
        # move the input to GPU for speed if available
        if torch.cuda.is_available():
            input_batch = input_batch.to('cuda')
        return input_batch
def get_model_output(self, input_batch):
with torch.no_grad():
output = self.model(input_batch)[0]
return output
def get_score(self, model_output):
sf = torch.nn.Softmax(dim=2)
output_permute = model_output.permute(1, 2, 0)
output_permute = sf(output_permute)
output_reassign = torch.zeros(
output_permute.size(0), output_permute.size(1), 6).to(
device=model_output.device)
output_reassign[:, :, 0] = torch.sum(
output_permute[:, :, :11], dim=2) # background
output_reassign[:, :, 1] = output_permute[:, :, 18] # bicycle
output_reassign[:, :, 2] = torch.sum(
output_permute[:, :, [13, 14, 15, 16]], dim=2) # vehicles
output_reassign[:, :, 3] = output_permute[:, :, 11] # person
output_reassign[:, :, 4] = output_permute[:, :, 12] # rider
output_reassign[:, :, 5] = output_permute[:, :, 17] # motorcycle
return output_reassign
def get_calib_fromfile(self, idx, device):
calib_file = os.path.join(
self.root_split_path, 'calib/' + ('%s.txt' % idx))
calib = get_calib_from_file(calib_file)
calib['P0'] = torch.cat([calib['P0'], torch.tensor(
[[0., 0., 0., 1.]])], axis=0).to(device=device)
calib['P1'] = torch.cat([calib['P1'], torch.tensor(
[[0., 0., 0., 1.]])], axis=0).to(device=device)
calib['P2'] = torch.cat([calib['P2'], torch.tensor(
[[0., 0., 0., 1.]])], axis=0).to(device=device)
calib['P3'] = torch.cat([calib['P3'], torch.tensor(
[[0., 0., 0., 1.]])], axis=0).to(device=device)
calib['P4'] = torch.cat([calib['P4'], torch.tensor(
[[0., 0., 0., 1.]])], axis=0).to(device=device)
calib['R0_rect'] = torch.zeros(
[4, 4], dtype=calib['R0'].dtype, device=device)
calib['R0_rect'][3, 3] = 1.
calib['R0_rect'][:3, :3] = calib['R0'].to(device=device)
calib['Tr_velo_to_cam_0'] = torch.cat([calib['Tr_velo_to_cam_0'], torch.tensor(
[[0., 0., 0., 1.]], )], axis=0).to(device=device)
calib['Tr_velo_to_cam_1'] = torch.cat([calib['Tr_velo_to_cam_1'], torch.tensor(
[[0., 0., 0., 1.]], )], axis=0).to(device=device)
calib['Tr_velo_to_cam_2'] = torch.cat([calib['Tr_velo_to_cam_2'], torch.tensor(
[[0., 0., 0., 1.]], )], axis=0).to(device=device)
calib['Tr_velo_to_cam_3'] = torch.cat([calib['Tr_velo_to_cam_3'], torch.tensor(
[[0., 0., 0., 1.]], )], axis=0).to(device=device)
calib['Tr_velo_to_cam_4'] = torch.cat([calib['Tr_velo_to_cam_4'], torch.tensor(
[[0., 0., 0., 1.]], )], axis=0).to(device=device)
return calib
def cam_to_lidar(self, pointcloud, projection_mats, camera_num):
"""
Takes in lidar in velo coords, returns lidar points in camera coords
:param pointcloud: (n_points, 4) np.array (x,y,z,r) in velodyne coordinates
:return lidar_cam_coords: (n_points, 4) np.array (x,y,z,r) in camera coordinates
"""
lidar_velo_coords = copy.deepcopy(pointcloud)
# copy reflectances column
reflectances = copy.deepcopy(lidar_velo_coords[:, -1])
lidar_velo_coords[:, -1] = 1 # for multiplying with homogeneous matrix
lidar_cam_coords = projection_mats['Tr_velo_to_cam_' +
str(camera_num)].matmul(lidar_velo_coords.transpose(0, 1))
lidar_cam_coords = lidar_cam_coords.transpose(0, 1)
lidar_cam_coords[:, -1] = reflectances
return lidar_cam_coords
def project_points_mask(self, lidar_cam_points,
projection_mats, class_scores, camera_num):
points_projected_on_mask = projection_mats['P' + str(camera_num)].matmul(
projection_mats['R0_rect'].matmul(lidar_cam_points.transpose(0, 1)))
points_projected_on_mask = points_projected_on_mask.transpose(0, 1)
points_projected_on_mask = points_projected_on_mask / \
(points_projected_on_mask[:, 2].reshape(-1, 1))
true_where_x_on_img = (0 < points_projected_on_mask[:, 0]) & (
points_projected_on_mask[:, 0] < class_scores[camera_num].shape[1]) # x in img coords is cols of img
true_where_y_on_img = (0 < points_projected_on_mask[:, 1]) & (
points_projected_on_mask[:, 1] < class_scores[camera_num].shape[0])
true_where_point_on_img = true_where_x_on_img & true_where_y_on_img & (
lidar_cam_points[:, 2] > 0)
# filter out points that don't project to image
points_projected_on_mask = points_projected_on_mask[true_where_point_on_img]
# using floor so you don't end up indexing num_rows+1th row or col
points_projected_on_mask = torch.floor(points_projected_on_mask).int()
        # drop the homogeneous coordinate from every point, giving an
        # (N_pts, 2) int array
points_projected_on_mask = points_projected_on_mask[:, :2]
return (points_projected_on_mask, true_where_point_on_img)
def augment_lidar_class_scores_both(
self, class_scores, lidar_raw, projection_mats):
"""
        Projects lidar points onto the segmentation maps and appends, to each point, the class scores of the pixel it projects onto.
"""
# lidar_cam_coords = self.cam_to_lidar(lidar_raw, projection_mats)
################################
lidar_cam_coords = self.cam_to_lidar(
lidar_raw[:, :4], projection_mats, 0)
        lidar_cam_coords[:, -1] = 1  # homogeneous coords for projection
points_projected_on_mask_0, true_where_point_on_img_0 = self.project_points_mask(
lidar_cam_coords, projection_mats, class_scores, 0)
lidar_cam_coords = self.cam_to_lidar(
lidar_raw[:, :4], projection_mats, 1)
        lidar_cam_coords[:, -1] = 1  # homogeneous coords for projection
points_projected_on_mask_1, true_where_point_on_img_1 = self.project_points_mask(
lidar_cam_coords, projection_mats, class_scores, 1)
lidar_cam_coords = self.cam_to_lidar(
lidar_raw[:, :4], projection_mats, 2)
lidar_cam_coords[:, -1] = 1
points_projected_on_mask_2, true_where_point_on_img_2 = self.project_points_mask(
lidar_cam_coords, projection_mats, class_scores, 2)
lidar_cam_coords = self.cam_to_lidar(
lidar_raw[:, :4], projection_mats, 3)
lidar_cam_coords[:, -1] = 1
points_projected_on_mask_3, true_where_point_on_img_3 = self.project_points_mask(
lidar_cam_coords, projection_mats, class_scores, 3)
lidar_cam_coords = self.cam_to_lidar(
lidar_raw[:, :4], projection_mats, 4)
lidar_cam_coords[:, -1] = 1
points_projected_on_mask_4, true_where_point_on_img_4 = self.project_points_mask(
lidar_cam_coords, projection_mats, class_scores, 4)
true_where_point_on_both_0_1 = true_where_point_on_img_0 & true_where_point_on_img_1
true_where_point_on_both_0_2 = true_where_point_on_img_0 & true_where_point_on_img_2
true_where_point_on_both_1_3 = true_where_point_on_img_1 & true_where_point_on_img_3
true_where_point_on_both_2_4 = true_where_point_on_img_2 & true_where_point_on_img_4
true_where_point_on_img = true_where_point_on_img_1 | true_where_point_on_img_0 | true_where_point_on_img_2 | true_where_point_on_img_3 | true_where_point_on_img_4
point_scores_0 = class_scores[0][points_projected_on_mask_0[:, 1],
points_projected_on_mask_0[:, 0]].reshape(-1, class_scores[0].shape[2])
point_scores_1 = class_scores[1][points_projected_on_mask_1[:, 1],
points_projected_on_mask_1[:, 0]].reshape(-1, class_scores[1].shape[2])
point_scores_2 = class_scores[2][points_projected_on_mask_2[:, 1],
points_projected_on_mask_2[:, 0]].reshape(-1, class_scores[2].shape[2])
point_scores_3 = class_scores[3][points_projected_on_mask_3[:, 1],
points_projected_on_mask_3[:, 0]].reshape(-1, class_scores[3].shape[2])
point_scores_4 = class_scores[4][points_projected_on_mask_4[:, 1],
points_projected_on_mask_4[:, 0]].reshape(-1, class_scores[4].shape[2])
augmented_lidar = torch.cat((lidar_raw[:, :5], torch.zeros(
(lidar_raw.shape[0], class_scores[1].shape[2])).to(device=lidar_raw.device)), axis=1)
        augmented_lidar[true_where_point_on_img_0, -class_scores[0].shape[2]:] += point_scores_0
        augmented_lidar[true_where_point_on_img_1, -class_scores[1].shape[2]:] += point_scores_1
        augmented_lidar[true_where_point_on_img_2, -class_scores[2].shape[2]:] += point_scores_2
        augmented_lidar[true_where_point_on_img_3, -class_scores[3].shape[2]:] += point_scores_3
        augmented_lidar[true_where_point_on_img_4, -class_scores[4].shape[2]:] += point_scores_4
        # points visible in two overlapping cameras received two score contributions
        # above; halve them so the pair of contributions is averaged
        augmented_lidar[true_where_point_on_both_0_1, -class_scores[0].shape[2]:] = \
            0.5 * augmented_lidar[true_where_point_on_both_0_1, -class_scores[0].shape[2]:]
        augmented_lidar[true_where_point_on_both_0_2, -class_scores[0].shape[2]:] = \
            0.5 * augmented_lidar[true_where_point_on_both_0_2, -class_scores[0].shape[2]:]
        augmented_lidar[true_where_point_on_both_1_3, -class_scores[1].shape[2]:] = \
            0.5 * augmented_lidar[true_where_point_on_both_1_3, -class_scores[1].shape[2]:]
        augmented_lidar[true_where_point_on_both_2_4, -class_scores[2].shape[2]:] = \
            0.5 * augmented_lidar[true_where_point_on_both_2_4, -class_scores[2].shape[2]:]
if self.cam_sync:
augmented_lidar = augmented_lidar[true_where_point_on_img]
return augmented_lidar
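
# Note on the painted output: augment_lidar_class_scores_both returns an (N, 11)
# tensor, where the first five columns are the raw lidar features kept from
# lidar_raw[:, :5] and the last six are the per-point class scores produced by
# get_score (background, bicycle, vehicle, person, rider, motorcycle). Points
# visible in two overlapping cameras have their accumulated scores halved so the
# two camera contributions are averaged, and with cam_sync enabled only points
# that project into at least one camera are kept.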
import torch
import torch.nn as nn
import torch.nn.functional as F
from model.anchors import Anchors, anchor_target, anchors2bboxes
from ops import Voxelization
import open3d.ml.torch as ml3d
from tools.process import limit_period
import math
class PillarLayer(nn.Module):
def __init__(self, voxel_size, point_cloud_range,
max_num_points, max_voxels):
super().__init__()
self.voxel_layer = Voxelization(voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
max_num_points=max_num_points,
max_voxels=max_voxels)
@torch.no_grad()
def forward(self, batched_pts):
'''
batched_pts: list[tensor], len(batched_pts) = bs
return:
pillars: (p1 + p2 + ... + pb, num_points, c),
coors_batch: (p1 + p2 + ... + pb, 1 + 3),
num_points_per_pillar: (p1 + p2 + ... + pb, ), (b: batch size)
'''
pillars, coors, npoints_per_pillar = [], [], []
for i, pts in enumerate(batched_pts):
voxels_out, coors_out, num_points_per_voxel_out = self.voxel_layer(
pts)
# voxels_out: (max_voxel, num_points, c), coors_out: (max_voxel, 3)
# num_points_per_voxel_out: (max_voxel, )
pillars.append(voxels_out)
coors.append(coors_out.long())
npoints_per_pillar.append(num_points_per_voxel_out)
# (p1 + p2 + ... + pb, num_points, c)
pillars = torch.cat(pillars, dim=0)
npoints_per_pillar = torch.cat(
npoints_per_pillar,
dim=0) # (p1 + p2 + ... + pb, )
coors_batch = []
for i, cur_coors in enumerate(coors):
coors_batch.append(F.pad(cur_coors, (1, 0), value=i))
# (p1 + p2 + ... + pb, 1 + 3)
coors_batch = torch.cat(coors_batch, dim=0)
return pillars, coors_batch, npoints_per_pillar
class PillarEncoder(nn.Module):
def __init__(self, voxel_size, point_cloud_range, in_channel, out_channel):
super().__init__()
self.out_channel = out_channel
self.vx, self.vy = voxel_size[0], voxel_size[1]
self.x_offset = voxel_size[0] / 2 + point_cloud_range[0]
self.y_offset = voxel_size[1] / 2 + point_cloud_range[1]
self.x_l = math.ceil(
(point_cloud_range[3] -
point_cloud_range[0]) /
voxel_size[0])
self.y_l = math.ceil(
(point_cloud_range[4] -
point_cloud_range[1]) /
voxel_size[1])
self.conv = nn.Conv1d(in_channel, out_channel, 1, bias=False)
self.bn = nn.BatchNorm1d(out_channel, eps=1e-3, momentum=0.01)
def forward(self, pillars, coors_batch, npoints_per_pillar):
'''
pillars: (p1 + p2 + ... + pb, num_points, c), c = 4
coors_batch: (p1 + p2 + ... + pb, 1 + 3)
npoints_per_pillar: (p1 + p2 + ... + pb, )
return: (bs, out_channel, y_l, x_l)
'''
device = pillars.device
# 1. calculate offset to the points center (in each pillar)
        # (p1 + p2 + ... + pb, num_points, 3)
        offset_pt_center = pillars[:, :, :3] - torch.sum(
            pillars[:, :, :3], dim=1, keepdim=True) / npoints_per_pillar[:, None, None]
# 2. calculate offset to the pillar center
# (p1 + p2 + ... + pb, num_points, 1)
x_offset_pi_center = pillars[:, :, :1] - \
(coors_batch[:, None, 1:2] * self.vx + self.x_offset)
# (p1 + p2 + ... + pb, num_points, 1)
y_offset_pi_center = pillars[:, :, 1:2] - \
(coors_batch[:, None, 2:3] * self.vy + self.y_offset)
# 3. encoder
features = torch.cat([pillars,
offset_pt_center,
x_offset_pi_center,
y_offset_pi_center],
dim=-1) # (p1 + p2 + ... + pb, num_points, 9)
features[:, :, 0:1] = x_offset_pi_center # tmp
features[:, :, 1:2] = y_offset_pi_center # tmp
        # Consistent with mmdet3d; the reasoning is discussed in
# https://github.com/open-mmlab/mmdetection3d/issues/1150
# 4. find mask for (0, 0, 0) and update the encoded features
# a very beautiful implementation
voxel_ids = torch.arange(
0, pillars.size(1)).to(device) # (num_points, )
# (num_points, p1 + p2 + ... + pb)
mask = voxel_ids[:, None] < npoints_per_pillar[None, :]
# (p1 + p2 + ... + pb, num_points)
mask = mask.permute(1, 0).contiguous()
features *= mask[:, :, None]
# 5. embedding
# (p1 + p2 + ... + pb, 9, num_points)
features = features.permute(0, 2, 1).contiguous()
# (p1 + p2 + ... + pb, out_channels, num_points)
features = F.relu(self.bn(self.conv(features)))
# (p1 + p2 + ... + pb, out_channels)
pooling_features = torch.max(features, dim=-1)[0]
# 6. pillar scatter
batched_canvas = []
bs = coors_batch[-1, 0] + 1
for i in range(bs):
cur_coors_idx = coors_batch[:, 0] == i
cur_coors = coors_batch[cur_coors_idx, :]
cur_features = pooling_features[cur_coors_idx]
canvas = torch.zeros(
(self.x_l,
self.y_l,
self.out_channel),
dtype=torch.float32,
device=device)
canvas[cur_coors[:, 1], cur_coors[:, 2]] = cur_features
canvas = canvas.permute(2, 1, 0).contiguous()
batched_canvas.append(canvas)
# (bs, in_channel, self.y_l, self.x_l)
batched_canvas = torch.stack(batched_canvas, dim=0)
return batched_canvas
class Backbone(nn.Module):
def __init__(self, in_channel, out_channels,
layer_nums, layer_strides=[2, 2, 2]):
super().__init__()
assert len(out_channels) == len(layer_nums)
assert len(out_channels) == len(layer_strides)
self.multi_blocks = nn.ModuleList()
for i in range(len(layer_strides)):
blocks = []
blocks.append(
nn.Conv2d(
in_channel,
out_channels[i],
3,
stride=layer_strides[i],
bias=False,
padding=1))
blocks.append(
nn.BatchNorm2d(
out_channels[i],
eps=1e-3,
momentum=0.01))
blocks.append(nn.ReLU(inplace=True))
for _ in range(layer_nums[i]):
blocks.append(
nn.Conv2d(
out_channels[i],
out_channels[i],
3,
bias=False,
padding=1))
blocks.append(
nn.BatchNorm2d(
out_channels[i],
eps=1e-3,
momentum=0.01))
blocks.append(nn.ReLU(inplace=True))
in_channel = out_channels[i]
self.multi_blocks.append(nn.Sequential(*blocks))
        # consistent with mmdet3d
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
def forward(self, x):
'''
x: (b, c, y_l, x_l). Default: (6, 64, 496, 432)
return: list[]. Default: [(6, 64, 248, 216), (6, 128, 124, 108), (6, 256, 62, 54)]
'''
outs = []
for i in range(len(self.multi_blocks)):
x = self.multi_blocks[i](x)
outs.append(x)
return outs
class Neck(nn.Module):
def __init__(self, in_channels, upsample_strides, out_channels):
super().__init__()
assert len(in_channels) == len(upsample_strides)
assert len(upsample_strides) == len(out_channels)
self.decoder_blocks = nn.ModuleList()
for i in range(len(in_channels)):
decoder_block = []
decoder_block.append(nn.ConvTranspose2d(in_channels[i],
out_channels[i],
upsample_strides[i],
stride=upsample_strides[i],
bias=False))
decoder_block.append(
nn.BatchNorm2d(
out_channels[i],
eps=1e-3,
momentum=0.01))
decoder_block.append(nn.ReLU(inplace=True))
self.decoder_blocks.append(nn.Sequential(*decoder_block))
        # consistent with mmdet3d
for m in self.modules():
if isinstance(m, nn.ConvTranspose2d):
nn.init.kaiming_normal_(
m.weight, mode='fan_out', nonlinearity='relu')
def forward(self, x):
'''
x: [(bs, 64, 248, 216), (bs, 128, 124, 108), (bs, 256, 62, 54)]
return: (bs, 384, 248, 216)
'''
outs = []
for i in range(len(self.decoder_blocks)):
xi = self.decoder_blocks[i](x[i]) # (bs, 128, 248, 216)
outs.append(xi)
out = torch.cat(outs, dim=1)
return out
class Head(nn.Module):
def __init__(self, in_channel, n_anchors, n_classes):
super().__init__()
self.conv_cls = nn.Conv2d(in_channel, n_anchors * n_classes, 1)
self.conv_reg = nn.Conv2d(in_channel, n_anchors * 7, 1)
self.conv_dir_cls = nn.Conv2d(in_channel, n_anchors * 2, 1)
        # consistent with mmdet3d
conv_layer_id = 0
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.normal_(m.weight, mean=0, std=0.01)
if conv_layer_id == 0:
prior_prob = 0.01
bias_init = float(-math.log((1 - prior_prob) / prior_prob))
nn.init.constant_(m.bias, bias_init)
else:
nn.init.constant_(m.bias, 0)
conv_layer_id += 1
def forward(self, x):
'''
x: (bs, 384, 248, 216)
return:
bbox_cls_pred: (bs, n_anchors*3, 248, 216)
bbox_pred: (bs, n_anchors*7, 248, 216)
bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216)
'''
bbox_cls_pred = self.conv_cls(x)
bbox_pred = self.conv_reg(x)
bbox_dir_cls_pred = self.conv_dir_cls(x)
return bbox_cls_pred, bbox_pred, bbox_dir_cls_pred
class PointPillars(nn.Module):
def __init__(self,
nclasses=3,
voxel_size=[0.32, 0.32, 6],
point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
max_num_points=20,
max_voxels=(32000, 32000),
painted=False):
super().__init__()
self.nclasses = nclasses
self.pillar_layer = PillarLayer(voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
max_num_points=max_num_points,
max_voxels=max_voxels)
if painted:
pillar_channel = 16
else:
pillar_channel = 10
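        # pillar_channel is the per-point feature width fed to PillarEncoder:
        # painted points carry 11 features (5 lidar values + 6 class scores) and
        # the encoder appends 5 more (3 offsets to the pillar mean, 2 offsets to
        # the pillar center), giving 16; plain 5-feature points give 10.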
self.pillar_encoder = PillarEncoder(voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
in_channel=pillar_channel,
out_channel=64)
self.backbone = Backbone(in_channel=64,
out_channels=[64, 128, 256],
layer_nums=[3, 5, 5],
layer_strides=[1, 2, 2])
self.neck = Neck(in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128])
self.head = Head(
in_channel=384,
n_anchors=2 * nclasses,
n_classes=nclasses)
# anchors
ranges = [[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345],
[-74.88, -74.88, 0, 74.88, 74.88, 0],
[-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188]]
sizes = [[0.84, .91, 1.74], [.84, 1.81, 1.77], [2.08, 4.73, 1.77]]
rotations = [0, 1.57]
self.anchors_generator = Anchors(ranges=ranges,
sizes=sizes,
rotations=rotations)
# train
self.assigners = [
{'pos_iou_thr': 0.5, 'neg_iou_thr': 0.3, 'min_iou_thr': 0.3},
{'pos_iou_thr': 0.5, 'neg_iou_thr': 0.3, 'min_iou_thr': 0.3},
{'pos_iou_thr': 0.55, 'neg_iou_thr': 0.4, 'min_iou_thr': 0.4},
]
# val and test
self.nms_pre = 4096
self.nms_thr = 0.25
self.score_thr = 0.1
self.max_num = 500
def get_predicted_bboxes_single(
self, bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, anchors):
'''
bbox_cls_pred: (n_anchors*3, 248, 216)
bbox_pred: (n_anchors*7, 248, 216)
bbox_dir_cls_pred: (n_anchors*2, 248, 216)
anchors: (y_l, x_l, 3, 2, 7)
return:
bboxes: (k, 7)
labels: (k, )
scores: (k, )
'''
# 0. pre-process
bbox_cls_pred = bbox_cls_pred.permute(
1, 2, 0).reshape(-1, self.nclasses)
bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 7)
bbox_dir_cls_pred = bbox_dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
anchors = anchors.reshape(-1, 7)
bbox_cls_pred = torch.sigmoid(bbox_cls_pred)
bbox_dir_cls_pred = torch.max(bbox_dir_cls_pred, dim=1)[1]
# 1. obtain self.nms_pre bboxes based on scores
inds = bbox_cls_pred.max(1)[0].topk(self.nms_pre)[1]
bbox_cls_pred = bbox_cls_pred[inds]
bbox_pred = bbox_pred[inds]
bbox_dir_cls_pred = bbox_dir_cls_pred[inds]
anchors = anchors[inds]
# 2. decode predicted offsets to bboxes
bbox_pred = anchors2bboxes(anchors, bbox_pred)
# 3. nms
bbox_pred2d_xy = bbox_pred[:, [0, 1]]
bbox_pred2d_lw = bbox_pred[:, [3, 4]]
bbox_pred2d = torch.cat([bbox_pred2d_xy - bbox_pred2d_lw / 2,
bbox_pred2d_xy + bbox_pred2d_lw / 2,
bbox_pred[:, 6:]], dim=-1) # (n_anchors, 5)
ret_bboxes, ret_labels, ret_scores = [], [], []
for i in range(self.nclasses):
# 3.1 filter bboxes with scores below self.score_thr
cur_bbox_cls_pred = bbox_cls_pred[:, i]
score_inds = cur_bbox_cls_pred > self.score_thr
if score_inds.sum() == 0:
continue
cur_bbox_cls_pred = cur_bbox_cls_pred[score_inds]
cur_bbox_pred2d = bbox_pred2d[score_inds]
cur_bbox_pred = bbox_pred[score_inds]
cur_bbox_dir_cls_pred = bbox_dir_cls_pred[score_inds]
# 3.2 nms core
keep_inds = ml3d.ops.nms(
cur_bbox_pred2d.cpu(),
cur_bbox_cls_pred.cpu(),
self.nms_thr)
cur_bbox_cls_pred = cur_bbox_cls_pred[keep_inds]
cur_bbox_pred = cur_bbox_pred[keep_inds]
cur_bbox_dir_cls_pred = cur_bbox_dir_cls_pred[keep_inds]
            cur_bbox_pred[:, -1] = limit_period(
                cur_bbox_pred[:, -1].detach().cpu(), 1, math.pi).to(cur_bbox_pred)  # [-pi, 0]
cur_bbox_pred[:, -1] += (1 - cur_bbox_dir_cls_pred) * math.pi
ret_bboxes.append(cur_bbox_pred)
ret_labels.append(torch.zeros_like(
cur_bbox_pred[:, 0], dtype=torch.long) + i)
ret_scores.append(cur_bbox_cls_pred)
# 4. filter some bboxes if bboxes number is above self.max_num
if len(ret_bboxes) == 0:
return {
'lidar_bboxes': torch.empty((0, 7)).detach().cpu(),
'labels': torch.empty(0).detach().cpu(),
'scores': torch.empty(0).detach().cpu()
}
ret_bboxes = torch.cat(ret_bboxes, 0)
ret_labels = torch.cat(ret_labels, 0)
ret_scores = torch.cat(ret_scores, 0)
if ret_bboxes.size(0) > self.max_num:
final_inds = ret_scores.topk(self.max_num)[1]
ret_bboxes = ret_bboxes[final_inds]
ret_labels = ret_labels[final_inds]
ret_scores = ret_scores[final_inds]
result = {
'lidar_bboxes': ret_bboxes.detach().cpu(),
'labels': ret_labels.detach().cpu(),
'scores': ret_scores.detach().cpu()
}
return result
def get_predicted_bboxes(
self, bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, batched_anchors):
'''
bbox_cls_pred: (bs, n_anchors*3, 248, 216)
bbox_pred: (bs, n_anchors*7, 248, 216)
bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216)
batched_anchors: (bs, y_l, x_l, 3, 2, 7)
return:
bboxes: [(k1, 7), (k2, 7), ... ]
labels: [(k1, ), (k2, ), ... ]
scores: [(k1, ), (k2, ), ... ]
'''
results = []
bs = bbox_cls_pred.size(0)
for i in range(bs):
result = self.get_predicted_bboxes_single(bbox_cls_pred=bbox_cls_pred[i],
bbox_pred=bbox_pred[i],
bbox_dir_cls_pred=bbox_dir_cls_pred[i],
anchors=batched_anchors[i])
results.append(result)
return results
def forward(self, batched_pts, mode='test',
batched_gt_bboxes=None, batched_gt_labels=None):
batch_size = len(batched_pts)
# batched_pts: list[tensor] -> pillars: (p1 + p2 + ... + pb, num_points, c),
# coors_batch: (p1 + p2 + ... + pb, 1 + 3),
# num_points_per_pillar: (p1 + p2 + ... + pb, ), (b: batch size)
pillars, coors_batch, npoints_per_pillar = self.pillar_layer(
batched_pts)
# pillars: (p1 + p2 + ... + pb, num_points, c), c = 4
# coors_batch: (p1 + p2 + ... + pb, 1 + 3)
# npoints_per_pillar: (p1 + p2 + ... + pb, )
# -> pillar_features: (bs, out_channel, y_l, x_l)
pillar_features = self.pillar_encoder(
pillars, coors_batch, npoints_per_pillar)
# xs: [(bs, 64, 248, 216), (bs, 128, 124, 108), (bs, 256, 62, 54)]
xs = self.backbone(pillar_features)
# x: (bs, 384, 248, 216)
x = self.neck(xs)
# bbox_cls_pred: (bs, n_anchors*3, 248, 216)
# bbox_pred: (bs, n_anchors*7, 248, 216)
# bbox_dir_cls_pred: (bs, n_anchors*2, 248, 216)
bbox_cls_pred, bbox_pred, bbox_dir_cls_pred = self.head(x)
# anchors
device = bbox_cls_pred.device
feature_map_size = torch.tensor(
list(bbox_cls_pred.size()[-2:]), device=device)
anchors = self.anchors_generator.get_multi_anchors(feature_map_size)
batched_anchors = [anchors for _ in range(batch_size)]
if mode == 'train':
anchor_target_dict = anchor_target(batched_anchors=batched_anchors,
batched_gt_bboxes=batched_gt_bboxes,
batched_gt_labels=batched_gt_labels,
assigners=self.assigners,
nclasses=self.nclasses)
return bbox_cls_pred, bbox_pred, bbox_dir_cls_pred, anchor_target_dict
elif mode == 'val':
results = self.get_predicted_bboxes(bbox_cls_pred=bbox_cls_pred,
bbox_pred=bbox_pred,
bbox_dir_cls_pred=bbox_dir_cls_pred,
batched_anchors=batched_anchors)
return results
elif mode == 'test':
results = self.get_predicted_bboxes(bbox_cls_pred=bbox_cls_pred,
bbox_pred=bbox_pred,
bbox_dir_cls_pred=bbox_dir_cls_pred,
batched_anchors=batched_anchors)
return results
else:
            raise ValueError(f"unsupported mode: {mode}")
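
# Smoke-test sketch (illustrative only): runs the model with random weights on
# random painted points. It assumes the compiled Voxelization op from ops/ and
# open3d-ml are available for the device the points live on, exactly as required
# by the imports above.
def _pointpillars_smoke_test():
    model = PointPillars(nclasses=3, painted=True).eval()
    pts = torch.rand(2000, 11)                # painted points: 5 lidar values + 6 class scores
    pts[:, 0:2] = pts[:, 0:2] * 140.0 - 70.0  # spread x, y inside the point-cloud range
    pts[:, 2] = pts[:, 2] * 5.0 - 1.5         # keep z inside [-2, 4]
    with torch.inference_mode():
        results = model(batched_pts=[pts], mode='test')
    # one result dict per sample, each with 'lidar_bboxes', 'labels' and 'scores'
    return results[0]['lidar_bboxes'].shape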