add code

d2b71343 · 雍大凯 · 69e57885 · d2b71343 · d2b71343 · d2b71343
Commit d2b71343 authored Apr 08, 2026 by 雍大凯
20 changed files
--- a/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/__pycache__/nearest_assign.cpython-310.pyc
+++ b/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/__pycache__/nearest_assign.cpython-310.pyc
--- a/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/nearest_assign.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/nearest_assign.py
+# Copyright (c) Phigent Robotics. All rights reserved.
+
+import numpy as np
+import torch
+
+from . import nearest_assign_ext
+
+__all__ = ['nearest_assign']
+
+
+class QuickNearestAssignCuda(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx,
+                occ_pred, 
+                l2s_key,
+                occind2detind,
+                inst_cls,
+                inst_xyz,
+                inst_id_list,
+                ):
+
+        occ_pred = occ_pred.contiguous().int()
+        l2s_key = l2s_key.contiguous().int()
+        occind2detind = occind2detind.contiguous().int()
+        inst_cls = inst_cls.contiguous().int()
+        inst_xyz = inst_xyz.contiguous().int()
+        inst_id_list = inst_id_list.contiguous().int()
+        inst_pred = occ_pred.new_zeros(occ_pred.shape)
+
+        nearest_assign_ext.nearest_assign_forward(
+            occ_pred, 
+            l2s_key,
+            occind2detind,
+            inst_cls,
+            inst_xyz,
+            inst_id_list,
+            inst_pred
+        )
+
+        return inst_pred
+
+
+def nearest_assign(occ_pred, 
+                l2s_key,
+                occind2detind,
+                inst_cls,
+                inst_xyz,
+                inst_id_list):
+    inst_pred = QuickNearestAssignCuda.apply(occ_pred, 
+                l2s_key,
+                occind2detind,
+                inst_cls,
+                inst_xyz,
+                inst_id_list
+                )      # (B, Dz, Dy, Dx, C)
+    return inst_pred
+
+def test_bev_pool_v2():
+    """Smoke test that pools a tiny depth/feature pair and checks gradients.
+
+    NOTE(review): this function calls ``bev_pool_v2``, which is neither
+    defined nor imported in the visible part of this module, so calling it
+    here presumably raises ``NameError`` -- it looks like a leftover from
+    the bev_pool_v2 op; confirm and move or remove.
+    """
+    # Eight depth values reshaped to (1, 1, 2, 2, 2), tracked for gradients.
+    depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9])
+    depth = torch.from_numpy(depth).float().cuda()
+    depth = depth.view(1, 1, 2, 2, 2).requires_grad_()
+    # All-ones feature tensor of the same shape, also tracked for gradients.
+    feat = torch.ones(
+        size=[1, 1, 2, 2, 2], dtype=torch.float,
+        device='cuda').requires_grad_()
+    # Four index triples passed to the pooling op.
+    ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda()
+    ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda()
+    ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda()
+
+    # ``kept`` is True at position 0 and wherever ranks_bev differs from its
+    # predecessor, i.e. at the start of each run of equal values.
+    kept = torch.ones(
+        ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
+    kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
+    interval_starts = torch.where(kept)[0].int()
+    if len(interval_starts) == 0:
+        return None, None, None, None, None
+    # Length of each run: gap to the next start, or to the end for the last.
+    interval_lengths = torch.zeros_like(interval_starts)
+    interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
+    interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
+    bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
+                           (1, 1, 2, 2, 2), interval_starts, interval_lengths)
+    loss = torch.sum(bev_feat)
+    loss.backward()
+    # NOTE(review): exact equality between a float tensor and 4.4.
+    assert loss == 4.4
+    # Expected gradient with respect to ``depth``.
+    grad_depth = np.array([2., 2., 0., 0., 2., 0., 2., 0.])
+    grad_depth = torch.from_numpy(grad_depth).float()
+    grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2)
+    assert depth.grad.allclose(grad_depth)
+    # Expected gradient with respect to ``feat``.
+    grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0., 0.])
+    grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2)
+    assert feat.grad.allclose(grad_feat)
--- a/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/nearest_assign_ext.cpython-310-x86_64-linux-gnu.so
+++ b/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/nearest_assign_ext.cpython-310-x86_64-linux-gnu.so
--- a/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign.cpp
+++ b/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign.cpp
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+#include <torch/torch.h>
+#include <c10/cuda/CUDAGuard.h>
+
+// CUDA function declarations
+// Host-side launcher; this translation unit only declares it and calls it
+// from nearest_assign_forward below with raw int pointers.
+void nearest_assign(
+                    const int* l2s_key,
+                    int l2s_size,
+                    const int *__restrict__ occind2detind,
+                    int inst_size,
+                    const int *__restrict__ occ_pred,
+                    const int *__restrict__ inst_xyz,
+                    const int *__restrict__ inst_cls,
+                    const int *__restrict__ inst_id_list,
+                    int* __restrict__ inst_pred);
+
+// Python-facing entry point: validates the tensors, extracts raw int
+// pointers and hands them to the nearest_assign launcher.
+//
+// l2s_size is taken from dim 0 of _l2s_key and inst_size from dim 0 of
+// _inst_xyz. _inst_pred is written in place.
+void nearest_assign_forward(
+  const at::Tensor _occ_pred,    // (200, 200, 16)
+  const at::Tensor _l2s_key,     // (l2s_size, 1)
+  const at::Tensor _occind2detind, // (10, 1)
+  const at::Tensor _inst_cls,     // (inst_size, 1)
+  const at::Tensor _inst_xyz,     // (inst_size, 3)
+  const at::Tensor _inst_id_list, // (inst_size, 1)
+  at::Tensor _inst_pred           // (200, 200, 16)
+) {
+  // The launcher uses a fixed grid covering 200 * 200 * 16 voxels, so both
+  // voxel buffers must hold at least that many elements.
+  constexpr int64_t kNumVoxels = 200 * 200 * 16;
+
+  // Raw pointers are dereferenced on the device: reject host tensors and
+  // non-contiguous layouts up front instead of faulting inside the kernel.
+  TORCH_CHECK(_occ_pred.is_cuda() && _l2s_key.is_cuda() &&
+              _occind2detind.is_cuda() && _inst_cls.is_cuda() &&
+              _inst_xyz.is_cuda() && _inst_id_list.is_cuda() &&
+              _inst_pred.is_cuda(),
+              "nearest_assign_forward: all tensors must be CUDA tensors");
+  TORCH_CHECK(_occ_pred.is_contiguous() && _l2s_key.is_contiguous() &&
+              _occind2detind.is_contiguous() && _inst_cls.is_contiguous() &&
+              _inst_xyz.is_contiguous() && _inst_id_list.is_contiguous() &&
+              _inst_pred.is_contiguous(),
+              "nearest_assign_forward: all tensors must be contiguous");
+  TORCH_CHECK(_occ_pred.numel() >= kNumVoxels &&
+              _inst_pred.numel() >= kNumVoxels,
+              "nearest_assign_forward: occ_pred and inst_pred must hold at "
+              "least 200*200*16 elements");
+
+  const int l2s_size = _l2s_key.size(0);
+  const int inst_size = _inst_xyz.size(0);
+
+  // The kernel reads inst_size entries of inst_cls / inst_id_list and
+  // three coordinates per instance from inst_xyz.
+  TORCH_CHECK(_inst_xyz.numel() >= static_cast<int64_t>(3) * inst_size &&
+              _inst_cls.numel() >= inst_size &&
+              _inst_id_list.numel() >= inst_size,
+              "nearest_assign_forward: inst_cls, inst_xyz and inst_id_list "
+              "are inconsistent in size");
+
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(_occ_pred));
+  // data_ptr<int>() additionally rejects tensors whose dtype is not int32.
+  const int* occ_pred = _occ_pred.data_ptr<int>();
+  const int* inst_xyz = _inst_xyz.data_ptr<int>();
+  const int* inst_cls = _inst_cls.data_ptr<int>();
+  const int* l2s_key = _l2s_key.data_ptr<int>();
+  const int* inst_id_list = _inst_id_list.data_ptr<int>();
+  const int* occind2detind = _occind2detind.data_ptr<int>();
+
+  int* inst_pred = _inst_pred.data_ptr<int>();
+  nearest_assign(
+                 l2s_key,
+                 l2s_size,
+                 occind2detind,
+                 inst_size,
+                 occ_pred,
+                 inst_xyz,
+                 inst_cls,
+                 inst_id_list,
+                 inst_pred
+                 );
+}
+
+// Python bindings: exposes nearest_assign_forward under the same name.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("nearest_assign_forward", &nearest_assign_forward,
+        "nearest_assign_forward");
+}
--- a/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign_cuda.cu
+++ b/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign_cuda.cu
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// One thread per voxel of a fixed 200 x 200 x 16 grid.
+//
+// For a voxel whose label occurs in l2s_key, the thread scans all instances
+// whose class equals occind2detind[label] and writes the id of the one with
+// the smallest squared distance to the voxel (first one wins on ties). If no
+// instance of that class exists, inst_pred[idx] is left untouched. For any
+// other label the label itself is copied to inst_pred[idx].
+__global__ void nearest_assign_kernel(
+                                  const int* l2s_key,
+                                  int l2s_size,
+                                  const int* occind2detind,
+                                  const int *__restrict__ occ_pred,
+                                  const int *__restrict__ inst_xyz,
+                                  const int *__restrict__ inst_cls,
+                                  const int *__restrict__ inst_id_list,
+                                  int inst_size,
+                                  int* __restrict__ inst_pred) {
+  // Grid extents the index decoding below is written for:
+  // idx = (x * kDimY + y) * kDimZ + z.
+  const int kDimX = 200;
+  const int kDimY = 200;
+  const int kDimZ = 16;
+  const int kNumVoxels = kDimX * kDimY * kDimZ;
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  // Guard against launch configurations that overshoot the grid.
+  if (idx >= kNumVoxels) {
+    return;
+  }
+
+  const int occ_pred_label = occ_pred[idx];
+
+  // Does this label take part in instance assignment?
+  bool label_listed = false;
+  for (int index = 0; index < l2s_size; index++) {
+    if (occ_pred_label == l2s_key[index]) {
+      label_listed = true;
+      break;
+    }
+  }
+  if (!label_listed) {
+    inst_pred[idx] = occ_pred_label;
+    return;
+  }
+
+  // Decode the flat index into voxel coordinates.
+  const int x = idx / (kDimY * kDimZ);
+  const int y = (idx - x * kDimY * kDimZ) / kDimZ;
+  const int z = idx - x * kDimY * kDimZ - y * kDimZ;
+
+  int dist_min = 100000000;
+  int best_id = 0;
+  bool found = false;
+  for (int inst_ind = 0; inst_ind < inst_size; inst_ind++) {
+    if (inst_cls[inst_ind] != occind2detind[occ_pred_label]) {
+      continue;
+    }
+    const int dx = x - inst_xyz[inst_ind * 3 + 0];
+    const int dy = y - inst_xyz[inst_ind * 3 + 1];
+    const int dz = z - inst_xyz[inst_ind * 3 + 2];
+    const int dist = dx * dx + dy * dy + dz * dz;
+    if (dist < dist_min) {
+      dist_min = dist;
+      best_id = inst_id_list[inst_ind];
+      found = true;
+    }
+  }
+  // Single global write; skipped when no instance matched so the caller's
+  // initial value is preserved.
+  if (found) {
+    inst_pred[idx] = best_id;
+  }
+}
+
+// Host launcher: starts nearest_assign_kernel with 256 threads per block and
+// enough blocks to cover 200 * 200 * 16 voxels, on the default stream.
+void nearest_assign(
+              const int* l2s_key,
+              int l2s_size,
+              const int *__restrict__ occind2detind,
+              int inst_size,
+              const int *__restrict__ occ_pred,
+              const int *__restrict__ inst_xyz,
+              const int *__restrict__ inst_cls,
+              const int *__restrict__ inst_id_list,
+              int* __restrict__ inst_pred) {
+  const int threads_per_block = 256;
+  const int num_voxels = 200 * 200 * 16;
+  // Integer ceil-division: 640000 / 256 -> 2500 blocks.
+  const int num_blocks =
+      (num_voxels + threads_per_block - 1) / threads_per_block;
+
+  nearest_assign_kernel<<<num_blocks, threads_per_block>>>(
+    l2s_key, l2s_size, occind2detind,
+    occ_pred, inst_xyz, inst_cls,
+    inst_id_list, inst_size, inst_pred
+  );
+}
+
+
--- a/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign_cuda.hip
+++ b/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign_cuda.hip
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+#include "hip/hip_runtime.h"
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// One thread per voxel of a fixed 200 x 200 x 16 grid.
+//
+// For a voxel whose label occurs in l2s_key, the thread scans all instances
+// whose class equals occind2detind[label] and writes the id of the one with
+// the smallest squared distance to the voxel (first one wins on ties). If no
+// instance of that class exists, inst_pred[idx] is left untouched. For any
+// other label the label itself is copied to inst_pred[idx].
+__global__ void nearest_assign_kernel(
+                                  const int* l2s_key,
+                                  int l2s_size,
+                                  const int* occind2detind,
+                                  const int *__restrict__ occ_pred,
+                                  const int *__restrict__ inst_xyz,
+                                  const int *__restrict__ inst_cls,
+                                  const int *__restrict__ inst_id_list,
+                                  int inst_size,
+                                  int* __restrict__ inst_pred) {
+  // Grid extents the index decoding below is written for:
+  // idx = (x * kDimY + y) * kDimZ + z.
+  const int kDimX = 200;
+  const int kDimY = 200;
+  const int kDimZ = 16;
+  const int kNumVoxels = kDimX * kDimY * kDimZ;
+
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  // Guard against launch configurations that overshoot the grid.
+  if (idx >= kNumVoxels) {
+    return;
+  }
+
+  const int occ_pred_label = occ_pred[idx];
+
+  // Does this label take part in instance assignment?
+  bool label_listed = false;
+  for (int index = 0; index < l2s_size; index++) {
+    if (occ_pred_label == l2s_key[index]) {
+      label_listed = true;
+      break;
+    }
+  }
+  if (!label_listed) {
+    inst_pred[idx] = occ_pred_label;
+    return;
+  }
+
+  // Decode the flat index into voxel coordinates.
+  const int x = idx / (kDimY * kDimZ);
+  const int y = (idx - x * kDimY * kDimZ) / kDimZ;
+  const int z = idx - x * kDimY * kDimZ - y * kDimZ;
+
+  int dist_min = 100000000;
+  int best_id = 0;
+  bool found = false;
+  for (int inst_ind = 0; inst_ind < inst_size; inst_ind++) {
+    if (inst_cls[inst_ind] != occind2detind[occ_pred_label]) {
+      continue;
+    }
+    const int dx = x - inst_xyz[inst_ind * 3 + 0];
+    const int dy = y - inst_xyz[inst_ind * 3 + 1];
+    const int dz = z - inst_xyz[inst_ind * 3 + 2];
+    const int dist = dx * dx + dy * dy + dz * dz;
+    if (dist < dist_min) {
+      dist_min = dist;
+      best_id = inst_id_list[inst_ind];
+      found = true;
+    }
+  }
+  // Single global write; skipped when no instance matched so the caller's
+  // initial value is preserved.
+  if (found) {
+    inst_pred[idx] = best_id;
+  }
+}
+
+// Host launcher: starts nearest_assign_kernel with 256 threads per block and
+// enough blocks to cover 200 * 200 * 16 voxels, with no dynamic shared
+// memory, on stream 0.
+void nearest_assign(
+              const int* l2s_key,
+              int l2s_size,
+              const int *__restrict__ occind2detind,
+              int inst_size,
+              const int *__restrict__ occ_pred,
+              const int *__restrict__ inst_xyz,
+              const int *__restrict__ inst_cls,
+              const int *__restrict__ inst_id_list,
+              int* __restrict__ inst_pred) {
+  const int threads_per_block = 256;
+  const int num_voxels = 200 * 200 * 16;
+  // Integer ceil-division: 640000 / 256 -> 2500 blocks.
+  const int num_blocks =
+      (num_voxels + threads_per_block - 1) / threads_per_block;
+
+  hipLaunchKernelGGL(( nearest_assign_kernel), dim3(num_blocks), dim3(threads_per_block), 0, 0,
+    l2s_key, l2s_size, occind2detind,
+    occ_pred, inst_xyz, inst_cls,
+    inst_id_list, inst_size, inst_pred
+  );
+}
+
+
--- a/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign_hip.cpp
+++ b/docker-hub/FlashOCC/Flashocc/projects/mmdet3d_plugin/ops/nearest_assign/src/nearest_assign_hip.cpp
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+// Copyright (c) Phigent Robotics. All rights reserved.
+// Reference https://arxiv.org/abs/2211.17111
+#include <torch/torch.h>
+#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
+
+// CUDA function declarations
+// Host-side launcher; this translation unit only declares it and calls it
+// from nearest_assign_forward below with raw int pointers.
+void nearest_assign(
+                    const int* l2s_key,
+                    int l2s_size,
+                    const int *__restrict__ occind2detind,
+                    int inst_size,
+                    const int *__restrict__ occ_pred,
+                    const int *__restrict__ inst_xyz,
+                    const int *__restrict__ inst_cls,
+                    const int *__restrict__ inst_id_list,
+                    int* __restrict__ inst_pred);
+
+// Python-facing entry point: validates the tensors, extracts raw int
+// pointers and hands them to the nearest_assign launcher.
+//
+// l2s_size is taken from dim 0 of _l2s_key and inst_size from dim 0 of
+// _inst_xyz. _inst_pred is written in place.
+void nearest_assign_forward(
+  const at::Tensor _occ_pred,    // (200, 200, 16)
+  const at::Tensor _l2s_key,     // (l2s_size, 1)
+  const at::Tensor _occind2detind, // (10, 1)
+  const at::Tensor _inst_cls,     // (inst_size, 1)
+  const at::Tensor _inst_xyz,     // (inst_size, 3)
+  const at::Tensor _inst_id_list, // (inst_size, 1)
+  at::Tensor _inst_pred           // (200, 200, 16)
+) {
+  // The launcher uses a fixed grid covering 200 * 200 * 16 voxels, so both
+  // voxel buffers must hold at least that many elements.
+  constexpr int64_t kNumVoxels = 200 * 200 * 16;
+
+  // Raw pointers are dereferenced on the device: reject host tensors and
+  // non-contiguous layouts up front instead of faulting inside the kernel.
+  TORCH_CHECK(_occ_pred.is_cuda() && _l2s_key.is_cuda() &&
+              _occind2detind.is_cuda() && _inst_cls.is_cuda() &&
+              _inst_xyz.is_cuda() && _inst_id_list.is_cuda() &&
+              _inst_pred.is_cuda(),
+              "nearest_assign_forward: all tensors must be CUDA tensors");
+  TORCH_CHECK(_occ_pred.is_contiguous() && _l2s_key.is_contiguous() &&
+              _occind2detind.is_contiguous() && _inst_cls.is_contiguous() &&
+              _inst_xyz.is_contiguous() && _inst_id_list.is_contiguous() &&
+              _inst_pred.is_contiguous(),
+              "nearest_assign_forward: all tensors must be contiguous");
+  TORCH_CHECK(_occ_pred.numel() >= kNumVoxels &&
+              _inst_pred.numel() >= kNumVoxels,
+              "nearest_assign_forward: occ_pred and inst_pred must hold at "
+              "least 200*200*16 elements");
+
+  const int l2s_size = _l2s_key.size(0);
+  const int inst_size = _inst_xyz.size(0);
+
+  // The kernel reads inst_size entries of inst_cls / inst_id_list and
+  // three coordinates per instance from inst_xyz.
+  TORCH_CHECK(_inst_xyz.numel() >= static_cast<int64_t>(3) * inst_size &&
+              _inst_cls.numel() >= inst_size &&
+              _inst_id_list.numel() >= inst_size,
+              "nearest_assign_forward: inst_cls, inst_xyz and inst_id_list "
+              "are inconsistent in size");
+
+  const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_occ_pred));
+  // data_ptr<int>() additionally rejects tensors whose dtype is not int32.
+  const int* occ_pred = _occ_pred.data_ptr<int>();
+  const int* inst_xyz = _inst_xyz.data_ptr<int>();
+  const int* inst_cls = _inst_cls.data_ptr<int>();
+  const int* l2s_key = _l2s_key.data_ptr<int>();
+  const int* inst_id_list = _inst_id_list.data_ptr<int>();
+  const int* occind2detind = _occind2detind.data_ptr<int>();
+
+  int* inst_pred = _inst_pred.data_ptr<int>();
+  nearest_assign(
+                 l2s_key,
+                 l2s_size,
+                 occind2detind,
+                 inst_size,
+                 occ_pred,
+                 inst_xyz,
+                 inst_cls,
+                 inst_id_list,
+                 inst_pred
+                 );
+}
+
+// Python bindings: exposes nearest_assign_forward under the same name.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("nearest_assign_forward", &nearest_assign_forward,
+        "nearest_assign_forward");
+}
--- a/docker-hub/FlashOCC/Flashocc/projects/setup.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/setup.py
+from setuptools import find_packages, setup
+
+import os
+import shutil
+import sys
+import torch
+import warnings
+from os import path as osp
+from torch.utils.cpp_extension import (BuildExtension, CppExtension,
+                                       CUDAExtension)
+
+
+def make_cuda_ext(name,
+                  module,
+                  sources,
+                  sources_cuda=[],
+                  extra_args=[],
+                  extra_include_path=[]):
+
+    define_macros = []
+    extra_compile_args = {'cxx': [] + extra_args}
+
+    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
+        define_macros += [('WITH_CUDA', None)]
+        extension = CUDAExtension
+        extra_compile_args['nvcc'] = extra_args + [
+            '-D__CUDA_NO_HALF_OPERATORS__',
+            '-D__CUDA_NO_HALF_CONVERSIONS__',
+            '-D__CUDA_NO_HALF2_OPERATORS__',
+        ]
+        sources += sources_cuda
+    else:
+        print('Compiling {} without CUDA'.format(name))
+        extension = CppExtension
+        # raise EnvironmentError('CUDA is required to compile MMDetection!')
+
+    return extension(
+        name='{}.{}'.format(module, name),
+        sources=[os.path.join(*module.split('.'), p) for p in sources],
+        include_dirs=extra_include_path,
+        define_macros=define_macros,
+        extra_compile_args=extra_compile_args)
+
+
+if __name__ == '__main__':
+    setup(
+        name='flashocc_plugin',
+        description=("OpenMMLab's next-generation platform"
+                     'for general 3D object detection.'),
+        long_description_content_type='text/markdown',
+        author='MMDetection3D Contributors',
+        author_email='zwwdev@gmail.com',
+        keywords='computer vision, 3D object detection',
+        url='https://github.com/open-mmlab/mmdetection3d',
+        classifiers=[
+            'Development Status :: 4 - Beta',
+            'License :: OSI Approved :: Apache Software License',
+            'Operating System :: OS Independent',
+            'Programming Language :: Python :: 3',
+            'Programming Language :: Python :: 3.6',
+            'Programming Language :: Python :: 3.7',
+        ],
+        license='Apache License 2.0',
+        ext_modules=[
+            make_cuda_ext(
+                name="bev_pool_ext",
+                module="mmdet3d_plugin.ops.bev_pool",
+                sources=[
+                    "src/bev_pooling.cpp",
+                    "src/bev_sum_pool.cpp",
+                    "src/bev_sum_pool_cuda.cu",
+                    "src/bev_max_pool.cpp",
+                    "src/bev_max_pool_cuda.cu",
+                ],
+            ),
+            make_cuda_ext(
+                name="bev_pool_v2_ext",
+                module="mmdet3d_plugin.ops.bev_pool_v2",
+                sources=[
+                    "src/bev_pool.cpp",
+                    "src/bev_pool_cuda.cu"
+                ],
+            ),
+            make_cuda_ext(
+                name="nearest_assign_ext",
+                module="mmdet3d_plugin.ops.nearest_assign",
+                sources=[
+                    "src/nearest_assign.cpp",
+                    "src/nearest_assign_cuda.cu"
+                ],
+            ),
+        ],
+        cmdclass={'build_ext': BuildExtension},
+        zip_safe=False)
--- a/docker-hub/FlashOCC/Flashocc/repro.py
+++ b/docker-hub/FlashOCC/Flashocc/repro.py
+
+import torch
+from torch import tensor, device
+import torch.fx as fx
+from torch._dynamo.testing import rand_strided
+from math import inf
+import torch._inductor.inductor_prims
+
+import torch._dynamo.config
+import torch._inductor.config
+import torch._functorch.config
+import torch.fx.experimental._config
+torch._dynamo.config.capture_scalar_outputs = True
+
+
+
+
+
+isolate_fails_code_str = None
+
+
+
+# torch version: 2.4.1
+# torch cuda version: None
+# torch git version: 45d303c9e4f41ec2f5450b6f60031246f67189d6
+
+
+# CUDA Info: 
+# nvcc not found
+# GPU Hardware Info: 
+# BW200 : 8 
+
+
+from torch.nn import *
+class Repro(torch.nn.Module):
+    """Stateless module whose ``forward`` is a flat sequence of aten calls.
+
+    NOTE(review): this looks like a machine-generated graph dump (presumably
+    from the PyTorch compiler repro tooling) -- confirm before hand-editing;
+    statement order and the ``x = None`` releases are part of the capture.
+    """
+    def __init__(self):
+        super().__init__()
+
+    
+    
+    def forward(self, primals_1, primals_2, primals_4, primals_5, primals_6, primals_7, primals_8, primals_10, convert_element_type_1, clamp_max, convert_element_type_3, clamp_max_1, clamp_max_2, clamp_max_3, cat, convolution, squeeze_1, relu, convolution_1, getitem_3, rsqrt_1, convert_element_type_5, clamp_max_4, convert_element_type_7, clamp_max_5, clamp_max_6, clamp_max_7, add_19, convolution_2, squeeze_7, relu_2, unsqueeze_14, unsqueeze_38, tangents_1):
+        """Evaluate the captured graph and return its 22 outputs.
+
+        The body issues four ``aten.convolution_backward`` calls (each with
+        output mask ``[True, True, False]``), reductions over dims
+        ``[0, 2, 3]``, and accumulating ``_unsafe_index_put`` writes into
+        zero tensors of shape ``[4, 512, 100, 100]`` and ``[4, 512, 25, 25]``
+        created on ``cuda:2``. Intermediates are rebound to ``None`` right
+        after their last use.
+
+        NOTE(review): the op pattern presumably corresponds to the backward
+        pass of conv / batch-norm / ReLU stages with interpolation -- confirm
+        against the model that produced this dump.
+
+        Returns:
+            A list of 22 entries: 11 tensors, nine ``None`` placeholders,
+            then ``slice_1`` and ``add_36``.
+        """
+        sum_1 = torch.ops.aten.sum.dim_IntList(tangents_1, [0, 2, 3])
+        convolution_backward = torch.ops.aten.convolution_backward.default(tangents_1, relu_2, primals_10, [256], [1, 1], [0, 0], [1, 1], False, [0, 0], 1, [True, True, False]);  tangents_1 = primals_10 = None
+        getitem_6 = convolution_backward[0]
+        getitem_7 = convolution_backward[1];  convolution_backward = None
+        le = torch.ops.aten.le.Scalar(relu_2, 0);  relu_2 = None
+        full_default = torch.ops.aten.full.default([], 0.0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
+        where = torch.ops.aten.where.self(le, full_default, getitem_6);  le = getitem_6 = None
+        sum_2 = torch.ops.aten.sum.dim_IntList(where, [0, 2, 3])
+        sub_13 = torch.ops.aten.sub.Tensor(convolution_2, unsqueeze_14);  convolution_2 = unsqueeze_14 = None
+        mul_31 = torch.ops.aten.mul.Tensor(where, sub_13)
+        sum_3 = torch.ops.aten.sum.dim_IntList(mul_31, [0, 2, 3]);  mul_31 = None
+        mul_32 = torch.ops.aten.mul.Tensor(sum_2, 6.25e-06)
+        unsqueeze_15 = torch.ops.aten.unsqueeze.default(mul_32, 0);  mul_32 = None
+        unsqueeze_16 = torch.ops.aten.unsqueeze.default(unsqueeze_15, 2);  unsqueeze_15 = None
+        unsqueeze_17 = torch.ops.aten.unsqueeze.default(unsqueeze_16, 3);  unsqueeze_16 = None
+        mul_33 = torch.ops.aten.mul.Tensor(sum_3, 6.25e-06)
+        mul_34 = torch.ops.aten.mul.Tensor(squeeze_7, squeeze_7)
+        mul_35 = torch.ops.aten.mul.Tensor(mul_33, mul_34);  mul_33 = mul_34 = None
+        unsqueeze_18 = torch.ops.aten.unsqueeze.default(mul_35, 0);  mul_35 = None
+        unsqueeze_19 = torch.ops.aten.unsqueeze.default(unsqueeze_18, 2);  unsqueeze_18 = None
+        unsqueeze_20 = torch.ops.aten.unsqueeze.default(unsqueeze_19, 3);  unsqueeze_19 = None
+        mul_36 = torch.ops.aten.mul.Tensor(squeeze_7, primals_8);  primals_8 = None
+        unsqueeze_21 = torch.ops.aten.unsqueeze.default(mul_36, 0);  mul_36 = None
+        unsqueeze_22 = torch.ops.aten.unsqueeze.default(unsqueeze_21, 2);  unsqueeze_21 = None
+        unsqueeze_23 = torch.ops.aten.unsqueeze.default(unsqueeze_22, 3);  unsqueeze_22 = None
+        mul_37 = torch.ops.aten.mul.Tensor(sub_13, unsqueeze_20);  sub_13 = unsqueeze_20 = None
+        sub_15 = torch.ops.aten.sub.Tensor(where, mul_37);  where = mul_37 = None
+        sub_16 = torch.ops.aten.sub.Tensor(sub_15, unsqueeze_17);  sub_15 = unsqueeze_17 = None
+        mul_38 = torch.ops.aten.mul.Tensor(sub_16, unsqueeze_23);  sub_16 = unsqueeze_23 = None
+        mul_39 = torch.ops.aten.mul.Tensor(sum_3, squeeze_7);  sum_3 = squeeze_7 = None
+        convolution_backward_1 = torch.ops.aten.convolution_backward.default(mul_38, add_19, primals_7, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]);  mul_38 = add_19 = primals_7 = None
+        getitem_9 = convolution_backward_1[0]
+        getitem_10 = convolution_backward_1[1];  convolution_backward_1 = None
+        mul_40 = torch.ops.aten.mul.Tensor(getitem_9, clamp_max_7);  clamp_max_7 = None
+        neg = torch.ops.aten.neg.default(mul_40)
+        add_25 = torch.ops.aten.add.Tensor(getitem_9, neg);  getitem_9 = neg = None
+        mul_41 = torch.ops.aten.mul.Tensor(mul_40, clamp_max_6)
+        neg_1 = torch.ops.aten.neg.default(mul_41)
+        add_26 = torch.ops.aten.add.Tensor(mul_40, neg_1);  mul_40 = neg_1 = None
+        mul_42 = torch.ops.aten.mul.Tensor(add_25, clamp_max_6);  clamp_max_6 = None
+        neg_2 = torch.ops.aten.neg.default(mul_42)
+        add_27 = torch.ops.aten.add.Tensor(add_25, neg_2);  add_25 = neg_2 = None
+        full_default_1 = torch.ops.aten.full.default([4, 512, 100, 100], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
+        _unsafe_index_put = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, clamp_max_5], mul_41, True);  mul_41 = None
+        _unsafe_index_put_1 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, clamp_max_4, convert_element_type_7], add_26, True);  clamp_max_4 = add_26 = None
+        add_28 = torch.ops.aten.add.Tensor(_unsafe_index_put, _unsafe_index_put_1);  _unsafe_index_put = _unsafe_index_put_1 = None
+        _unsafe_index_put_2 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, clamp_max_5], mul_42, True);  clamp_max_5 = mul_42 = None
+        add_29 = torch.ops.aten.add.Tensor(add_28, _unsafe_index_put_2);  add_28 = _unsafe_index_put_2 = None
+        _unsafe_index_put_3 = torch.ops.aten._unsafe_index_put.default(full_default_1, [None, None, convert_element_type_5, convert_element_type_7], add_27, True);  full_default_1 = convert_element_type_5 = convert_element_type_7 = add_27 = None
+        add_30 = torch.ops.aten.add.Tensor(add_29, _unsafe_index_put_3);  add_29 = _unsafe_index_put_3 = None
+        sub_6 = torch.ops.aten.sub.Tensor(convolution_1, getitem_3)
+        mul_12 = torch.ops.aten.mul.Tensor(sub_6, rsqrt_1);  sub_6 = None
+        unsqueeze_4 = torch.ops.aten.unsqueeze.default(primals_5, -1)
+        unsqueeze_5 = torch.ops.aten.unsqueeze.default(unsqueeze_4, -1);  unsqueeze_4 = None
+        mul_18 = torch.ops.aten.mul.Tensor(mul_12, unsqueeze_5);  mul_12 = unsqueeze_5 = None
+        unsqueeze_6 = torch.ops.aten.unsqueeze.default(primals_6, -1);  primals_6 = None
+        unsqueeze_7 = torch.ops.aten.unsqueeze.default(unsqueeze_6, -1);  unsqueeze_6 = None
+        add_14 = torch.ops.aten.add.Tensor(mul_18, unsqueeze_7);  mul_18 = unsqueeze_7 = None
+        relu_1 = torch.ops.aten.relu.default(add_14);  add_14 = None
+        le_1 = torch.ops.aten.le.Scalar(relu_1, 0);  relu_1 = None
+        where_1 = torch.ops.aten.where.self(le_1, full_default, add_30);  le_1 = add_30 = None
+        squeeze_3 = torch.ops.aten.squeeze.dims(getitem_3, [0, 2, 3]);  getitem_3 = None
+        unsqueeze_24 = torch.ops.aten.unsqueeze.default(squeeze_3, 0);  squeeze_3 = None
+        unsqueeze_25 = torch.ops.aten.unsqueeze.default(unsqueeze_24, 2);  unsqueeze_24 = None
+        unsqueeze_26 = torch.ops.aten.unsqueeze.default(unsqueeze_25, 3);  unsqueeze_25 = None
+        sum_4 = torch.ops.aten.sum.dim_IntList(where_1, [0, 2, 3])
+        sub_17 = torch.ops.aten.sub.Tensor(convolution_1, unsqueeze_26);  convolution_1 = unsqueeze_26 = None
+        mul_43 = torch.ops.aten.mul.Tensor(where_1, sub_17)
+        sum_5 = torch.ops.aten.sum.dim_IntList(mul_43, [0, 2, 3]);  mul_43 = None
+        mul_44 = torch.ops.aten.mul.Tensor(sum_4, 2.5e-05)
+        unsqueeze_27 = torch.ops.aten.unsqueeze.default(mul_44, 0);  mul_44 = None
+        unsqueeze_28 = torch.ops.aten.unsqueeze.default(unsqueeze_27, 2);  unsqueeze_27 = None
+        unsqueeze_29 = torch.ops.aten.unsqueeze.default(unsqueeze_28, 3);  unsqueeze_28 = None
+        mul_45 = torch.ops.aten.mul.Tensor(sum_5, 2.5e-05)
+        squeeze_4 = torch.ops.aten.squeeze.dims(rsqrt_1, [0, 2, 3]);  rsqrt_1 = None
+        mul_46 = torch.ops.aten.mul.Tensor(squeeze_4, squeeze_4)
+        mul_47 = torch.ops.aten.mul.Tensor(mul_45, mul_46);  mul_45 = mul_46 = None
+        unsqueeze_30 = torch.ops.aten.unsqueeze.default(mul_47, 0);  mul_47 = None
+        unsqueeze_31 = torch.ops.aten.unsqueeze.default(unsqueeze_30, 2);  unsqueeze_30 = None
+        unsqueeze_32 = torch.ops.aten.unsqueeze.default(unsqueeze_31, 3);  unsqueeze_31 = None
+        mul_48 = torch.ops.aten.mul.Tensor(squeeze_4, primals_5);  primals_5 = None
+        unsqueeze_33 = torch.ops.aten.unsqueeze.default(mul_48, 0);  mul_48 = None
+        unsqueeze_34 = torch.ops.aten.unsqueeze.default(unsqueeze_33, 2);  unsqueeze_33 = None
+        unsqueeze_35 = torch.ops.aten.unsqueeze.default(unsqueeze_34, 3);  unsqueeze_34 = None
+        mul_49 = torch.ops.aten.mul.Tensor(sub_17, unsqueeze_32);  sub_17 = unsqueeze_32 = None
+        sub_19 = torch.ops.aten.sub.Tensor(where_1, mul_49);  where_1 = mul_49 = None
+        sub_20 = torch.ops.aten.sub.Tensor(sub_19, unsqueeze_29);  sub_19 = unsqueeze_29 = None
+        mul_50 = torch.ops.aten.mul.Tensor(sub_20, unsqueeze_35);  sub_20 = unsqueeze_35 = None
+        mul_51 = torch.ops.aten.mul.Tensor(sum_5, squeeze_4);  sum_5 = squeeze_4 = None
+        convolution_backward_2 = torch.ops.aten.convolution_backward.default(mul_50, relu, primals_4, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]);  mul_50 = primals_4 = None
+        getitem_12 = convolution_backward_2[0]
+        getitem_13 = convolution_backward_2[1];  convolution_backward_2 = None
+        le_2 = torch.ops.aten.le.Scalar(relu, 0);  relu = None
+        where_2 = torch.ops.aten.where.self(le_2, full_default, getitem_12);  le_2 = full_default = getitem_12 = None
+        sum_6 = torch.ops.aten.sum.dim_IntList(where_2, [0, 2, 3])
+        sub_21 = torch.ops.aten.sub.Tensor(convolution, unsqueeze_38);  convolution = unsqueeze_38 = None
+        mul_52 = torch.ops.aten.mul.Tensor(where_2, sub_21)
+        sum_7 = torch.ops.aten.sum.dim_IntList(mul_52, [0, 2, 3]);  mul_52 = None
+        mul_53 = torch.ops.aten.mul.Tensor(sum_6, 2.5e-05)
+        unsqueeze_39 = torch.ops.aten.unsqueeze.default(mul_53, 0);  mul_53 = None
+        unsqueeze_40 = torch.ops.aten.unsqueeze.default(unsqueeze_39, 2);  unsqueeze_39 = None
+        unsqueeze_41 = torch.ops.aten.unsqueeze.default(unsqueeze_40, 3);  unsqueeze_40 = None
+        mul_54 = torch.ops.aten.mul.Tensor(sum_7, 2.5e-05)
+        mul_55 = torch.ops.aten.mul.Tensor(squeeze_1, squeeze_1)
+        mul_56 = torch.ops.aten.mul.Tensor(mul_54, mul_55);  mul_54 = mul_55 = None
+        unsqueeze_42 = torch.ops.aten.unsqueeze.default(mul_56, 0);  mul_56 = None
+        unsqueeze_43 = torch.ops.aten.unsqueeze.default(unsqueeze_42, 2);  unsqueeze_42 = None
+        unsqueeze_44 = torch.ops.aten.unsqueeze.default(unsqueeze_43, 3);  unsqueeze_43 = None
+        mul_57 = torch.ops.aten.mul.Tensor(squeeze_1, primals_2);  primals_2 = None
+        unsqueeze_45 = torch.ops.aten.unsqueeze.default(mul_57, 0);  mul_57 = None
+        unsqueeze_46 = torch.ops.aten.unsqueeze.default(unsqueeze_45, 2);  unsqueeze_45 = None
+        unsqueeze_47 = torch.ops.aten.unsqueeze.default(unsqueeze_46, 3);  unsqueeze_46 = None
+        mul_58 = torch.ops.aten.mul.Tensor(sub_21, unsqueeze_44);  sub_21 = unsqueeze_44 = None
+        sub_23 = torch.ops.aten.sub.Tensor(where_2, mul_58);  where_2 = mul_58 = None
+        sub_24 = torch.ops.aten.sub.Tensor(sub_23, unsqueeze_41);  sub_23 = unsqueeze_41 = None
+        mul_59 = torch.ops.aten.mul.Tensor(sub_24, unsqueeze_47);  sub_24 = unsqueeze_47 = None
+        mul_60 = torch.ops.aten.mul.Tensor(sum_7, squeeze_1);  sum_7 = squeeze_1 = None
+        convolution_backward_3 = torch.ops.aten.convolution_backward.default(mul_59, cat, primals_1, [0], [1, 1], [1, 1], [1, 1], False, [0, 0], 1, [True, True, False]);  mul_59 = cat = primals_1 = None
+        getitem_15 = convolution_backward_3[0]
+        getitem_16 = convolution_backward_3[1];  convolution_backward_3 = None
+        slice_1 = torch.ops.aten.slice.Tensor(getitem_15, 1, 0, 128)
+        slice_2 = torch.ops.aten.slice.Tensor(getitem_15, 1, 128, 640);  getitem_15 = None
+        mul_61 = torch.ops.aten.mul.Tensor(slice_2, clamp_max_3);  clamp_max_3 = None
+        neg_3 = torch.ops.aten.neg.default(mul_61)
+        add_31 = torch.ops.aten.add.Tensor(slice_2, neg_3);  slice_2 = neg_3 = None
+        mul_62 = torch.ops.aten.mul.Tensor(mul_61, clamp_max_2)
+        neg_4 = torch.ops.aten.neg.default(mul_62)
+        add_32 = torch.ops.aten.add.Tensor(mul_61, neg_4);  mul_61 = neg_4 = None
+        mul_63 = torch.ops.aten.mul.Tensor(add_31, clamp_max_2);  clamp_max_2 = None
+        neg_5 = torch.ops.aten.neg.default(mul_63)
+        add_33 = torch.ops.aten.add.Tensor(add_31, neg_5);  add_31 = neg_5 = None
+        full_default_7 = torch.ops.aten.full.default([4, 512, 25, 25], 0, dtype = torch.float32, layout = torch.strided, device = device(type='cuda', index=2), pin_memory = False)
+        _unsafe_index_put_4 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, clamp_max_1], mul_62, True);  mul_62 = None
+        _unsafe_index_put_5 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, clamp_max, convert_element_type_3], add_32, True);  clamp_max = add_32 = None
+        add_34 = torch.ops.aten.add.Tensor(_unsafe_index_put_4, _unsafe_index_put_5);  _unsafe_index_put_4 = _unsafe_index_put_5 = None
+        _unsafe_index_put_6 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, clamp_max_1], mul_63, True);  clamp_max_1 = mul_63 = None
+        add_35 = torch.ops.aten.add.Tensor(add_34, _unsafe_index_put_6);  add_34 = _unsafe_index_put_6 = None
+        _unsafe_index_put_7 = torch.ops.aten._unsafe_index_put.default(full_default_7, [None, None, convert_element_type_1, convert_element_type_3], add_33, True);  full_default_7 = convert_element_type_1 = convert_element_type_3 = add_33 = None
+        add_36 = torch.ops.aten.add.Tensor(add_35, _unsafe_index_put_7);  add_35 = _unsafe_index_put_7 = None
+        return [getitem_16, mul_60, sum_6, getitem_13, mul_51, sum_4, getitem_10, mul_39, sum_2, getitem_7, sum_1, None, None, None, None, None, None, None, None, None, slice_1, add_36]
+        
+def load_args(reader):
+    """Declare every saved input of the repro graph on ``reader``.
+
+    Each input is described by a pair of calls: ``reader.storage(...)``
+    (a 40-hex-digit key, a size and the device; the int64 inputs also pass
+    ``dtype_hint=torch.int64``) followed by ``reader.tensor(...)`` with the
+    shape and flags of the tensor built on that storage.  The trailing
+    comment on each ``reader.tensor`` line names the graph input it stands
+    for.  Every storage is declared on ``cuda`` device index 2.
+
+    Some inputs reuse the same storage key with a different shape (for
+    example ``buf8``/``buf10`` and ``buf12``/``buf13``).
+
+    NOTE(review): the key is presumably a content hash of the saved data
+    and the second ``storage`` argument presumably a byte count
+    (512 * 640 * 3 * 3 * 4 == 11796480 for ``buf0``) - confirm against the
+    reader API.
+    """
+    buf0 = reader.storage('934c55e4a7a69a0a29a96cd8ef9f11c9859658e1', 11796480, device=device(type='cuda', index=2))
+    reader.tensor(buf0, (512, 640, 3, 3), requires_grad=True, is_leaf=True)  # primals_1
+    buf1 = reader.storage('f12094f433480ec90280d223057708434df38941', 2048, device=device(type='cuda', index=2))
+    reader.tensor(buf1, (512,), requires_grad=True, is_leaf=True)  # primals_2
+    buf2 = reader.storage('06c46ad2c91ec5c8eebc4fb0be80459bdfe007a8', 9437184, device=device(type='cuda', index=2))
+    reader.tensor(buf2, (512, 512, 3, 3), requires_grad=True, is_leaf=True)  # primals_4
+    buf3 = reader.storage('aba0c4266c842d1845e720dc0c789942770a60b7', 2048, device=device(type='cuda', index=2))
+    reader.tensor(buf3, (512,), requires_grad=True, is_leaf=True)  # primals_5
+    buf4 = reader.storage('bb8471d379e03c8ccb9897ce7d3a2dfbacb44e30', 2048, device=device(type='cuda', index=2))
+    reader.tensor(buf4, (512,), requires_grad=True, is_leaf=True)  # primals_6
+    buf5 = reader.storage('b9484105fb5b2045fb6550a1edb77af72e639416', 4718592, device=device(type='cuda', index=2))
+    reader.tensor(buf5, (256, 512, 3, 3), requires_grad=True, is_leaf=True)  # primals_7
+    buf6 = reader.storage('b778b8cab416c3fa6763b88e431266ae6ea28941', 1024, device=device(type='cuda', index=2))
+    reader.tensor(buf6, (256,), requires_grad=True, is_leaf=True)  # primals_8
+    buf7 = reader.storage('c5f14ec72c73a593b47ef4aecf37f6bb25d2dec4', 262144, device=device(type='cuda', index=2))
+    reader.tensor(buf7, (256, 256, 1, 1), requires_grad=True, is_leaf=True)  # primals_10
+    # Integer index inputs: these pass dtype_hint/dtype=torch.int64 explicitly.
+    buf8 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf8, (100, 1), dtype=torch.int64, is_leaf=True)  # convert_element_type_1
+    buf9 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf9, (100, 1), dtype=torch.int64, is_leaf=True)  # clamp_max
+    buf10 = reader.storage('99ef5c7086a924dfc5221c01ff1520de469849c8', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf10, (100,), dtype=torch.int64, is_leaf=True)  # convert_element_type_3
+    buf11 = reader.storage('532b7b8fc19c48c7434e569ab96aa0670d5651ef', 800, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf11, (100,), dtype=torch.int64, is_leaf=True)  # clamp_max_1
+    buf12 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
+    reader.tensor(buf12, (100,), is_leaf=True)  # clamp_max_2
+    buf13 = reader.storage('0538ed039b8a4706a4f85bf431e12664d8940742', 400, device=device(type='cuda', index=2))
+    reader.tensor(buf13, (100, 1), is_leaf=True)  # clamp_max_3
+    buf14 = reader.storage('5d41e66671a283b70001fd74345d8e7e3def00bd', 102400000, device=device(type='cuda', index=2))
+    reader.tensor(buf14, (4, 640, 100, 100), is_leaf=True)  # cat
+    buf15 = reader.storage('a8fe0ed584571bb3218d663656459a36545be5e6', 81920000, device=device(type='cuda', index=2))
+    reader.tensor(buf15, (4, 512, 100, 100), is_leaf=True)  # convolution
+    buf16 = reader.storage('0af13bcf109b8ca2df7f5ce3387d51e8576fb30a', 2048, device=device(type='cuda', index=2))
+    reader.tensor(buf16, (512,), is_leaf=True)  # squeeze_1
+    buf17 = reader.storage('32f14d6fa07f654fbb09ef1563066303a3501eda', 81920000, device=device(type='cuda', index=2))
+    reader.tensor(buf17, (4, 512, 100, 100), is_leaf=True)  # relu
+    buf18 = reader.storage('aca23d51e723ad9b4bec2e54d6f0af4b5b85cc7d', 81920000, device=device(type='cuda', index=2))
+    reader.tensor(buf18, (4, 512, 100, 100), is_leaf=True)  # convolution_1
+    buf19 = reader.storage('4940c79e48676c2e1359870dc770e25cd780983d', 2048, device=device(type='cuda', index=2))
+    reader.tensor(buf19, (1, 512, 1, 1), is_leaf=True)  # getitem_3
+    buf20 = reader.storage('d17407a9f45954a4d0d36e5b20a40ac554cc3aff', 2048, device=device(type='cuda', index=2))
+    reader.tensor(buf20, (1, 512, 1, 1), is_leaf=True)  # rsqrt_1
+    # Second group of int64 index inputs, with 200 entries instead of 100.
+    buf21 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf21, (200, 1), dtype=torch.int64, is_leaf=True)  # convert_element_type_5
+    buf22 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf22, (200, 1), dtype=torch.int64, is_leaf=True)  # clamp_max_4
+    buf23 = reader.storage('95fbd2b85e217ab78f8f9d7900b273a1362b3112', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf23, (200,), dtype=torch.int64, is_leaf=True)  # convert_element_type_7
+    buf24 = reader.storage('d9920b87a7261c94c907bc68889b005f277cd597', 1600, device=device(type='cuda', index=2), dtype_hint=torch.int64)
+    reader.tensor(buf24, (200,), dtype=torch.int64, is_leaf=True)  # clamp_max_5
+    buf25 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
+    reader.tensor(buf25, (200,), is_leaf=True)  # clamp_max_6
+    buf26 = reader.storage('131d76cb798ee04745f0c7dcb67b63c74a6c00df', 800, device=device(type='cuda', index=2))
+    reader.tensor(buf26, (200, 1), is_leaf=True)  # clamp_max_7
+    buf27 = reader.storage('32194c54194bddd5f695a8d306828130629246fc', 327680000, device=device(type='cuda', index=2))
+    reader.tensor(buf27, (4, 512, 200, 200), is_leaf=True)  # add_19
+    buf28 = reader.storage('e3a286ef8d6373c83ef30afe16eaae96ee52b965', 163840000, device=device(type='cuda', index=2))
+    reader.tensor(buf28, (4, 256, 200, 200), is_leaf=True)  # convolution_2
+    buf29 = reader.storage('9572b289e6d5c9bdd20a79367d4005440da40795', 1024, device=device(type='cuda', index=2))
+    reader.tensor(buf29, (256,), is_leaf=True)  # squeeze_7
+    buf30 = reader.storage('42f9ce794a05b12a40f15cbd4abb1201ccef0f72', 163840000, device=device(type='cuda', index=2))
+    reader.tensor(buf30, (4, 256, 200, 200), is_leaf=True)  # relu_2
+    buf31 = reader.storage('61670207f087dc68f052bc03747d9ab365297b17', 1024, device=device(type='cuda', index=2))
+    reader.tensor(buf31, (1, 256, 1, 1), is_leaf=True)  # unsqueeze_14
+    buf32 = reader.storage('ab77896e6dd76345e63586ecda30b1e4a63439cc', 2048, device=device(type='cuda', index=2))
+    reader.tensor(buf32, (1, 512, 1, 1), is_leaf=True)  # unsqueeze_38
+    buf33 = reader.storage('f0ec623d2a44ff0f64fc264faf9128c2a6896e57', 163840000, device=device(type='cuda', index=2))
+    reader.tensor(buf33, (4, 256, 200, 200), is_leaf=True)  # tangents_1
+# Tag the loader function with a version number.
+# NOTE(review): presumably read by run_repro to pick the reader protocol - confirm.
+load_args._version = 0
+# Instantiate the repro module whose class is defined earlier in this file.
+mod = Repro()
+if __name__ == '__main__':
+    from torch._dynamo.repro.after_aot import run_repro
+    # The repro is executed with gradient tracking disabled.
+    with torch.no_grad():
+        # Run in accuracy mode with the 'run' command; save_dir points at the
+        # minifier checkpoint directory of the original debug session.
+        run_repro(mod, load_args, accuracy=True, command='run', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
+        # To run it separately, do 
+        # mod, args = run_repro(mod, load_args, accuracy=True, command='get_args', save_dir='/root/FlashOCC/torch_compile_debug/run_2025_08_24_19_42_28_279064-pid_182645/minifier/checkpoints', tracing_mode='real', check_str=None)
+        # mod(*args)
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/requirements.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements.txt
+-r requirements/build.txt
+-r requirements/optional.txt
+-r requirements/runtime.txt
+-r requirements/tests.txt
--- a/docker-hub/FlashOCC/Flashocc/requirements/build.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements/build.txt
--- a/docker-hub/FlashOCC/Flashocc/requirements/docs.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements/docs.txt
+docutils==0.16.0
+m2r
+mistune==0.8.4
+myst-parser
+-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+sphinx==4.0.2
+sphinx-copybutton
+sphinx_markdown_tables
--- a/docker-hub/FlashOCC/Flashocc/requirements/mminstall.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements/mminstall.txt
+mmcv-full>=1.4.8,<=1.6.0
+mmdet>=2.24.0,<=3.0.0
+mmsegmentation>=0.20.0,<=1.0.0
--- a/docker-hub/FlashOCC/Flashocc/requirements/optional.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements/optional.txt
+open3d
+spconv
+waymo-open-dataset-tf-2-1-0==1.2.0
--- a/docker-hub/FlashOCC/Flashocc/requirements/readthedocs.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements/readthedocs.txt
+mmcv>=1.4.8
+mmdet>=2.24.0
+mmsegmentation>=0.20.1
+torch
+torchvision
--- a/docker-hub/FlashOCC/Flashocc/requirements/runtime.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements/runtime.txt
+lyft_dataset_sdk
+networkx>=2.2,<2.3
+numba==0.53.0
+numpy
+nuscenes-devkit
+plyfile
+scikit-image
+# by default we also use tensorboard to log results
+tensorboard
+trimesh>=2.35.39,<2.35.40
--- a/docker-hub/FlashOCC/Flashocc/requirements/tests.txt
+++ b/docker-hub/FlashOCC/Flashocc/requirements/tests.txt
+asynctest
+codecov
+flake8
+interrogate
+isort
+# Note: used for kwarray.group_items, this may be ported to mmcv in the future.
+kwarray
+pytest
+pytest-cov
+pytest-runner
+ubelt
+xdoctest >= 0.10.0
+yapf
--- a/docker-hub/FlashOCC/Flashocc/rocblas_Flashocc.log
+++ b/docker-hub/FlashOCC/Flashocc/rocblas_Flashocc.log
+    118 ./rocblas-bench -f gemm_ex --transposeA T --transposeB N -m 1024 -n 2048 -k 25344 --alpha 1 --a_type f32_r --lda 25344 --b_type f32_r --ldb 25344 --beta 0 --c_type f32_r --ldc 1024 --d_type f32_r --ldd 1024 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+      2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 256 -n 960000 -k 512 --alpha 1 --lda 256 --ldb 512 --beta 0 --ldc 256
+      2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB N -m 512 -n 960000 -k 288 --alpha 1 --lda 512 --ldb 288 --beta 0 --ldc 512
+      2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 256 -n 512 -k 960000 --alpha 1 --lda 256 --ldb 512 --beta 0 --ldc 256
+      2 ./rocblas-bench -f gemm -r f32_r --transposeA N --transposeB T -m 512 -n 288 -k 960000 --alpha 1 --lda 512 --ldb 288 --beta 0 --ldc 512
+      2 ./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 288 -n 960000 -k 512 --alpha 1 --lda 512 --ldb 512 --beta 1 --ldc 288
+      2 ./rocblas-bench -f gemm -r f32_r --transposeA T --transposeB N -m 512 -n 960000 -k 256 --alpha 1 --lda 256 --ldb 256 --beta 0 --ldc 512
+    948 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 11264 -n 256 -k 64 --alpha 1 --a_type f32_r --lda 11264 --stride_a 720896 --b_type f32_r --ldb 64 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 2883584 --d_type f32_r --ldd 11264 --stride_d 2883584 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    237 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 11264 -n 64 -k 64 --alpha 1 --a_type f32_r --lda 11264 --stride_a 720896 --b_type f32_r --ldb 64 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 720896 --d_type f32_r --ldd 11264 --stride_d 720896 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    948 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 2816 -n 512 -k 128 --alpha 1 --a_type f32_r --lda 2816 --stride_a 360448 --b_type f32_r --ldb 128 --stride_b 0 --beta 0 --c_type f32_r --ldc 2816 --stride_c 1441792 --d_type f32_r --ldd 2816 --stride_d 1441792 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    237 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 2816 -n 512 -k 256 --alpha 1 --a_type f32_r --lda 2816 --stride_a 720896 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 2816 --stride_c 1441792 --d_type f32_r --ldd 2816 --stride_d 1441792 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    119 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 40000 -n 256 -k 256 --alpha 1 --a_type f32_r --lda 40000 --stride_a 10240000 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 40000 --stride_c 10240000 --d_type f32_r --ldd 40000 --stride_d 10240000 --batch_count 24 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    119 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB N -m 704 -n 152 -k 256 --alpha 1 --a_type f32_r --lda 704 --stride_a 180224 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 704 --stride_c 107008 --d_type f32_r --ldd 704 --stride_d 107008 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 11264 -n 256 -k 128 --alpha 1 --a_type f32_r --lda 11264 --stride_a 1441792 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 2883584 --d_type f32_r --ldd 11264 --stride_d 2883584 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    236 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 11264 -n 256 -k 64 --alpha 1 --a_type f32_r --lda 11264 --stride_a 720896 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 11264 --stride_c 2883584 --d_type f32_r --ldd 11264 --stride_d 2883584 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 2816 -n 256 -k 512 --alpha 1 --a_type f32_r --lda 2816 --stride_a 1441792 --b_type f32_r --ldb 256 --stride_b 0 --beta 0 --c_type f32_r --ldc 2816 --stride_c 720896 --d_type f32_r --ldd 2816 --stride_d 720896 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA N --transposeB T -m 704 -n 512 -k 1024 --alpha 1 --a_type f32_r --lda 704 --stride_a 720896 --b_type f32_r --ldb 512 --stride_b 0 --beta 0 --c_type f32_r --ldc 704 --stride_c 360448 --d_type f32_r --ldd 704 --stride_d 360448 --batch_count 144 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 512 -n 512 -k 9600 --alpha 1 --a_type f32_r --lda 9600 --stride_a 4915200 --b_type f32_r --ldb 9600 --stride_b 4915200 --beta 0 --c_type f32_r --ldc 512 --stride_c 262144 --d_type f32_r --ldd 512 --stride_d 262144 --batch_count 49 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+    118 ./rocblas-bench -f gemm_strided_batched_ex --transposeA T --transposeB N -m 512 -n 640 -k 9600 --alpha 1 --a_type f32_r --lda 9600 --stride_a 4915200 --b_type f32_r --ldb 9600 --stride_b 6144000 --beta 0 --c_type f32_r --ldc 512 --stride_c 327680 --d_type f32_r --ldd 512 --stride_d 327680 --batch_count 49 --compute_type f32_r --algo 0 --solution_index 0 --flags 0
+      2 ./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 3 -n 1 -k 3 --alpha 1 --lda 3 --stride_a 9 --ldb 3 --stride_b 3 --beta 0 --ldc 3 --stride_c 3 --batch_count 8921088
+      2 ./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB N -m 3 -n 3 -k 3 --alpha 1 --lda 3 --stride_a 9 --ldb 4 --stride_b 16 --beta 0 --ldc 3 --stride_c 9 --batch_count 144
+      4 ./rocblas-bench -f gemm_strided_batched -r f32_r --transposeA T --transposeB T -m 3 -n 1 -k 3 --alpha 1 --lda 3 --stride_a 9 --ldb 1 --stride_b 3 --beta 0 --ldc 3 --stride_c 3 --batch_count 8921088
+    238 ./rocblas-bench -f gemm_strided_batched -r f64_r --transposeA N --transposeB N -m 4 -n 4 -k 4 --alpha 1 --lda 4 --stride_a 16 --ldb 4 --stride_b 16 --beta 0 --ldc 4 --stride_c 16 --batch_count 144
--- a/docker-hub/FlashOCC/Flashocc/start_flashocc.sh
+++ b/docker-hub/FlashOCC/Flashocc/start_flashocc.sh
+#!/bin/bash
+#
+# Launch FlashOCC distributed training through tools/dist_train_numa.sh and
+# tee everything it prints into a timestamped log file.
+#
+# Usage: start_flashocc.sh [MASTER_ADDR] [NNODES] [NODE_RANK] [CONFIG]
+#   MASTER_ADDR  default: localhost
+#   NNODES       default: 1
+#   NODE_RANK    default: 0
+#   CONFIG       default: projects/configs/flashocc/flashocc-r50.py
+
+# Without pipefail the exit status of the final pipeline would be that of
+# `tee`, hiding a failed training run from the caller.
+set -o pipefail
+
+# NCCL settings
+export NCCL_TOPO_FILE=null
+export NCCL_ALGO=Ring
+export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
+
+# MIOpen / rocBLAS / HSA settings
+export PYTORCH_MIOPEN_SUGGEST_NHWC=1
+export MIOPEN_PRECISION_FP32_FP32_FP32_TF32_FP32=1
+export MIOPEN_FIND_MODE=1
+export ROCBLAS_MATH_MODE=1
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+
+# TorchInductor settings
+export TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1
+export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
+
+# Optional debug logging; uncomment to enable.
+#export MIOPEN_ENABLE_LOGGING=1        # enable MIOpen logging (default 0)
+#export MIOPEN_ENABLE_LOGGING_CMD=1    # also log the command (CMD) information (default 0)
+#export MIOPEN_LOG_LEVEL=6             # log verbosity level (default 0)
+#export ROCBLAS_LAYER=3                # enable rocBLAS log output (default 0)
+
+# Timestamp that makes the log file name unique per launch.
+TIME=$(date "+%Y-%m-%d_%H_%M")
+
+MASTER_ADDR=${1:-localhost}
+NNODES=${2:-1}
+NODE_RANK=${3:-0}
+CONFIG=${4:-projects/configs/flashocc/flashocc-r50.py}
+
+# Arguments are quoted so a value containing spaces or glob characters is
+# passed through as a single, unmodified word.
+bash tools/dist_train_numa.sh "$MASTER_ADDR" "$NNODES" "$NODE_RANK" "$CONFIG" \
+	2>&1 | tee "cvm_bw1000_flashocc_${NNODES}nodes_${TIME}.log"
--- a/docker-hub/FlashOCC/Flashocc/tools/analysis_tools/analyze_logs.py
+++ b/docker-hub/FlashOCC/Flashocc/tools/analysis_tools/analyze_logs.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import argparse
+import json
+from collections import defaultdict
+
+import numpy as np
+import seaborn as sns
+from matplotlib import pyplot as plt
+
+
+def cal_train_time(log_dicts, args):
+    for i, log_dict in enumerate(log_dicts):
+        print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}')
+        all_times = []
+        for epoch in log_dict.keys():
+            if args.include_outliers:
+                all_times.append(log_dict[epoch]['time'])
+            else:
+                all_times.append(log_dict[epoch]['time'][1:])
+        all_times = np.array(all_times)
+        epoch_ave_time = all_times.mean(-1)
+        slowest_epoch = epoch_ave_time.argmax()
+        fastest_epoch = epoch_ave_time.argmin()
+        std_over_epoch = epoch_ave_time.std()
+        print(f'slowest epoch {slowest_epoch + 1}, '
+              f'average time is {epoch_ave_time[slowest_epoch]:.4f}')
+        print(f'fastest epoch {fastest_epoch + 1}, '
+              f'average time is {epoch_ave_time[fastest_epoch]:.4f}')
+        print(f'time std over epochs is {std_over_epoch:.4f}')
+        print(f'average iter time: {np.mean(all_times):.4f} s/iter')
+        print()
+
+
+def plot_curve(log_dicts, args):
+    """Plot the requested metrics of every json log, then show or save.
+
+    Args:
+        log_dicts (list[dict]): One dict per json log, mapping an epoch to
+            a dict of metric name -> list of logged values.
+        args (argparse.Namespace): Reads ``backend``, ``style``, ``legend``,
+            ``json_logs``, ``keys``, ``interval``, ``mode``, ``title`` and
+            ``out``.
+
+    Raises:
+        AssertionError: If the number of legend entries differs from
+            ``len(args.json_logs) * len(args.keys)``.
+        KeyError: If a metric is missing from the epoch at position
+            ``args.interval - 1`` of a log.
+    """
+    if args.backend is not None:
+        plt.switch_backend(args.backend)
+    sns.set_style(args.style)
+    # if legend is None, use {filename}_{key} as legend
+    legend = args.legend
+    if legend is None:
+        legend = []
+        for json_log in args.json_logs:
+            for metric in args.keys:
+                legend.append(f'{json_log}_{metric}')
+    # one legend entry is needed per (log, metric) pair
+    assert len(legend) == (len(args.json_logs) * len(args.keys))
+    metrics = args.keys
+
+    num_metrics = len(metrics)
+    for i, log_dict in enumerate(log_dicts):
+        epochs = list(log_dict.keys())
+        for j, metric in enumerate(metrics):
+            print(f'plot curve of {args.json_logs[i]}, metric is {metric}')
+            # the metric is looked up in the first epoch that gets plotted
+            if metric not in log_dict[epochs[args.interval - 1]]:
+                raise KeyError(
+                    f'{args.json_logs[i]} does not contain metric {metric}')
+
+            if args.mode == 'eval':
+                # eval mode: x axis is the epoch number
+                if min(epochs) == args.interval:
+                    x0 = args.interval
+                else:
+                    # if current training is resumed from previous checkpoint
+                    # we lost information in early epochs
+                    # `xs` should start according to `min(epochs)`
+                    if min(epochs) % args.interval == 0:
+                        x0 = min(epochs)
+                    else:
+                        # find the first epoch that do eval
+                        x0 = min(epochs) + args.interval - \
+                            min(epochs) % args.interval
+                xs = np.arange(x0, max(epochs) + 1, args.interval)
+                ys = []
+                for epoch in epochs[args.interval - 1::args.interval]:
+                    ys += log_dict[epoch][metric]
+
+                # if training is aborted before eval of the last epoch
+                # `xs` and `ys` will have different length and cause an error
+                # check if `ys[-1]` is empty here
+                # (`epoch` still holds the last epoch visited by the loop above)
+                if not log_dict[epoch][metric]:
+                    xs = xs[:-1]
+
+                ax = plt.gca()
+                ax.set_xticks(xs)
+                plt.xlabel('epoch')
+                plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o')
+            else:
+                # any other mode: x axis is the global iteration index
+                xs = []
+                ys = []
+                # last logged iteration of the first plotted epoch, used as
+                # the per-epoch offset for all epochs
+                num_iters_per_epoch = \
+                    log_dict[epochs[args.interval-1]]['iter'][-1]
+                for epoch in epochs[args.interval - 1::args.interval]:
+                    iters = log_dict[epoch]['iter']
+                    # drop the last entry when it was logged in 'val' mode
+                    if log_dict[epoch]['mode'][-1] == 'val':
+                        iters = iters[:-1]
+                    xs.append(
+                        np.array(iters) + (epoch - 1) * num_iters_per_epoch)
+                    ys.append(np.array(log_dict[epoch][metric][:len(iters)]))
+                xs = np.concatenate(xs)
+                ys = np.concatenate(ys)
+                plt.xlabel('iter')
+                plt.plot(
+                    xs, ys, label=legend[i * num_metrics + j], linewidth=0.5)
+            plt.legend()
+        if args.title is not None:
+            plt.title(args.title)
+    # show interactively unless an output path was given
+    if args.out is None:
+        plt.show()
+    else:
+        print(f'save curve to: {args.out}')
+        plt.savefig(args.out)
+        plt.cla()
+
+
+def add_plot_parser(subparsers):
+    parser_plt = subparsers.add_parser(
+        'plot_curve', help='parser for plotting curves')
+    parser_plt.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_plt.add_argument(
+        '--keys',
+        type=str,
+        nargs='+',
+        default=['mAP_0.25'],
+        help='the metric that you want to plot')
+    parser_plt.add_argument('--title', type=str, help='title of figure')
+    parser_plt.add_argument(
+        '--legend',
+        type=str,
+        nargs='+',
+        default=None,
+        help='legend of each plot')
+    parser_plt.add_argument(
+        '--backend', type=str, default=None, help='backend of plt')
+    parser_plt.add_argument(
+        '--style', type=str, default='dark', help='style of plt')
+    parser_plt.add_argument('--out', type=str, default=None)
+    parser_plt.add_argument('--mode', type=str, default='train')
+    parser_plt.add_argument('--interval', type=int, default=1)
+
+
+def add_time_parser(subparsers):
+    parser_time = subparsers.add_parser(
+        'cal_train_time',
+        help='parser for computing the average time per training iteration')
+    parser_time.add_argument(
+        'json_logs',
+        type=str,
+        nargs='+',
+        help='path of train log in json format')
+    parser_time.add_argument(
+        '--include-outliers',
+        action='store_true',
+        help='include the first value of every epoch when computing '
+        'the average time')
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Analyze Json Log')
+    # currently only support plot curve and calculate average train time
+    subparsers = parser.add_subparsers(dest='task', help='task parser')
+    add_plot_parser(subparsers)
+    add_time_parser(subparsers)
+    args = parser.parse_args()
+    return args
+
+
+def load_json_logs(json_logs):
+    # load and convert json_logs to log_dict, key is epoch, value is a sub dict
+    # keys of sub dict is different metrics, e.g. memory, bbox_mAP
+    # value of sub dict is a list of corresponding values of all iterations
+    log_dicts = [dict() for _ in json_logs]
+    for json_log, log_dict in zip(json_logs, log_dicts):
+        with open(json_log, 'r') as log_file:
+            for line in log_file:
+                log = json.loads(line.strip())
+                # skip lines without `epoch` field
+                if 'epoch' not in log:
+                    continue
+                epoch = log.pop('epoch')
+                if epoch not in log_dict:
+                    log_dict[epoch] = defaultdict(list)
+                for k, v in log.items():
+                    log_dict[epoch][k].append(v)
+    return log_dicts
+
+
+def main():
+    args = parse_args()
+
+    json_logs = args.json_logs
+    for json_log in json_logs:
+        assert json_log.endswith('.json')
+
+    log_dicts = load_json_logs(json_logs)
+
+    eval(args.task)(log_dicts, args)
+
+
+if __name__ == '__main__':
+    main()