add code

d2b71343 · 雍大凯 · 69e57885 · d2b71343 · d2b71343 · c9541b0d
Commit d2b71343 authored Apr 08, 2026 by 雍大凯
20 changed files
--- a/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.cu
+++ b/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.cu
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+#include <torch/extension.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+#include <string>
+#include <iostream>
+#define MAX_D 1446 // 700 + 700 + 45 + 1; length of the fixed-size per-ray scratch arrays declared inside the kernels
+#define MAX_STEP 1000 // a ray traversal loop breaks once its step counter exceeds this value
+enum LossType {L1, L2, ABSREL}; // selected in render_cuda from loss_name: "l1" and "bce" -> L1, "l2" -> L2, "absrel" -> ABSREL
+enum PhaseName {TEST, TRAIN}; // selected in render_forward_cuda from phase_name: "test" -> TEST, "train" -> TRAIN
+// Marks every voxel that contains at least one valid point as occupied.
+// Launch layout: blockIdx.y selects the batch element n, the flattened x index selects the
+// point c; surplus threads (c >= M) do nothing.
+//   points    : read as points[n][c][0..2] = (x, y, z)
+//   tindex    : read as tindex[n][c]; a negative value marks a padded point that is skipped
+//   occupancy : written as occupancy[n][ts][z][y][x] = 1, where ts is 0 when the time
+//               dimension has size 1 and the point's time index otherwise
+// Several threads may store 1 into the same voxel; every store writes the same value.
+template <typename scalar_t>
+__global__ void init_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy) {
+    // batch index
+    const auto n = blockIdx.y;
+    // point index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+    // number of points per batch element
+    const auto M = points.size(1);
+    const auto T = occupancy.size(1);
+    // more threads are launched than there are points
+    if (c >= M) return;
+    const auto t = tindex[n][c];
+    // a time index beyond the grid is invalid unless the grid is static (T == 1)
+    assert(T == 1 || t < T);
+    // negative time index: padded point
+    if (t < 0) return;
+    // time slice to write; the time index is stored as a floating-point value
+    const int ts = (T == 1) ? 0 : static_cast<int>(t);
+    // grid shape
+    const int vzsize = occupancy.size(2);
+    const int vysize = occupancy.size(3);
+    const int vxsize = occupancy.size(4);
+    // Voxel containing the point. floor() is required here: a plain int conversion truncates
+    // toward zero, which would map coordinates in (-1, 0) onto voxel 0 and mark a voxel for a
+    // point that lies outside the grid.
+    const int vx = static_cast<int>(floor(static_cast<double>(points[n][c][0])));
+    const int vy = static_cast<int>(floor(static_cast<double>(points[n][c][1])));
+    const int vz = static_cast<int>(floor(static_cast<double>(points[n][c][2])));
+    const bool inside = (0 <= vx && vx < vxsize) &&
+                        (0 <= vy && vy < vysize) &&
+                        (0 <= vz && vz < vzsize);
+    if (inside) {
+        occupancy[n][ts][vz][vy][vx] = 1;
+    }
+}
+// Forward-only ray casting: one thread per ray, blockIdx.y selects the batch element n and the
+// flattened x index selects the ray c; surplus threads (c >= M) do nothing.
+// A ray starts at origin[n][t] (t = tindex[n][c]) and points towards points[n][c]. The grid is
+// walked voxel by voxel and the ray is reported as stopping in the first traversed voxel whose
+// sigma exceeds 0.5; if there is none, the last traversed in-grid voxel is reported.
+// Outputs, written only for rays that traverse at least one in-grid voxel:
+//   pred_dist[n][c]   : distance along the ray at which it leaves the reported voxel
+//   gt_dist[n][c]     : origin-to-point distance, clamped to the distance at which the ray
+//                       leaves the grid when train_phase == TRAIN
+//   coord_index[n][c] : (x, y, z) index of the reported voxel
+// Padded rays (tindex < 0), zero-length rays and rays that never enter the grid leave all
+// three outputs untouched.
+template <typename scalar_t>
+__global__ void render_forward_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> coord_index,
+    PhaseName train_phase) {
+    // batch index
+    const auto n = blockIdx.y;
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+    // number of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+    // more threads are launched than there are rays
+    if (c >= M) return;
+    const auto t = tindex[n][c];
+    // a time index beyond the grid is invalid unless sigma is static (T == 1)
+    assert(T == 1 || t < T);
+    // negative time index: padded ray
+    if (t < 0) return;
+    // the time index is stored as a floating-point value; convert it once
+    const int ti = static_cast<int>(t);
+    // time slice of sigma; a static sigma (T == 1) is shared by all time steps
+    const int ts = (T == 1) ? 0 : ti;
+    // grid shape
+    const int vzsize = sigma.size(2);
+    const int vysize = sigma.size(3);
+    const int vxsize = sigma.size(4);
+    // ray origin
+    const double xo = origin[n][ti][0];
+    const double yo = origin[n][ti][1];
+    const double zo = origin[n][ti][2];
+    // ray end point
+    const double xe = points[n][c][0];
+    const double ye = points[n][c][1];
+    const double ze = points[n][c][2];
+    // origin to end
+    const double rx = xe - xo;
+    const double ry = ye - yo;
+    const double rz = ze - zo;
+    double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+    // A zero-length ray has no direction: normalising it would yield NaN and the NaN would be
+    // written to the outputs. Treat it like a ray that misses the grid (also rejects NaN).
+    if (!(gt_d > 0.0)) return;
+    // unit direction
+    const double dx = rx / gt_d;
+    const double dy = ry / gt_d;
+    const double dz = rz / gt_d;
+    // direction in which the voxel indices change
+    const int stepX = (dx >= 0) ? 1 : -1;
+    const int stepY = (dy >= 0) ? 1 : -1;
+    const int stepZ = (dz >= 0) ? 1 : -1;
+    // Voxel containing the origin. floor() rather than truncation, so that an origin on the
+    // negative side of the grid starts in voxel -1 instead of being folded into voxel 0.
+    int vx = static_cast<int>(floor(xo));
+    int vy = static_cast<int>(floor(yo));
+    int vz = static_cast<int>(floor(zo));
+    // first voxel boundary ahead of the origin on each axis
+    const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+    const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+    const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+    // tMax*: ray distance at which the next boundary on that axis is crossed
+    double tMaxX = (dx != 0) ? (next_voxel_boundary_x - xo) / dx : DBL_MAX;
+    double tMaxY = (dy != 0) ? (next_voxel_boundary_y - yo) / dy : DBL_MAX;
+    double tMaxZ = (dz != 0) ? (next_voxel_boundary_z - zo) / dz : DBL_MAX;
+    // tDelta*: ray distance between two consecutive boundaries on that axis
+    const double tDeltaX = (dx != 0) ? stepX / dx : DBL_MAX;
+    const double tDeltaY = (dy != 0) ? stepY / dy : DBL_MAX;
+    const double tDeltaZ = (dz != 0) ? stepZ / dz : DBL_MAX;
+    int step = 0;            // iterations of the traversal loop
+    int count = 0;           // voxels traversed inside the grid
+    bool was_inside = false; // the ray has been inside the grid at some point
+    bool hit = false;        // a voxel with sigma > 0.5 has been found
+    double exp_d = 0.0;      // exit distance of the reported voxel
+    double max_d = 0.0;      // exit distance of the last in-grid voxel
+    int x = 0;               // reported voxel index
+    int y = 0;
+    int z = 0;
+    while (true) {
+        const bool inside = (0 <= vx && vx < vxsize) &&
+            (0 <= vy && vy < vysize) &&
+            (0 <= vz && vz < vzsize);
+        if (inside) {
+            was_inside = true;
+        } else if (was_inside) {
+            // the ray has left the grid and cannot come back
+            break;
+        }
+        // voxel the ray is leaving in this iteration
+        const int cx = vx;
+        const int cy = vy;
+        const int cz = vz;
+        // _d: ray distance at which the current voxel is left
+        double _d = 0.0;
+        // advance along the axis whose boundary is crossed first
+        if (tMaxX < tMaxY) {
+            if (tMaxX < tMaxZ) {
+                _d = tMaxX;
+                vx += stepX;
+                tMaxX += tDeltaX;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        } else {
+            if (tMaxY < tMaxZ) {
+                _d = tMaxY;
+                vy += stepY;
+                tMaxY += tDeltaY;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        }
+        if (inside) {
+            max_d = _d;
+            // Until a hit is found the reported voxel follows the traversal, so a ray without
+            // any hit ends up reporting the last in-grid voxel.
+            if (!hit) {
+                exp_d = _d;
+                x = cx;
+                y = cy;
+                z = cz;
+                const double occ = sigma[n][ts][cz][cy][cx];
+                hit = (occ > 0.5);
+            }
+            count ++;
+        }
+        step ++;
+        if (step > MAX_STEP) {
+            break;
+        }
+    }
+    if (count > 0) {
+        // during training the target cannot lie beyond the point where the ray leaves the grid
+        if (train_phase == TRAIN) {
+            gt_d = min(gt_d, max_d);
+        }
+        pred_dist[n][c] = exp_d;
+        gt_dist[n][c] = gt_d;
+        coord_index[n][c][0] = double(x);
+        coord_index[n][c][1] = double(y);
+        coord_index[n][c][2] = double(z);
+    }
+}
+/*
+ * Host launcher for render_forward_cuda_kernel (one thread per ray, one grid row per batch
+ * element).
+ * input
+ *   sigma      : 5-D, indexed [n][t][z][y][x]
+ *   origin     : 3-D, N x T x 3
+ *   points     : 3-D, N x M x (3 or more); only the first three components are read
+ *   tindex     : 2-D, N x M
+ *   grid       : not used by this function; kept so existing callers keep working
+ *   phase_name : "test" or "train"; anything else raises an error
+ * output
+ *   {pred_dist (N x M), gt_dist (N x M), coord_index (N x M x 3)}
+ *   pred_dist and gt_dist are pre-filled with -1 and coord_index with 0; the kernel only
+ *   overwrites entries of rays that traverse the grid. All outputs share sigma's dtype and
+ *   device.
+ */
+std::vector<torch::Tensor> render_forward_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid,
+    std::string phase_name) {
+    (void)grid;
+    // Report a bad phase name as an exception instead of terminating the host process.
+    PhaseName train_phase = TEST;
+    if (phase_name == "test") {
+        train_phase = TEST;
+    } else if (phase_name == "train") {
+        train_phase = TRAIN;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN PHASE NAME: ", phase_name);
+    }
+    TORCH_CHECK(sigma.is_cuda(), "render_forward_cuda: sigma must be a CUDA tensor");
+    TORCH_CHECK(origin.device() == sigma.device() &&
+                points.device() == sigma.device() &&
+                tindex.device() == sigma.device(),
+                "render_forward_cuda: all tensors must be on the same device");
+    TORCH_CHECK(points.dim() == 3 && points.size(2) >= 3,
+                "render_forward_cuda: points must be N x M x (>=3)");
+    TORCH_CHECK(origin.dim() == 3 && origin.size(2) >= 3,
+                "render_forward_cuda: origin must be N x T x (>=3)");
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+    // Allocate the outputs with sigma's dtype: the kernel accesses them as scalar_t, so a
+    // default float32 allocation breaks as soon as sigma is double.
+    const auto options = sigma.options();
+    auto gt_dist = torch::full({N, M}, -1, options);
+    auto pred_dist = torch::full({N, M}, -1, options);
+    auto coord_index = torch::zeros({N, M, 3}, options);
+    // A zero-sized grid is an invalid launch configuration, so skip the launch entirely.
+    if (N > 0 && M > 0) {
+        const int threads = 1024;
+        const dim3 blocks((M + threads - 1) / threads, N);
+        AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_forward_cuda", ([&] {
+                    render_forward_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                        sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                        origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                        points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                        tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                        pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                        gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                        coord_index.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                        train_phase);
+                }));
+        // launch-configuration errors show up here, in-kernel faults at the synchronisation
+        cudaError_t err = cudaGetLastError();
+        TORCH_CHECK(err == cudaSuccess,
+                    "render_forward_cuda: kernel launch failed: ", cudaGetErrorString(err));
+        err = cudaDeviceSynchronize();
+        TORCH_CHECK(err == cudaSuccess,
+                    "render_forward_cuda: kernel execution failed: ", cudaGetErrorString(err));
+    }
+    return {pred_dist, gt_dist, coord_index};
+}
+// Differentiable ray rendering with an analytic gradient: one thread per ray, blockIdx.y
+// selects the batch element n and the flattened x index selects the ray c; surplus threads
+// (c >= M) do nothing.
+// A ray starts at origin[n][t] (t = tindex[n][c]) and points towards points[n][c]. For every
+// in-grid voxel on the ray the kernel accumulates sigma * segment length, turns it into a
+// per-voxel stopping probability and computes the expected stopping distance.
+// Outputs, written only for rays that traverse at least one in-grid voxel:
+//   pred_dist[n][c] : expected stopping distance, including the probability of leaving the
+//                     grid placed at the grid exit distance
+//   gt_dist[n][c]   : origin-to-point distance clamped to the grid exit distance
+//   grad_sigma      : d(loss)/d(sigma), accumulated over all rays with atomicAdd because
+//                     several rays can cross the same voxel (double atomicAdd needs SM60+)
+// Padded rays (tindex < 0), zero-length rays and rays that never enter the grid contribute
+// nothing.
+template <typename scalar_t>
+__global__ void render_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma,
+    LossType loss_type) {
+    // batch index
+    const auto n = blockIdx.y;
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+    // number of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+    // more threads are launched than there are rays
+    if (c >= M) return;
+    const auto t = tindex[n][c];
+    // a time index beyond the grid is invalid unless sigma is static (T == 1)
+    assert(T == 1 || t < T);
+    // negative time index: padded ray
+    if (t < 0) return;
+    // the time index is stored as a floating-point value; convert it once
+    const int ti = static_cast<int>(t);
+    // time slice of sigma; a static sigma (T == 1) is shared by all time steps
+    const int ts = (T == 1) ? 0 : ti;
+    // grid shape
+    const int vzsize = sigma.size(2);
+    const int vysize = sigma.size(3);
+    const int vxsize = sigma.size(4);
+    // ray origin
+    const double xo = origin[n][ti][0];
+    const double yo = origin[n][ti][1];
+    const double zo = origin[n][ti][2];
+    // ray end point
+    const double xe = points[n][c][0];
+    const double ye = points[n][c][1];
+    const double ze = points[n][c][2];
+    // origin to end
+    const double rx = xe - xo;
+    const double ry = ye - yo;
+    const double rz = ze - zo;
+    double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+    // A zero-length ray has no direction: normalising it would yield NaN, and that NaN would
+    // be added into grad_sigma. Skip such rays (this also rejects a NaN length).
+    if (!(gt_d > 0.0)) return;
+    // unit direction
+    const double dx = rx / gt_d;
+    const double dy = ry / gt_d;
+    const double dz = rz / gt_d;
+    // direction in which the voxel indices change
+    const int stepX = (dx >= 0) ? 1 : -1;
+    const int stepY = (dy >= 0) ? 1 : -1;
+    const int stepZ = (dz >= 0) ? 1 : -1;
+    // Voxel containing the origin. floor() rather than truncation, so that an origin on the
+    // negative side of the grid starts in voxel -1 instead of being folded into voxel 0.
+    int vx = static_cast<int>(floor(xo));
+    int vy = static_cast<int>(floor(yo));
+    int vz = static_cast<int>(floor(zo));
+    // first voxel boundary ahead of the origin on each axis
+    const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+    const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+    const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+    // tMax*: ray distance at which the next boundary on that axis is crossed
+    double tMaxX = (dx != 0) ? (next_voxel_boundary_x - xo) / dx : DBL_MAX;
+    double tMaxY = (dy != 0) ? (next_voxel_boundary_y - yo) / dy : DBL_MAX;
+    double tMaxZ = (dz != 0) ? (next_voxel_boundary_z - zo) / dz : DBL_MAX;
+    // tDelta*: ray distance between two consecutive boundaries on that axis
+    const double tDeltaX = (dx != 0) ? stepX / dx : DBL_MAX;
+    const double tDeltaY = (dy != 0) ? stepY / dy : DBL_MAX;
+    const double tDeltaZ = (dz != 0) ? stepZ / dz : DBL_MAX;
+    // per-ray scratch, one entry per traversed in-grid voxel
+    int3 path[MAX_D];    // voxel index
+    double csd[MAX_D];   // cumulative sum of sigma times segment length
+    double p[MAX_D];     // probability that the ray stops inside the voxel
+    double d[MAX_D];     // ray distance at which the voxel is left
+    double dt[MAX_D];    // length of the ray segment inside the voxel
+    int step = 0;        // iterations of the traversal loop
+    int count = 0;       // voxels traversed inside the grid
+    double last_d = 0.0; // ray distance at which the previous voxel was left
+    bool was_inside = false;
+    while (true) {
+        const bool inside = (0 <= vx && vx < vxsize) &&
+            (0 <= vy && vy < vysize) &&
+            (0 <= vz && vz < vzsize);
+        if (inside) {
+            was_inside = true;
+            path[count] = make_int3(vx, vy, vz);
+        } else if (was_inside) {
+            // the ray has left the grid and cannot come back
+            break;
+        } else if (last_d > gt_d) {
+            // the end point was passed before the grid was ever entered
+            break;
+        }
+        // _d: ray distance at which the current voxel is left
+        double _d = 0.0;
+        // advance along the axis whose boundary is crossed first
+        if (tMaxX < tMaxY) {
+            if (tMaxX < tMaxZ) {
+                _d = tMaxX;
+                vx += stepX;
+                tMaxX += tDeltaX;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        } else {
+            if (tMaxY < tMaxZ) {
+                _d = tMaxY;
+                vy += stepY;
+                tMaxY += tDeltaY;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        }
+        if (inside) {
+            const int3 &v = path[count];
+            const double _sigma = sigma[n][ts][v.z][v.y][v.x];
+            // clamp at zero so that a non-increasing exit distance cannot yield a negative length
+            const double _delta = max(0.0, _d - last_d);
+            const double sd = _sigma * _delta;
+            if (count == 0) {
+                // first voxel inside the grid
+                csd[count] = sd;
+                p[count] = 1 - exp(-sd);
+            } else {
+                csd[count] = csd[count-1] + sd;
+                p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+            }
+            d[count] = _d;
+            dt[count] = _delta;
+            count ++;
+        }
+        last_d = _d;
+        step ++;
+        if (step > MAX_STEP) {
+            break;
+        }
+    }
+    // MAX_STEP bounds the loop, so count must fit into the scratch arrays
+    assert(count <= MAX_D);
+    // nothing to do when the ray never intersects the grid
+    if (count == 0) return;
+    // expected stopping distance over the traversed voxels
+    double exp_d = 0.0;
+    for (int i = 0; i < count; i ++)
+        exp_d += p[i] * d[i];
+    // probability of leaving the grid, accounted for at the grid exit distance
+    const double p_out = exp(-csd[count-1]);
+    const double max_d = d[count-1];
+    exp_d += (p_out * max_d);
+    gt_d = min(gt_d, max_d);
+    pred_dist[n][c] = exp_d;
+    gt_dist[n][c] = gt_d;
+    /* backward raymarching: d(exp_d)/d(sigma_i) for every traversed voxel */
+    double dd_dsigma[MAX_D];
+    for (int i = count - 1; i >= 0; i --) {
+        if (i == count - 1)
+            dd_dsigma[i] = p_out * max_d;
+        else
+            dd_dsigma[i] = dd_dsigma[i+1] - exp(-csd[i]) * (d[i+1] - d[i]);
+    }
+    for (int i = count - 1; i >= 0; i --)
+        dd_dsigma[i] *= dt[i];
+    // cap at the grid boundary
+    for (int i = count - 1; i >= 0; i --)
+        dd_dsigma[i] -= dt[i] * p_out * max_d;
+    // d(loss)/d(exp_d)
+    double dl_dd = 1.0;
+    if (loss_type == L1)
+        dl_dd = (exp_d >= gt_d) ? 1 : -1;
+    else if (loss_type == L2)
+        dl_dd = (exp_d - gt_d);
+    else if (loss_type == ABSREL)
+        dl_dd = (exp_d >= gt_d) ? (1.0/gt_d) : -(1.0/gt_d);
+    // chain rule; atomic because other rays may be updating the same voxel concurrently
+    for (int i = 0; i < count; i ++) {
+        const int3 &v = path[i];
+        atomicAdd(&grad_sigma[n][ts][v.z][v.y][v.x],
+                  static_cast<scalar_t>(dl_dd * dd_dsigma[i]));
+    }
+}
+/*
+ * Host launcher for render_cuda_kernel (one thread per ray, one grid row per batch element).
+ * input
+ *   sigma     : 5-D, indexed [n][t][z][y][x]
+ *   origin    : 3-D, N x T x 3
+ *   points    : 3-D, N x M x (3 or more); only the first three components are read
+ *   tindex    : 2-D, N x M
+ *   loss_name : "l1", "l2", "absrel" or "bce" ("bce" uses the L1 gradient); anything else
+ *               raises an error
+ * output
+ *   {pred_dist (N x M), gt_dist (N x M), grad_sigma (same shape as sigma)}
+ *   pred_dist and gt_dist are pre-filled with -1 and grad_sigma with 0; the kernel only
+ *   touches entries of rays that traverse the grid. All outputs share sigma's dtype and
+ *   device.
+ */
+std::vector<torch::Tensor> render_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    std::string loss_name) {
+    // Report a bad loss name as an exception instead of terminating the host process.
+    LossType loss_type = L1;
+    if (loss_name == "l1") {
+        loss_type = L1;
+    } else if (loss_name == "l2") {
+        loss_type = L2;
+    } else if (loss_name == "absrel") {
+        loss_type = ABSREL;
+    } else if (loss_name == "bce") {
+        loss_type = L1;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN LOSS TYPE: ", loss_name);
+    }
+    TORCH_CHECK(sigma.is_cuda(), "render_cuda: sigma must be a CUDA tensor");
+    TORCH_CHECK(origin.device() == sigma.device() &&
+                points.device() == sigma.device() &&
+                tindex.device() == sigma.device(),
+                "render_cuda: all tensors must be on the same device");
+    TORCH_CHECK(points.dim() == 3 && points.size(2) >= 3,
+                "render_cuda: points must be N x M x (>=3)");
+    TORCH_CHECK(origin.dim() == 3 && origin.size(2) >= 3,
+                "render_cuda: origin must be N x T x (>=3)");
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+    // Allocate the outputs with sigma's dtype: the kernel accesses them as scalar_t, so a
+    // default float32 allocation breaks as soon as sigma is double.
+    const auto options = sigma.options();
+    auto gt_dist = torch::full({N, M}, -1, options);
+    auto pred_dist = torch::full({N, M}, -1, options);
+    auto grad_sigma = torch::zeros_like(sigma);
+    // A zero-sized grid is an invalid launch configuration, so skip the launch entirely.
+    if (N > 0 && M > 0) {
+        const int threads = 1024;
+        const dim3 blocks((M + threads - 1) / threads, N);
+        AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_cuda", ([&] {
+                    render_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                        sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                        origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                        points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                        tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                        pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                        gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                        grad_sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                        loss_type);
+                }));
+        // launch-configuration errors show up here, in-kernel faults at the synchronisation
+        cudaError_t err = cudaGetLastError();
+        TORCH_CHECK(err == cudaSuccess,
+                    "render_cuda: kernel launch failed: ", cudaGetErrorString(err));
+        err = cudaDeviceSynchronize();
+        TORCH_CHECK(err == cudaSuccess,
+                    "render_cuda: kernel execution failed: ", cudaGetErrorString(err));
+    }
+    return {pred_dist, gt_dist, grad_sigma};
+}
+/*
+ * Host launcher for init_cuda_kernel: builds an occupancy grid in which every voxel that
+ * contains at least one valid point is set to 1.
+ * input
+ *   points : 3-D, N x M x (3 or more); only the first three components are read
+ *   tindex : 2-D, N x M; negative entries mark padded points
+ *   grid   : {T, H, L, W}, the occupancy shape without the batch dimension
+ * output
+ *   occupancy : N x T x H x L x W, same dtype and device as points
+ */
+torch::Tensor init_cuda(
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid) {
+    // grid is indexed up to [3] below; reject shorter vectors instead of reading past the end
+    TORCH_CHECK(grid.size() >= 4, "init_cuda: grid must hold {T, H, L, W}");
+    TORCH_CHECK(points.is_cuda(), "init_cuda: points must be a CUDA tensor");
+    TORCH_CHECK(tindex.device() == points.device(),
+                "init_cuda: points and tindex must be on the same device");
+    TORCH_CHECK(points.dim() == 3 && points.size(2) >= 3,
+                "init_cuda: points must be N x M x (>=3)");
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of points
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+    const auto options = torch::TensorOptions()
+                             .dtype(points.dtype())
+                             .device(points.device())
+                             .requires_grad(false);
+    auto occupancy = torch::zeros({N, T, H, L, W}, options);
+    // A zero-sized grid is an invalid launch configuration, so skip the launch entirely.
+    if (N > 0 && M > 0) {
+        const int threads = 1024;
+        const dim3 blocks((M + threads - 1) / threads, N);
+        AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "init_cuda", ([&] {
+                    init_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                        points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                        tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                        occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>());
+                }));
+        // launch-configuration errors show up here, in-kernel faults at the synchronisation
+        cudaError_t err = cudaGetLastError();
+        TORCH_CHECK(err == cudaSuccess,
+                    "init_cuda: kernel launch failed: ", cudaGetErrorString(err));
+        err = cudaDeviceSynchronize();
+        TORCH_CHECK(err == cudaSuccess,
+                    "init_cuda: kernel execution failed: ", cudaGetErrorString(err));
+    }
+    return occupancy;
+}
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.hip
+++ b/docker-hub/FlashOCC/Flashocc/lib/dvr/dvr.hip
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+#include <torch/extension.h>
+#include <stdio.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
+#include <vector>
+#include <string>
+#include <iostream>
+#define MAX_D 1446 // 700 + 700 + 45 + 1; length of the fixed-size per-ray scratch arrays declared inside the kernels
+#define MAX_STEP 1000
+enum LossType {L1, L2, ABSREL};
+enum PhaseName {TEST, TRAIN}; // TRAIN is passed to the forward kernel as train_phase
+// Marks every voxel that contains at least one valid point as occupied.
+// Launch layout: blockIdx.y selects the batch element n, the flattened x index selects the
+// point c; surplus threads (c >= M) do nothing.
+//   points    : read as points[n][c][0..2] = (x, y, z)
+//   tindex    : read as tindex[n][c]; a negative value marks a padded point that is skipped
+//   occupancy : written as occupancy[n][ts][z][y][x] = 1, where ts is 0 when the time
+//               dimension has size 1 and the point's time index otherwise
+// Several threads may store 1 into the same voxel; every store writes the same value.
+// NOTE(review): this file is marked as hipify output, so the same change has to be made in
+// the CUDA source it is generated from.
+template <typename scalar_t>
+__global__ void init_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy) {
+    // batch index
+    const auto n = blockIdx.y;
+    // point index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+    // number of points per batch element
+    const auto M = points.size(1);
+    const auto T = occupancy.size(1);
+    // more threads are launched than there are points
+    if (c >= M) return;
+    const auto t = tindex[n][c];
+    // a time index beyond the grid is invalid unless the grid is static (T == 1)
+    assert(T == 1 || t < T);
+    // negative time index: padded point
+    if (t < 0) return;
+    // time slice to write; the time index is stored as a floating-point value
+    const int ts = (T == 1) ? 0 : static_cast<int>(t);
+    // grid shape
+    const int vzsize = occupancy.size(2);
+    const int vysize = occupancy.size(3);
+    const int vxsize = occupancy.size(4);
+    // Voxel containing the point. floor() is required here: a plain int conversion truncates
+    // toward zero, which would map coordinates in (-1, 0) onto voxel 0 and mark a voxel for a
+    // point that lies outside the grid.
+    const int vx = static_cast<int>(floor(static_cast<double>(points[n][c][0])));
+    const int vy = static_cast<int>(floor(static_cast<double>(points[n][c][1])));
+    const int vz = static_cast<int>(floor(static_cast<double>(points[n][c][2])));
+    const bool inside = (0 <= vx && vx < vxsize) &&
+                        (0 <= vy && vy < vysize) &&
+                        (0 <= vz && vz < vzsize);
+    if (inside) {
+        occupancy[n][ts][vz][vy][vx] = 1;
+    }
+}
+template <typename scalar_t>
+__global__ void render_forward_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    // torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> pog,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> coord_index,
+    PhaseName train_phase) {
+    // batch index
+    const auto n = blockIdx.y;
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+    // num of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+    // we allocated more threads than num_rays
+    if (c < M) {
+        // ray end point
+        const auto t = tindex[n][c];
+        // invalid points
+        // assert(t < T);
+        assert(T == 1 || t < T);
+        // time index for sigma
+        // when T = 1, we have a static sigma
+        const auto ts = (T == 1) ? 0 : t;
+        // if t < 0, it is a padded point
+        if (t < 0) return;
+        // grid shape
+        const int vzsize = sigma.size(2);
+        const int vysize = sigma.size(3);
+        const int vxsize = sigma.size(4);
+        // assert(vzsize + vysize + vxsize <= MAX_D);
+        // origin
+        const double xo = origin[n][t][0];
+        const double yo = origin[n][t][1];
+        const double zo = origin[n][t][2];
+        // end point
+        const double xe = points[n][c][0];
+        const double ye = points[n][c][1];
+        const double ze = points[n][c][2];
+        // locate the voxel where the origin resides
+        const int vxo = int(xo);
+        const int vyo = int(yo);
+        const int vzo = int(zo);
+        const int vxe = int(xe);
+        const int vye = int(ye);
+        const int vze = int(ze);
+        // NOTE: new
+        int vx = vxo;
+        int vy = vyo;
+        int vz = vzo;
+        // origin to end
+        const double rx = xe - xo;
+        const double ry = ye - yo;
+        const double rz = ze - zo;
+        double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+        // directional vector
+        const double dx = rx / gt_d;
+        const double dy = ry / gt_d;
+        const double dz = rz / gt_d;
+        // In which direction the voxel ids are incremented.
+        const int stepX = (dx >= 0) ? 1 : -1;
+        const int stepY = (dy >= 0) ? 1 : -1;
+        const int stepZ = (dz >= 0) ? 1 : -1;
+        // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
+        const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+        const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+        const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+        // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
+        // the value of t at which the ray crosses the first vertical voxel boundary
+        double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
+        double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
+        double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
+        // tDeltaX, tDeltaY, tDeltaZ --
+        // how far along the ray we must move for the horizontal component to equal the width of a voxel
+        // the direction in which we traverse the grid
+        // can only be FLT_MAX if we never go in that direction
+        const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+        const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+        const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+        int3 path[MAX_D];
+        double csd[MAX_D];  // cumulative sum of sigma times delta
+        double p[MAX_D];  // alpha
+        double d[MAX_D];
+        // forward raymarching with voxel traversal
+        int step = 0;  // total number of voxels traversed
+        int count = 0;  // number of voxels traversed inside the voxel grid
+        double last_d = 0.0;  // correct initialization
+        // voxel traversal raycasting
+        bool was_inside = false;
+        while (true) {
+            bool inside = (0 <= vx && vx < vxsize) &&
+                (0 <= vy && vy < vysize) &&
+                (0 <= vz && vz < vzsize);
+            if (inside) {
+                was_inside = true;
+                path[count] = make_int3(vx, vy, vz);
+            } else if (was_inside) { // was but no longer inside
+                // we know we are not coming back so terminate
+                break;
+            } /*else if (last_d > gt_d) {
+                break;
+            } */
+            /*else { // has not gone inside yet
+                // assert(count == 0);
+                // (1) when we have hit the destination but haven't gone inside the voxel grid
+                // (2) when we have traveled MAX_D voxels but haven't found one valid voxel
+                //     handle intersection corner cases in case of infinite loop
+                bool hit = (vx == vxe && vy == vye && vz == vze);  // this test seems brittle with corner cases
+                if (hit || step >= MAX_D)
+                    break;
+                //if (last_d >= gt_d || step >= MAX_D) break;
+            } */
+            // _d represents the ray distance has traveled before escaping the current voxel cell
+            double _d = 0.0;
+            // voxel traversal
+            if (tMaxX < tMaxY) {
+                if (tMaxX < tMaxZ) {
+                    _d = tMaxX;
+                    vx += stepX;
+                    tMaxX += tDeltaX;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            } else {
+                if (tMaxY < tMaxZ) {
+                    _d = tMaxY;
+                    vy += stepY;
+                    tMaxY += tDeltaY;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            }
+            if (inside) {
+                // get sigma at the current voxel
+                const int3 &v = path[count];  // use the recorded index
+                const double _sigma = sigma[n][ts][v.z][v.y][v.x];
+                const double _delta = max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                const double sd = _sigma * _delta;
+                if (count == 0) { // the first voxel inside
+                    csd[count] = sd;
+                    p[count] = 1 - exp(-sd);
+                } else {
+                    csd[count] = csd[count-1] + sd;
+                    p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                }
+                // record the traveled distance
+                d[count] = _d;
+                // count the number of voxels we have escaped
+                count ++;
+            }
+            last_d = _d;
+            step ++;
+            if (step > MAX_STEP) {
+                break;
+            }
+        }
+        // the total number of voxels visited should not exceed this number
+        assert(count <= MAX_D);
+        if (count > 0) {
+            // compute the expected ray distance
+            //double exp_d = 0.0;
+            double exp_d = d[count-1];
+            const int3 &v_init = path[count-1];
+            int x = v_init.x;
+            int y = v_init.y;
+            int z = v_init.z;
+            for (int i = 0; i < count; i++) {
+                //printf("%f\t%f\n",p[i], d[i]);
+                //exp_d += p[i] * d[i];
+                const int3 &v = path[i];
+                const double occ = sigma[n][ts][v.z][v.y][v.x];
+                if (occ > 0.5) {
+                    exp_d = d[i];
+                    x = v.x;
+                    y = v.y;
+                    z = v.z;
+                    break;
+                }
+            }
+            //printf("%f\n",exp_d);
+            // add an imaginary sample at the end point should gt_d exceeds max_d
+            double p_out = exp(-csd[count-1]);
+            double max_d = d[count-1];
+            // if (gt_d > max_d)
+            //   exp_d += (p_out * gt_d);
+            // p_out is the probability the ray escapes the voxel grid
+            //exp_d += (p_out * max_d);
+            if (train_phase == 1) {
+                gt_d = min(gt_d, max_d);
+            }
+            // write the rendered ray distance (max_d)
+            pred_dist[n][c] = exp_d;
+            gt_dist[n][c] = gt_d;
+            coord_index[n][c][0] = double(x);
+            coord_index[n][c][1] = double(y);
+            coord_index[n][c][2] = double(z);
+            // // write occupancy
+            // for (int i = 0; i < count; i ++) {
+            //     const int3 &v = path[i];
+            //     auto & occ = pog[n][t][v.z][v.y][v.x];
+            //     if (p[i] >= occ) {
+            //         occ = p[i];
+            //     }
+            // }
+        }
+    }
+}
+/*
+ * Launches render_forward_cuda_kernel with one thread per ray
+ * (ceil(M / 1024) x N blocks of 1024 threads) and waits for it to finish.
+ *
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin     : N x T x 3
+ *   points     : N x M x 4
+ *   tindex     : N x M
+ *   grid       : not read by this function; kept so existing callers still work
+ *   phase_name : "test" or "train"; any other value raises an error
+ * output (in this order), all created with sigma's dtype and device
+ *   pred_dist   : N x M, initialised to -1
+ *   gt_dist     : N x M, initialised to -1
+ *   coord_index : N x M x 3, initialised to 0
+ */
+std::vector<torch::Tensor> render_forward_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid,
+    std::string phase_name) {
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+    // Resolve the phase before allocating anything. TORCH_CHECK raises a
+    // catchable error instead of exit(1) terminating the whole host process.
+    PhaseName train_phase = TEST;
+    if (phase_name.compare("test") == 0) {
+        train_phase = TEST;
+    } else if (phase_name.compare("train") == 0) {
+        train_phase = TRAIN;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN PHASE NAME: ", phase_name);
+    }
+    // The kernel is instantiated for sigma's scalar type, so the outputs must
+    // use the same dtype; otherwise packed_accessor32<scalar_t> rejects them
+    // (e.g. a double sigma with default float outputs).
+    const auto options = torch::TensorOptions()
+                             .dtype(sigma.dtype())
+                             .device(sigma.device())
+                             .requires_grad(false);
+    auto gt_dist = -torch::ones({N, M}, options);
+    auto pred_dist = -torch::ones({N, M}, options);
+    auto coord_index = torch::zeros({N, M, 3}, options);
+    // Nothing to render: a grid with a zero dimension is an invalid launch.
+    if (N == 0 || M == 0) {
+        return {pred_dist, gt_dist, coord_index};
+    }
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+    // perform rendering
+    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_forward_cuda", ([&] {
+        hipLaunchKernelGGL((render_forward_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
+            sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+            origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+            points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+            tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+            pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+            gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+            coord_index.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+            train_phase);
+    }));
+    // Launch-configuration errors are only visible through hipGetLastError().
+    const hipError_t launch_err = hipGetLastError();
+    TORCH_CHECK(launch_err == hipSuccess,
+                "render_forward_cuda: kernel launch failed: ", hipGetErrorString(launch_err));
+    // Faults raised while the kernel runs surface at the next synchronisation.
+    const hipError_t sync_err = hipDeviceSynchronize();
+    TORCH_CHECK(sync_err == hipSuccess,
+                "render_forward_cuda: kernel execution failed: ", hipGetErrorString(sync_err));
+    return {pred_dist, gt_dist, coord_index};
+}
+/*
+ * Renders the expected ray distance through sigma and accumulates the
+ * gradient of the chosen loss with respect to sigma.
+ *
+ * Launch layout: blockIdx.y is the batch index, and
+ * blockIdx.x * blockDim.x + threadIdx.x is the ray index; surplus threads exit.
+ * Each thread keeps several per-ray arrays of MAX_D entries.
+ *
+ * Rays with a negative time index, rays of zero length and rays that never
+ * enter the voxel grid leave pred_dist / gt_dist / grad_sigma untouched.
+ *
+ * Gradients are accumulated with atomicAdd because different rays can cross
+ * the same voxel. NOTE: atomicAdd on double requires native support
+ * (SM60+ when built for CUDA).
+ */
+template <typename scalar_t>
+__global__ void render_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma,
+    LossType loss_type) {
+    // batch index
+    const auto n = blockIdx.y;
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+    // num of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+    // we allocated more threads than num_rays
+    if (c < M) {
+        // time index of this ray
+        const auto t = tindex[n][c];
+        // invalid points
+        assert(T == 1 || t < T);
+        // time index for sigma
+        // when T = 1, we have a static sigma
+        const auto ts = (T == 1) ? 0 : t;
+        // if t < 0, it is a padded point
+        if (t < 0) return;
+        // grid shape
+        const int vzsize = sigma.size(2);
+        const int vysize = sigma.size(3);
+        const int vxsize = sigma.size(4);
+        // origin
+        const double xo = origin[n][t][0];
+        const double yo = origin[n][t][1];
+        const double zo = origin[n][t][2];
+        // end point
+        const double xe = points[n][c][0];
+        const double ye = points[n][c][1];
+        const double ze = points[n][c][2];
+        // locate the voxel where the origin resides
+        const int vxo = int(xo);
+        const int vyo = int(yo);
+        const int vzo = int(zo);
+        // current voxel, starting at the origin
+        int vx = vxo;
+        int vy = vyo;
+        int vz = vzo;
+        // origin to end
+        const double rx = xe - xo;
+        const double ry = ye - yo;
+        const double rz = ze - zo;
+        double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+        // A zero-length (or NaN) ray has no direction: dividing by gt_d below
+        // would put NaN into the traversal and, through the chain rule, into
+        // grad_sigma. Treat such a ray like one that misses the grid.
+        if (!(gt_d > 0.0)) return;
+        // directional vector
+        const double dx = rx / gt_d;
+        const double dy = ry / gt_d;
+        const double dz = rz / gt_d;
+        // In which direction the voxel ids are incremented.
+        const int stepX = (dx >= 0) ? 1 : -1;
+        const int stepY = (dy >= 0) ? 1 : -1;
+        const int stepZ = (dz >= 0) ? 1 : -1;
+        // Position of the next voxel border along each axis.
+        const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+        const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+        const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+        // tMaxX, tMaxY, tMaxZ -- distance along the ray until the next
+        // intersection with a voxel border on that axis
+        double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX;
+        double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX;
+        double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX;
+        // tDeltaX, tDeltaY, tDeltaZ -- how far along the ray we must move to
+        // cross one voxel on that axis; DBL_MAX if we never move along it
+        const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+        const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+        const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+        int3 path[MAX_D];   // voxels visited inside the grid
+        double csd[MAX_D];  // cumulative sum of sigma times delta
+        double p[MAX_D];    // alpha
+        double d[MAX_D];    // ray distance when leaving each voxel
+        double dt[MAX_D];   // ray length spent inside each voxel
+        // forward raymarching with voxel traversal
+        int step = 0;  // total number of voxels traversed
+        int count = 0;  // number of voxels traversed inside the voxel grid
+        double last_d = 0.0;  // correct initialization
+        // voxel traversal raycasting
+        bool was_inside = false;
+        while (true) {
+            bool inside = (0 <= vx && vx < vxsize) &&
+                (0 <= vy && vy < vysize) &&
+                (0 <= vz && vz < vzsize);
+            if (inside) { // now inside
+                was_inside = true;
+                path[count] = make_int3(vx, vy, vz);
+            } else if (was_inside) { // was inside but no longer
+                // we know we are not coming back so terminate
+                break;
+            } else if (last_d > gt_d) {
+                // passed the end point without ever entering the grid
+                break;
+            }
+            // _d represents the ray distance has traveled before escaping the current voxel cell
+            double _d = 0.0;
+            // voxel traversal: advance along the axis whose border is closest
+            if (tMaxX < tMaxY) {
+                if (tMaxX < tMaxZ) {
+                    _d = tMaxX;
+                    vx += stepX;
+                    tMaxX += tDeltaX;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            } else {
+                if (tMaxY < tMaxZ) {
+                    _d = tMaxY;
+                    vy += stepY;
+                    tMaxY += tDeltaY;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            }
+            if (inside) {
+                // get sigma at the current voxel
+                const int3 &v = path[count];  // use the recorded index
+                const double _sigma = sigma[n][ts][v.z][v.y][v.x];
+                const double _delta = max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                const double sd = _sigma * _delta;
+                if (count == 0) { // the first voxel inside
+                    csd[count] = sd;
+                    p[count] = 1 - exp(-sd);
+                } else {
+                    csd[count] = csd[count-1] + sd;
+                    p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                }
+                // record the traveled distance
+                d[count] = _d;
+                dt[count] = _delta;
+                // count the number of voxels we have escaped
+                count ++;
+            }
+            last_d = _d;
+            step ++;
+            // hard cap on the traversal length; it also bounds count, which
+            // can never exceed the number of steps taken
+            if (step > MAX_STEP) {
+                break;
+            }
+        }
+        // the total number of voxels visited should not exceed this number
+        assert(count <= MAX_D);
+        // WHEN THERE IS AN INTERSECTION BETWEEN THE RAY AND THE VOXEL GRID
+        if (count > 0) {
+            // compute the expected ray distance
+            double exp_d = 0.0;
+            for (int i = 0; i < count; i ++)
+                exp_d += p[i] * d[i];
+            // add an imaginary sample at the grid exit, weighted by the
+            // probability p_out that the ray escapes the voxel grid
+            double p_out = exp(-csd[count-1]);
+            double max_d = d[count-1];
+            exp_d += (p_out * max_d);
+            gt_d = min(gt_d, max_d);
+            // write the rendered ray distance (max_d)
+            pred_dist[n][c] = exp_d;
+            gt_dist[n][c] = gt_d;
+            /* backward raymarching */
+            double dd_dsigma[MAX_D];
+            for (int i = count - 1; i >= 0; i --) {
+                // NOTE: probably need to double check again
+                if (i == count - 1)
+                    dd_dsigma[i] = p_out * max_d;
+                else
+                    dd_dsigma[i] = dd_dsigma[i+1] - exp(-csd[i]) * (d[i+1] - d[i]);
+            }
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] *= dt[i];
+            // option 2: cap at the boundary
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] -= dt[i] * p_out * max_d;
+            // derivative of the loss with respect to the rendered distance
+            double dl_dd = 1.0;
+            if (loss_type == L1)
+                dl_dd = (exp_d >= gt_d) ? 1 : -1;
+            else if (loss_type == L2)
+                dl_dd = (exp_d - gt_d);
+            else if (loss_type == ABSREL)
+                dl_dd = (exp_d >= gt_d) ? (1.0/gt_d) : -(1.0/gt_d);
+            // apply chain rule
+            for (int i = 0; i < count; i ++) {
+                const int3 &v = path[i];
+                // Several rays (threads) may pass through the same voxel, so a
+                // plain "+=" would lose updates; accumulate atomically.
+                atomicAdd(&grad_sigma[n][ts][v.z][v.y][v.x],
+                          static_cast<scalar_t>(dl_dd * dd_dsigma[i]));
+            }
+        }
+    }
+}
+/*
+ * Launches render_cuda_kernel with one thread per ray
+ * (ceil(M / 1024) x N blocks of 1024 threads) and waits for it to finish.
+ *
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin     : N x T x 3
+ *   points     : N x M x 4
+ *   tindex     : N x M
+ *   loss_name  : "l1", "l2", "absrel" or "bce" ("bce" is handled as L1);
+ *                any other value raises an error
+ * output (in this order), all created with sigma's dtype and device
+ *   pred_dist  : N x M, initialised to -1
+ *   gt_dist    : N x M, initialised to -1
+ *   grad_sigma : N x T x H x L x W, initialised to 0
+ */
+std::vector<torch::Tensor> render_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    std::string loss_name) {
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+    // Resolve the loss before allocating anything. TORCH_CHECK raises a
+    // catchable error instead of exit(1) terminating the whole host process.
+    LossType loss_type = L1;
+    if (loss_name.compare("l1") == 0) {
+        loss_type = L1;
+    } else if (loss_name.compare("l2") == 0) {
+        loss_type = L2;
+    } else if (loss_name.compare("absrel") == 0) {
+        loss_type = ABSREL;
+    } else if (loss_name.compare("bce") == 0) {
+        // "bce" reuses the L1 gradient inside the kernel
+        loss_type = L1;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN LOSS TYPE: ", loss_name);
+    }
+    // The kernel is instantiated for sigma's scalar type, so the outputs must
+    // use the same dtype; otherwise packed_accessor32<scalar_t> rejects them
+    // (e.g. a double sigma with default float outputs).
+    const auto options = torch::TensorOptions()
+                             .dtype(sigma.dtype())
+                             .device(sigma.device())
+                             .requires_grad(false);
+    auto gt_dist = -torch::ones({N, M}, options);
+    auto pred_dist = -torch::ones({N, M}, options);
+    auto grad_sigma = torch::zeros_like(sigma);
+    // Nothing to render: a grid with a zero dimension is an invalid launch.
+    if (N == 0 || M == 0) {
+        return {pred_dist, gt_dist, grad_sigma};
+    }
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+    // perform rendering
+    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_cuda", ([&] {
+        hipLaunchKernelGGL((render_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
+            sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+            origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+            points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+            tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+            pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+            gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+            grad_sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+            loss_type);
+    }));
+    // Launch-configuration errors are only visible through hipGetLastError().
+    const hipError_t launch_err = hipGetLastError();
+    TORCH_CHECK(launch_err == hipSuccess,
+                "render_cuda: kernel launch failed: ", hipGetErrorString(launch_err));
+    // Faults raised while the kernel runs surface at the next synchronisation.
+    const hipError_t sync_err = hipDeviceSynchronize();
+    TORCH_CHECK(sync_err == hipSuccess,
+                "render_cuda: kernel execution failed: ", hipGetErrorString(sync_err));
+    return {pred_dist, gt_dist, grad_sigma};
+}
+/*
+ * Builds an occupancy volume by launching init_cuda_kernel with one thread
+ * per point (ceil(M / 1024) x N blocks of 1024 threads).
+ *
+ * input shape
+ *   points   : N x M x 3
+ *   tindex   : N x M
+ *   grid     : at least 4 entries, read as {T, H, L, W}
+ * output shape
+ *   occupancy: N x T x H x L x W, created with points' dtype and device
+ */
+torch::Tensor init_cuda(
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid) {
+    // Indexing a shorter vector below would be undefined behaviour.
+    TORCH_CHECK(grid.size() >= 4,
+                "init_cuda: grid must hold at least 4 entries {T, H, L, W}, got ", grid.size());
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+    const auto dtype = points.dtype();
+    const auto device = points.device();
+    const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
+    auto occupancy = torch::zeros({N, T, H, L, W}, options);
+    // No points: a grid with a zero dimension is an invalid launch, and the
+    // all-zero volume is already the correct answer.
+    if (N == 0 || M == 0) {
+        return occupancy;
+    }
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+    // initialize occupancy such that every voxel with one or more points is occupied
+    AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "init_cuda", ([&] {
+        hipLaunchKernelGGL((init_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
+            points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+            tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+            occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>());
+    }));
+    // Launch-configuration errors are only visible through hipGetLastError().
+    const hipError_t launch_err = hipGetLastError();
+    TORCH_CHECK(launch_err == hipSuccess,
+                "init_cuda: kernel launch failed: ", hipGetErrorString(launch_err));
+    // synchronize; faults raised while the kernel runs surface here
+    const hipError_t sync_err = hipDeviceSynchronize();
+    TORCH_CHECK(sync_err == hipSuccess,
+                "init_cuda: kernel execution failed: ", hipGetErrorString(sync_err));
+    return occupancy;
+}
\ No newline at end of file
--- a/mmdetection3d @ c9541b0d
+++ b/mmdetection3d @ c9541b0d
+Subproject commit c9541b0db89498fdea5cafd05b7b17f7b625b858
--- a/docker-hub/FlashOCC/Flashocc/projects/__init__.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/__init__.py
--- a/docker-hub/FlashOCC/Flashocc/projects/__pycache__/__init__.cpython-310.pyc
+++ b/docker-hub/FlashOCC/Flashocc/projects/__pycache__/__init__.cpython-310.pyc
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',  # base configs this file extends: nuScenes dataset ...
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']  # ... and default runtime settings
+plugin = True  # presumably makes the launcher import the package in plugin_dir -- confirm against the train/test script
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- TODO confirm
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {  # image loading / augmentation settings, passed to PrepareImageInputs below
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,  # equals the number of entries in 'cams'
+    'input_size': (256, 704),  # presumably (height, width) of the network input -- confirm
+    'src_size': (900, 1600),  # presumably (height, width) of the raw camera image -- confirm
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {  # passed to the view transformer and to PointToMultiViewDepth below
+    'x': [-40, 40, 0.4],  # presumably [lower, upper, step] -- confirm; same for the entries below
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 0.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+voxel_size = [0.1, 0.1, 0.2]
+numC_Trans = 32  # base channel width reused throughout the model definition
+multi_adj_frame_id_cfg = (1, 1+1, 1)  # unpacked as range(1, 2, 1) in the model below, i.e. a single entry
+model = dict(  # BEVStereo4DOCC model definition
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,  # NOTE(review): "transfromation" spelling presumably matches the model's argument name -- check before renaming
+    num_adj=len(range(*multi_adj_frame_id_cfg)),  # evaluates to 1 for the tuple (1, 2, 1)
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 2, 3),  # NOTE(review): three outputs, while img_neck lists two in_channels; presumably index 0 feeds the stereo branch -- confirm
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,  # same value as img_neck out_channels
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),  # 32 * (1 + 1) = 64
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        num_channels=[numC_Trans, numC_Trans*2, numC_Trans*4],  # 32, 64, 128
+        stride=[1, 2, 2],
+        backbone_output_ids=[0, 1, 2]),
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,  # 7 = 1 + 2 + 4, the sum of the backbone num_channels multipliers
+                              out_channels=numC_Trans),
+    pre_process=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        with_cp=False,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        num_classes=18,  # the results at the end of this file list 17 named classes; presumably the 18th is "free" -- confirm
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Dataset type, data location, and the train / test loading pipelines
+dataset_type = 'NuScenesDatasetOccpancy'  # NOTE(review): "Occpancy" spelling presumably matches the registered dataset name -- check before renaming
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(  # augmentation settings passed to LoadAnnotationsBEVDepth
+    rot_lim=(-0., 0.),  # both bounds are 0
+    scale_lim=(1., 1.),  # both bounds are 1
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])  # keys collected for training
+]
+test_pipeline = [  # unlike train_pipeline: no LoadOccGTFromFile and no PointToMultiViewDepth step
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(  # only the camera flag is enabled
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(  # settings merged into every split by the loop at the end of this section
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+test_data_config = dict(  # used for both the val and the test split below
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:  # val and test are the same dict object, so it is updated twice with identical values
+    data[key].update(share_data_config)  # shared keys overwrite any same-named key already set on the split
+# Optimizer, learning-rate schedule, hooks, checkpointing and evaluation
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # the only step listed equals max_epochs below
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+load_from = "ckpts/bevdet-r50-4d-stereo-cbgs.pth"  # checkpoint path, relative -- presumably resolved from the working directory
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+# Recorded evaluation results for this config (with_pretrain):
+# align_after_view_transfromation=False
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 8.22
+# ===> barrier - IoU = 44.21
+# ===> bicycle - IoU = 10.34
+# ===> bus - IoU = 42.08
+# ===> car - IoU = 49.63
+# ===> construction_vehicle - IoU = 23.37
+# ===> motorcycle - IoU = 17.41
+# ===> pedestrian - IoU = 21.49
+# ===> traffic_cone - IoU = 19.7
+# ===> trailer - IoU = 31.33
+# ===> truck - IoU = 37.09
+# ===> driveable_surface - IoU = 80.13
+# ===> other_flat - IoU = 37.37
+# ===> sidewalk - IoU = 50.41
+# ===> terrain - IoU = 54.29
+# ===> manmade - IoU = 45.56
+# ===> vegetation - IoU = 39.59
+# ===> mIoU of 6019 samples: 36.01
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-r50.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {  # assumes each entry is [lower, upper, step] — TODO confirm against the view transformer
+    'x': [-40, 40, 0.4],  # (40 - (-40)) / 0.4 = 200 steps under that assumption
+    'y': [-40, 40, 0.4],  # same extent and step as x
+    'z': [-1, 5.4, 0.4],  # (5.4 - (-1)) / 0.4 = 16 steps under that assumption
+    'depth': [1.0, 45.0, 0.5],  # (45.0 - 1.0) / 0.5 = 88 steps under that assumption
+}
+voxel_size = [0.1, 0.1, 0.2]  # not referenced again in this config file
+numC_Trans = 32  # base channel width; reused by the model sub-configs below
+model = dict(
+    type='BEVDetOCC',  # looked up by name; presumably registered by the plugin_dir package — confirm
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),  # two backbone outputs are passed on, matching the two entries of img_neck.in_channels
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,  # presumably activation checkpointing — confirm
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,  # same value as img_view_transformer.in_channels below
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,  # the flashocc-* configs in this commit set this to True instead
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        num_channels=[numC_Trans, numC_Trans*2, numC_Trans*4],  # 32, 64, 128 with numC_Trans = 32
+        stride=[1, 2, 2],
+        backbone_output_ids=[0, 1, 2]),
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,  # 7 = 1 + 2 + 4, i.e. the sum of the three backbone widths above
+                              out_channels=numC_Trans),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        in_dim=numC_Trans,  # same width as the neck's out_channels
+        out_dim=32,
+        use_mask=True,
+        num_classes=18,  # the result tables below list 17 named classes; the 18th is presumably "free" — confirm
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'  # NOTE(review): spelling is presumably the registered class name — do not "correct" it without renaming the class
+data_root = 'data/nuscenes/'  # prefix for the annotation .pkl paths built below
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(  # passed to LoadAnnotationsBEVDepth in both pipelines
+    rot_lim=(-0., 0.),  # zero-width range
+    scale_lim=(1., 1.),  # zero-width range around 1
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+test_pipeline = [  # transform configs for val/test; also passed to `evaluation` below
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),  # is_train is not passed here, unlike train_pipeline
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,  # single scale ratio and no flip are configured
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])  # no ground-truth keys are collected at test time
+        ])
+]
+input_modality = dict(  # only the camera modality is enabled
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(  # merged into data['train'], data['val'] and data['test'] by the loop further down
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,  # NOTE(review): model is the non-4D 'BEVDetOCC', and flashocc-r50-M0.py uses stereo=False with the same 'bevdet' prototype — confirm this flag is intended (or ignored) here
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+test_data_config = dict(  # shared by the val and test splits
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')  # ann_file already carries the data_root prefix
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,  # val and test reference the same dict object
+    test=test_data_config)
+for key in ['val', 'train', 'test']:  # 'val' and 'test' alias one dict, so it is updated twice with identical values
+    data[key].update(share_data_config)  # shared keys overwrite same-named keys already present (e.g. data_root, classes in train)
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))  # gradient clipping with max_norm=5, norm_type=2
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # the only milestone equals max_epochs below, so presumably no step decay is applied during training — confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',  # looked up by name at runtime; presumably an EMA-of-weights hook — confirm in the plugin
+        init_updates=10560,  # NOTE(review): meaning of this count is not visible here — confirm in the hook
+        priority='NORMAL',
+    ),
+]
+load_from = "ckpts/bevdet-r50-cbgs.pth"  # relative path of the checkpoint used for initialisation
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)  # reuses test_pipeline; presumably evaluates every epoch from epoch 20 on — confirm
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)  # presumably saves every epoch and keeps the 5 newest — confirm
+# with pretrain
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 6.65
+# ===> barrier - IoU = 36.97
+# ===> bicycle - IoU = 8.33
+# ===> bus - IoU = 38.69
+# ===> car - IoU = 44.46
+# ===> construction_vehicle - IoU = 15.21
+# ===> motorcycle - IoU = 13.67
+# ===> pedestrian - IoU = 16.39
+# ===> traffic_cone - IoU = 15.27
+# ===> trailer - IoU = 27.11
+# ===> truck - IoU = 31.04
+# ===> driveable_surface - IoU = 78.7
+# ===> other_flat - IoU = 36.45
+# ===> sidewalk - IoU = 48.27
+# ===> terrain - IoU = 51.68
+# ===> manmade - IoU = 36.82
+# ===> vegetation - IoU = 32.09
+# ===> mIoU of 6019 samples: 31.64
+# with det pretrain; use_mask=False; class_balance=True
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 4.36
+# ===> barrier - IoU = 28.87
+# ===> bicycle - IoU = 2.86
+# ===> bus - IoU = 29.27
+# ===> car - IoU = 32.45
+# ===> construction_vehicle - IoU = 11.05
+# ===> motorcycle - IoU = 12.82
+# ===> pedestrian - IoU = 10.11
+# ===> traffic_cone - IoU = 9.47
+# ===> trailer - IoU = 7.93
+# ===> truck - IoU = 21.58
+# ===> driveable_surface - IoU = 49.85
+# ===> other_flat - IoU = 25.5
+# ===> sidewalk - IoU = 26.78
+# ===> terrain - IoU = 21.14
+# ===> manmade - IoU = 5.76
+# ===> vegetation - IoU = 7.09
+# ===> mIoU of 6019 samples: 18.05
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
+# Copyright (c) Phigent Robotics. All rights reserved.
+# align_after_view_transfromation=True
+# align_after_view_transfromation=False
+# 1x/12epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 10.12
+# ===> barrier - IoU = 48.06
+# ===> bicycle - IoU = 0.0
+# ===> bus - IoU = 51.19
+# ===> car - IoU = 53.61
+# ===> construction_vehicle - IoU = 27.15
+# ===> motorcycle - IoU = 2.74
+# ===> pedestrian - IoU = 28.3
+# ===> traffic_cone - IoU = 23.33
+# ===> trailer - IoU = 36.24
+# ===> truck - IoU = 42.13
+# ===> driveable_surface - IoU = 81.77
+# ===> other_flat - IoU = 42.43
+# ===> sidewalk - IoU = 53.67
+# ===> terrain - IoU = 57.31
+# ===> manmade - IoU = 48.27
+# ===> vegetation - IoU = 43.31
+# ===> mIoU of 6019 samples: 38.21
+# 2x/24epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 12.15
+# ===> barrier - IoU = 49.63
+# ===> bicycle - IoU = 25.1
+# ===> bus - IoU = 52.02
+# ===> car - IoU = 54.46
+# ===> construction_vehicle - IoU = 27.87
+# ===> motorcycle - IoU = 27.99
+# ===> pedestrian - IoU = 28.94
+# ===> traffic_cone - IoU = 27.23
+# ===> trailer - IoU = 36.43
+# ===> truck - IoU = 42.22
+# ===> driveable_surface - IoU = 82.31
+# ===> other_flat - IoU = 43.29
+# ===> sidewalk - IoU = 54.62
+# ===> terrain - IoU = 57.9
+# ===> manmade - IoU = 48.61
+# ===> vegetation - IoU = 43.55
+# ===> mIoU of 6019 samples: 42.02
+# 3x/36epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 12.37
+# ===> barrier - IoU = 50.15
+# ===> bicycle - IoU = 26.97
+# ===> bus - IoU = 51.86
+# ===> car - IoU = 54.65
+# ===> construction_vehicle - IoU = 28.38
+# ===> motorcycle - IoU = 28.96
+# ===> pedestrian - IoU = 29.02
+# ===> traffic_cone - IoU = 28.28
+# ===> trailer - IoU = 37.05
+# ===> truck - IoU = 42.52
+# ===> driveable_surface - IoU = 82.55
+# ===> other_flat - IoU = 43.15
+# ===> sidewalk - IoU = 54.87
+# ===> terrain - IoU = 58.33
+# ===> manmade - IoU = 48.78
+# ===> vegetation - IoU = 43.79
+# ===> mIoU of 6019 samples: 42.45
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (512, 1408),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+# Model
+grid_config = {  # assumes each entry is [lower, upper, step] — TODO confirm against the view transformer
+    'x': [-40, 40, 0.4],  # (40 - (-40)) / 0.4 = 200 steps under that assumption
+    'y': [-40, 40, 0.4],  # same extent and step as x
+    'z': [-1, 5.4, 0.4],  # (5.4 - (-1)) / 0.4 = 16 steps under that assumption
+    'depth': [1.0, 45.0, 0.5],  # (45.0 - 1.0) / 0.5 = 88 steps under that assumption
+}
+voxel_size = [0.1, 0.1, 0.2]  # not referenced again in this config file
+numC_Trans = 32  # base channel width; reused by the model sub-configs below
+multi_adj_frame_id_cfg = (1, 1+1, 1)  # unpacked into range() below: range(1, 2, 1) -> [1], i.e. length 1
+model = dict(
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        strides=(4, 2, 2, 2),
+        out_indices=(2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.1,
+        use_abs_pos_embed=False,
+        return_stereo_feat=True,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=dict(type='LN', requires_grad=True),
+        pretrain_style='official',
+        output_missing_index_as_none=False),
+    img_neck=dict(
+        type='FPN_LSS',
+        in_channels=512 + 1024,
+        out_channels=512,
+        # with_cp=False,
+        extra_upsample=None,
+        input_feature_index=(0, 1),
+        scale_factor=2),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=512,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        num_channels=[numC_Trans,numC_Trans*2,numC_Trans*4],
+        stride=[1,2,2],
+        backbone_output_ids=[0,1,2]),
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,
+                              out_channels=numC_Trans),
+    pre_process=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        with_cp=False,
+        num_layer=[1,],
+        num_channels=[numC_Trans,],
+        stride=[1,],
+        backbone_output_ids=[0,]),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar','mask_camera'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=1,  # with 32 GPU
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+# Optimizer
+optimizer = dict(type='AdamW', lr=2e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24,])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+    dict(
+        type='SyncbnControlHook',
+        syncbn_start_epoch=0,
+    ),
+]
+load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-4d-stereo.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-4d-stereo.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams': 6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {  # assumes each entry is [lower, upper, step] — TODO confirm against the view transformer
+    'x': [-40, 40, 0.4],  # (40 - (-40)) / 0.4 = 200 steps under that assumption
+    'y': [-40, 40, 0.4],  # same extent and step as x
+    'z': [-1, 5.4, 6.4],  # step equals the full extent (5.4 - (-1) = 6.4), i.e. a single step under that assumption
+    'depth': [1.0, 45.0, 0.5],  # (45.0 - 1.0) / 0.5 = 88 steps under that assumption
+}
+voxel_size = [0.1, 0.1, 0.2]  # not referenced again in this config file
+numC_Trans = 80  # base channel width; reused by the model sub-configs below
+multi_adj_frame_id_cfg = (1, 1+1, 1)  # unpacked into range() below: range(1, 2, 1) -> [1], i.e. length 1
+model = dict(
+    type='BEVStereo4DOCC',  # looked up by name; presumably registered by the plugin_dir package — confirm
+    align_after_view_transfromation=False,
+    num_adj=len(range(*multi_adj_frame_id_cfg)),  # len(range(1, 2, 1)) = 1
+    img_backbone=dict(
+        pretrained='torchvision://resnet50',
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 2, 3),  # three outputs here but img_neck takes two in_channels; index 0 is presumably consumed by the stereo branch — confirm
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,  # presumably activation checkpointing — confirm
+        style='pytorch'),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,  # same value as img_view_transformer.in_channels below
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=True,  # NOTE(review): every other config in this commit sets sid=False, and collapse_z is not set here although flashocc-r50*.py pass collapse_z=True — confirm both are intentional
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),  # 80 * (1 + 1) = 160
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),  # 160, 320, 640
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,  # 640 + 160 = 800: sum of the last and first backbone widths above
+        out_channels=256),  # same value as occ_head.in_dim below
+    pre_process=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,
+        out_dim=256,
+        Dz=16,  # presumably the number of height levels predicted per BEV cell — confirm in the head
+        use_mask=True,
+        num_classes=18,  # the result table below lists 17 named classes; the 18th is presumably "free" — confirm
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+load_from = "./ckpts/bevdet-r50-4d-stereo-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+# with_pretrain:
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 9.08
+# ===> barrier - IoU = 46.32
+# ===> bicycle - IoU = 17.71
+# ===> bus - IoU = 42.7
+# ===> car - IoU = 50.64
+# ===> construction_vehicle - IoU = 23.72
+# ===> motorcycle - IoU = 20.13
+# ===> pedestrian - IoU = 22.34
+# ===> traffic_cone - IoU = 24.09
+# ===> trailer - IoU = 30.26
+# ===> truck - IoU = 37.39
+# ===> driveable_surface - IoU = 81.68
+# ===> other_flat - IoU = 40.13
+# ===> sidewalk - IoU = 52.34
+# ===> terrain - IoU = 56.46
+# ===> manmade - IoU = 47.69
+# ===> vegetation - IoU = 40.6
+# ===> mIoU of 6019 samples: 37.84
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0-trt.py
+_base_ = ['./flashocc-r50-M0.py',
+          ]
+model = dict(
+    wocc=True,
+    wdet3d=False,
+)
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-M0.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {  # assumes each entry is [lower, upper, step] — TODO confirm against the view transformer
+    'x': [-40, 40, 0.4],  # (40 - (-40)) / 0.4 = 200 steps under that assumption
+    'y': [-40, 40, 0.4],  # same extent and step as x
+    'z': [-1, 5.4, 6.4],  # step equals the full extent (5.4 - (-1) = 6.4), i.e. a single step under that assumption
+    'depth': [1.0, 45.0, 1.0],  # (45.0 - 1.0) / 1.0 = 44 steps under that assumption; flashocc-r50.py uses a 0.5 step instead
+}
+voxel_size = [0.1, 0.1, 0.2]  # not referenced again in this config file
+numC_Trans = 64  # base channel width; reused by the model sub-configs below
+model = dict(
+    type='BEVDetOCC',  # looked up by name; presumably registered by the plugin_dir package — confirm
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),  # two backbone outputs are passed on, matching the two entries of img_neck.in_channels
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,  # presumably activation checkpointing — confirm
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,  # same value as img_view_transformer.in_channels below
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,  # goes together with the single-step 'z' entry in grid_config
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),  # 128, 256, 512
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,  # 512 + 128 = 640: sum of the last and first backbone widths above
+        out_channels=128),  # same value as occ_head.in_dim below
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=128,
+        out_dim=128,  # flashocc-r50.py uses 256 here and notes "out_dim=128 for M0"
+        Dz=16,  # presumably the number of height levels predicted per BEV cell — confirm in the head
+        use_mask=True,
+        num_classes=18,  # matches the 18-entry mIoU array in the results comment at the end of this file
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=False,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+load_from = "ckpts/bevdet-r50-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
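+# with det pretrain; use_mask=True
+# NOTE(review): this header originally read "out_dim=256", but occ_head in this file sets out_dim=128 — confirm which setting produced the numbers below.
+# ===> per class IoU of 6019 samples: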
+# ===> others - IoU = 6.21
+# ===> barrier - IoU = 39.56
+# ===> bicycle - IoU = 11.27
+# ===> bus - IoU = 36.31
+# ===> car - IoU = 43.96
+# ===> construction_vehicle - IoU = 16.25
+# ===> motorcycle - IoU = 14.74
+# ===> pedestrian - IoU = 16.89
+# ===> traffic_cone - IoU = 15.76
+# ===> trailer - IoU = 28.56
+# ===> truck - IoU = 30.91
+# ===> driveable_surface - IoU = 78.16
+# ===> other_flat - IoU = 37.52
+# ===> sidewalk - IoU = 47.42
+# ===> terrain - IoU = 51.35
+# ===> manmade - IoU = 36.79
+# ===> vegetation - IoU = 31.42
+# ===> mIoU of 6019 samples: 31.95
+# {'mIoU': array([0.06207982, 0.39564533, 0.11270112, 0.36311426, 0.43955401,
+#        0.16252583, 0.14739984, 0.16885096, 0.15757262, 0.28564777,
+#        0.30909029, 0.7815907 , 0.37523904, 0.47420705, 0.51351759,
+#        0.36789645, 0.31420157, 0.87802724])}
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50-trt.py
+_base_ = ['./flashocc-r50.py',
+          ]
+model = dict(
+    wocc=True,
+    wdet3d=False,
+)
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-r50.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+voxel_size = [0.1, 0.1, 0.2]
+numC_Trans = 64
+model = dict(
+    type='BEVDetOCC',
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        #pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,
+        out_dim=256,    # out_dim=128 for M0!!!
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=False,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(  # per-split dataset settings; shared keys are merged in by the loop below
+    samples_per_gpu=24,
+    workers_per_gpu=24,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,  # val and test reference the same dict object
+    test=test_data_config)
+for key in ['val', 'train', 'test']:  # copy the shared dataset settings into every split
+    data[key].update(share_data_config)  # in-place update; 'val' and 'test' alias one dict, so it receives the same values twice
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+load_from = "ckpts/bevdet-r50-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+# with det pretrain; use_mask=True;
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 6.74
+# ===> barrier - IoU = 37.65
+# ===> bicycle - IoU = 10.26
+# ===> bus - IoU = 39.55
+# ===> car - IoU = 44.36
+# ===> construction_vehicle - IoU = 14.88
+# ===> motorcycle - IoU = 13.4
+# ===> pedestrian - IoU = 15.79
+# ===> traffic_cone - IoU = 15.38
+# ===> trailer - IoU = 27.44
+# ===> truck - IoU = 31.73
+# ===> driveable_surface - IoU = 78.82
+# ===> other_flat - IoU = 37.98
+# ===> sidewalk - IoU = 48.7
+# ===> terrain - IoU = 52.5
+# ===> manmade - IoU = 37.89
+# ===> vegetation - IoU = 32.24
+# ===> mIoU of 6019 samples: 32.08
+# with det pretrain; use_mask=False; class_balance=True
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 4.49
+# ===> barrier - IoU = 29.59
+# ===> bicycle - IoU = 7.38
+# ===> bus - IoU = 30.32
+# ===> car - IoU = 32.22
+# ===> construction_vehicle - IoU = 13.04
+# ===> motorcycle - IoU = 11.91
+# ===> pedestrian - IoU = 8.61
+# ===> traffic_cone - IoU = 8.11
+# ===> trailer - IoU = 7.66
+# ===> truck - IoU = 20.84
+# ===> driveable_surface - IoU = 48.59
+# ===> other_flat - IoU = 26.62
+# ===> sidewalk - IoU = 26.08
+# ===> terrain - IoU = 20.86
+# ===> manmade - IoU = 7.62
+# ===> vegetation - IoU = 7.14
+# ===> mIoU of 6019 samples: 18.3
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (512, 1408),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+# Model
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+voxel_size = [0.1, 0.1, 0.2]
+numC_Trans = 80
+multi_adj_frame_id_cfg = (1, 1+1, 1)
+model = dict(
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        strides=(4, 2, 2, 2),
+        out_indices=(2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.1,
+        use_abs_pos_embed=False,
+        return_stereo_feat=True,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=dict(type='LN', requires_grad=True),
+        pretrain_style='official',
+        output_missing_index_as_none=False),
+    img_neck=dict(
+        type='FPN_LSS',
+        in_channels=512 + 1024,
+        out_channels=512,
+        # with_cp=False,
+        extra_upsample=None,
+        input_feature_index=(0, 1),
+        scale_factor=2),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=512,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        with_cp=True,
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    pre_process=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,
+        out_dim=256,
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar','mask_camera'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+test_data_config = dict(
+    data_root=data_root,
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,  # with 32 GPU
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)  # NOTE(review): the file name ends in "1e-2" but lr is 1e-4 (weight_decay is 1e-2) - confirm which value the name refers to
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))  # gradient clipping settings: max_norm=5, norm_type=2
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # the only step milestone (24) equals max_epochs set on the next line
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+    dict(
+        type='SyncbnControlHook',
+        syncbn_start_epoch=0,
+    ),
+]
+evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=3)
+# load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
+resume_from="work_dirs/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2/epoch_5.pth"
+# fp16 = dict(loss_scale='dynamic')
+# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-2.py 4
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (512, 1408),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+# Model
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+voxel_size = [0.1, 0.1, 0.2]
+numC_Trans = 80
+multi_adj_frame_id_cfg = (1, 1+1, 1)
+model = dict(
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,
+    num_adj=len(range(*multi_adj_frame_id_cfg)),
+    img_backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        strides=(4, 2, 2, 2),
+        out_indices=(2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.1,
+        use_abs_pos_embed=False,
+        return_stereo_feat=True,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=dict(type='LN', requires_grad=True),
+        pretrain_style='official',
+        output_missing_index_as_none=False),
+    img_neck=dict(
+        type='FPN_LSS',
+        in_channels=512 + 1024,
+        out_channels=512,
+        # with_cp=False,
+        extra_upsample=None,
+        input_feature_index=(0, 1),
+        scale_factor=2),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=512,
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        with_cp=True,
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    pre_process=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,
+        out_dim=256,
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_wise=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar','mask_camera'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+test_data_config = dict(
+    data_root=data_root,
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,  # with 32 GPU
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+# Optimizer
+optimizer = dict(type='AdamW', lr=2e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+    dict(
+        type='SyncbnControlHook',
+        syncbn_start_epoch=0,
+    ),
+]
+evaluation = dict(interval=6, start=0, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=3)
+load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+# bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py 4
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 13.42
+# ===> barrier - IoU = 51.07
+# ===> bicycle - IoU = 27.68
+# ===> bus - IoU = 51.57
+# ===> car - IoU = 56.22
+# ===> construction_vehicle - IoU = 27.27
+# ===> motorcycle - IoU = 29.98
+# ===> pedestrian - IoU = 29.93
+# ===> traffic_cone - IoU = 29.8
+# ===> trailer - IoU = 37.77
+# ===> truck - IoU = 43.52
+# ===> driveable_surface - IoU = 83.81
+# ===> other_flat - IoU = 46.55
+# ===> sidewalk - IoU = 56.15
+# ===> terrain - IoU = 59.56
+# ===> manmade - IoU = 50.84
+# ===> vegetation - IoU = 44.67
+# ===> mIoU of 6019 samples: 43.52
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 13.31
+# ===> barrier - IoU = 51.62
+# ===> bicycle - IoU = 28.07
+# ===> bus - IoU = 50.91
+# ===> car - IoU = 55.69
+# ===> construction_vehicle - IoU = 27.46
+# ===> motorcycle - IoU = 31.05
+# ===> pedestrian - IoU = 29.98
+# ===> traffic_cone - IoU = 29.2
+# ===> trailer - IoU = 38.86
+# ===> truck - IoU = 43.68
+# ===> driveable_surface - IoU = 83.87
+# ===> other_flat - IoU = 45.63
+# ===> sidewalk - IoU = 56.33
+# ===> terrain - IoU = 59.01
+# ===> manmade - IoU = 50.63
+# ===> vegetation - IoU = 44.56
+# ===> mIoU of 6019 samples: 43.52
+# {'mIoU': array([0.13311691, 0.51617081, 0.28070517, 0.50911942, 0.55694228,
+#        0.27461342, 0.31050779, 0.29979125, 0.29204287, 0.38862984,
+#        0.43680049, 0.83872518, 0.45630227, 0.56327839, 0.59008883,
+#        0.50627122, 0.44564523, 0.90959399])}
\ No newline at end of file
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-pano.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+voxel_size = [0.1, 0.1, 0.2]
+numC_Trans = 80
+model = dict(
+    type='BEVDepthPano',     # single-frame
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVDepth',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        loss_depth_weight=1,
+        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=256),
+    aux_centerness_head=dict(
+        type='Centerness_Head',
+        task_specific_weight=[1, 1, 0, 0, 0],
+        in_channels=256,
+        tasks=[
+            dict(num_class=10, class_names=['car', 'truck',
+                                            'construction_vehicle',
+                                            'bus', 'trailer',
+                                            'barrier',
+                                            'motorcycle', 'bicycle',
+                                            'pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type='CenterPointBBoxCoder',
+            pc_range=point_cloud_range[:2],
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.3, # 
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    occ_head=dict(
+        type='BEVOCCHead2D_V2',
+        in_dim=256,
+        out_dim=256,
+        Dz=16,
+        use_mask=False,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=True,
+        loss_occ=dict(
+            type='CustomFocalLoss',
+            use_sigmoid=True,
+            loss_weight=1.0
+        ),
+    ),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            point_cloud_range=point_cloud_range,
+            grid_size=[800, 800, 40],
+            voxel_size=voxel_size,
+            out_size_factor=4,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            pre_max_size=1000,
+            post_max_size=500,
+            # Scale-NMS
+            nms_type=['rotate'],
+            nms_thr=[0.2],
+            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
+                                 1.1, 1.0, 1.0, 1.5, 3.5]]
+        )
+    ),
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=False,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:
+    data[key].update(share_data_config)
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # single milestone equal to max_epochs below; presumably no LR decay takes effect before training ends — TODO confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 10.21
+# ===> barrier - IoU = 42.14
+# ===> bicycle - IoU = 22.82
+# ===> bus - IoU = 40.13
+# ===> car - IoU = 42.86
+# ===> construction_vehicle - IoU = 20.69
+# ===> motorcycle - IoU = 24.58
+# ===> pedestrian - IoU = 23.7
+# ===> traffic_cone - IoU = 24.02
+# ===> trailer - IoU = 25.48
+# ===> truck - IoU = 30.9
+# ===> driveable_surface - IoU = 58.65
+# ===> other_flat - IoU = 32.04
+# ===> sidewalk - IoU = 34.27
+# ===> terrain - IoU = 31.12
+# ===> manmade - IoU = 18.26
+# ===> vegetation - IoU = 17.79
+# ===> mIoU of 6019 samples: 29.39
+# {'mIoU': array([0.102, 0.421, 0.228, 0.401, 0.429, 0.207, 0.246, 0.237, 0.24 ,
+#        0.255, 0.309, 0.586, 0.32 , 0.343, 0.311, 0.183, 0.178, 0.833])}
+# +----------------------+----------+----------+----------+
+# |     Class Names      | RayIoU@1 | RayIoU@2 | RayIoU@4 |
+# +----------------------+----------+----------+----------+
+# |        others        |  0.090   |  0.102   |  0.105   |
+# |       barrier        |  0.387   |  0.442   |  0.465   |
+# |       bicycle        |  0.218   |  0.257   |  0.265   |
+# |         bus          |  0.514   |  0.613   |  0.669   |
+# |         car          |  0.487   |  0.564   |  0.592   |
+# | construction_vehicle |  0.176   |  0.254   |  0.288   |
+# |      motorcycle      |  0.203   |  0.292   |  0.310   |
+# |      pedestrian      |  0.301   |  0.349   |  0.366   |
+# |     traffic_cone     |  0.280   |  0.313   |  0.321   |
+# |       trailer        |  0.227   |  0.313   |  0.390   |
+# |        truck         |  0.395   |  0.493   |  0.537   |
+# |  driveable_surface   |  0.534   |  0.618   |  0.708   |
+# |      other_flat      |  0.289   |  0.326   |  0.356   |
+# |       sidewalk       |  0.234   |  0.280   |  0.329   |
+# |       terrain        |  0.222   |  0.291   |  0.356   |
+# |       manmade        |  0.280   |  0.351   |  0.401   |
+# |      vegetation      |  0.176   |  0.273   |  0.359   |
+# +----------------------+----------+----------+----------+
+# |         MEAN         |  0.295   |  0.361   |  0.401   |
+# +----------------------+----------+----------+----------+
+# +----------------------+---------+---------+---------+
+# |     Class Names      | RayPQ@1 | RayPQ@2 | RayPQ@4 |
+# +----------------------+---------+---------+---------+
+# |        others        |  0.017  |  0.025  |  0.026  |
+# |       barrier        |  0.125  |  0.182  |  0.218  |
+# |       bicycle        |  0.051  |  0.072  |  0.076  |
+# |         bus          |  0.275  |  0.366  |  0.422  |
+# |         car          |  0.242  |  0.332  |  0.356  |
+# | construction_vehicle |  0.016  |  0.058  |  0.092  |
+# |      motorcycle      |  0.071  |  0.124  |  0.137  |
+# |      pedestrian      |  0.017  |  0.022  |  0.023  |
+# |     traffic_cone     |  0.032  |  0.040  |  0.044  |
+# |       trailer        |  0.035  |  0.055  |  0.063  |
+# |        truck         |  0.145  |  0.232  |  0.282  |
+# |  driveable_surface   |  0.410  |  0.537  |  0.665  |
+# |      other_flat      |  0.062  |  0.087  |  0.109  |
+# |       sidewalk       |  0.008  |  0.030  |  0.064  |
+# |       terrain        |  0.010  |  0.026  |  0.047  |
+# |       manmade        |  0.054  |  0.091  |  0.134  |
+# |      vegetation      |  0.003  |  0.022  |  0.092  |
+# +----------------------+---------+---------+---------+
+# |         MEAN         |  0.092  |  0.135  |  0.168  |
+# +----------------------+---------+---------+---------+
+# {'RayIoU': 0.35223182059688496, 'RayIoU@1': 0.29499743138394385, 'RayIoU@2': 0.3607063492639709, 'RayIoU@4': 0.4009916811427401, 
+#  'RayPQ': 0.13182524545677765, 'RayPQ@1': 0.09247682620339576, 'RayPQ@2': 0.1354024129684159, 'RayPQ@4': 0.16759649719852124}
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano-trt.py
+_base_ = ['./flashoccv2-r50-depth-tiny-pano.py',
+          ]
+model = dict(
+    wocc=True,
+    wdet3d=False,
+)
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny-pano.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+point_cloud_range = [-40.0, -40.0, -5.0, 40.0, 40.0, 3.0]  # reused below by bbox_coder (first two values) and by train_cfg.pts
+# For nuScenes we usually do 10-class detection
+class_names = [  # same ten names, in the same order, as the aux_centerness_head task below
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {  # passed to img_view_transformer and to the PointToMultiViewDepth pipeline step below
+    'x': [-40, 40, 0.4],  # presumably [lower, upper, step] — TODO confirm
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],  # third value equals upper - lower (5.4 - (-1) = 6.4)
+    'depth': [1.0, 45.0, 1.0],
+}
+voxel_size = [0.1, 0.1, 0.2]  # with point_cloud_range: 80 / 0.1 = 800 and 8 / 0.2 = 40, matching train_cfg grid_size=[800, 800, 40]
+numC_Trans = 64  # used for the view transformer out_channels and to derive the BEV encoder channel widths
+model = dict(
+    type='BEVDepthPano',     # single-frame
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVDepth',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        loss_depth_weight=1,
+        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=128),
+    aux_centerness_head=dict(
+        type='Centerness_Head',
+        task_specific_weight=[1, 1, 0, 0, 0],
+        in_channels=128,
+        tasks=[
+            dict(num_class=10, class_names=['car', 'truck',
+                                            'construction_vehicle',
+                                            'bus', 'trailer',
+                                            'barrier',
+                                            'motorcycle', 'bicycle',
+                                            'pedestrian', 'traffic_cone']),
+        ],
+        common_heads=dict(
+            reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
+        share_conv_channel=64,
+        bbox_coder=dict(
+            type='CenterPointBBoxCoder',
+            pc_range=point_cloud_range[:2],
+            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
+            max_num=500,
+            score_threshold=0.3, # 
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            code_size=9),
+        separate_head=dict(
+            type='SeparateHead', init_bias=-2.19, final_kernel=3),
+        loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
+        loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
+        norm_bbox=True),
+    occ_head=dict(
+        type='BEVOCCHead2D_V2',
+        in_dim=128,
+        out_dim=128,
+        Dz=16,
+        use_mask=False,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=True,
+        loss_occ=dict(
+            type='CustomFocalLoss',
+            use_sigmoid=True,
+            loss_weight=1.0
+        ),
+    ),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            point_cloud_range=point_cloud_range,
+            grid_size=[800, 800, 40],
+            voxel_size=voxel_size,
+            out_size_factor=4,
+            dense_reg=1,
+            gaussian_overlap=0.1,
+            max_objs=500,
+            min_radius=2,
+            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
+    test_cfg=dict(
+        pts=dict(
+            max_per_img=500,
+            max_pool_nms=False,
+            min_radius=[4, 12, 10, 1, 0.85, 0.175],
+            score_threshold=0.1,
+            out_size_factor=4,
+            voxel_size=voxel_size[:2],
+            pre_max_size=1000,
+            post_max_size=500,
+            # Scale-NMS
+            nms_type=['rotate'],
+            nms_thr=[0.2],
+            nms_rescale_factor=[[1.0, 0.7, 0.7, 0.4, 0.55,
+                                 1.1, 1.0, 1.0, 1.5, 3.5]]
+        )
+    ),
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=False,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:  # merge the shared dataset settings into each split's dict
+    data[key].update(share_data_config)  # 'val' and 'test' both reference test_data_config, so that dict is updated twice
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # single milestone equal to max_epochs below; presumably no LR decay takes effect before training ends — TODO confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 10.33
+# ===> barrier - IoU = 41.02
+# ===> bicycle - IoU = 22.16
+# ===> bus - IoU = 39.75
+# ===> car - IoU = 42.63
+# ===> construction_vehicle - IoU = 20.53
+# ===> motorcycle - IoU = 24.01
+# ===> pedestrian - IoU = 23.71
+# ===> traffic_cone - IoU = 24.65
+# ===> trailer - IoU = 25.58
+# ===> truck - IoU = 30.63
+# ===> driveable_surface - IoU = 58.0
+# ===> other_flat - IoU = 32.12
+# ===> sidewalk - IoU = 33.78
+# ===> terrain - IoU = 31.02
+# ===> manmade - IoU = 17.67
+# ===> vegetation - IoU = 17.74
+# ===> mIoU of 6019 samples: 29.14
+# {'mIoU': array([0.103, 0.41 , 0.222, 0.397, 0.426, 0.205, 0.24 , 0.237, 0.246,
+#        0.256, 0.306, 0.58 , 0.321, 0.338, 0.31 , 0.177, 0.177, 0.832])}
+# +----------------------+----------+----------+----------+
+# |     Class Names      | RayIoU@1 | RayIoU@2 | RayIoU@4 |
+# +----------------------+----------+----------+----------+
+# |        others        |  0.095   |  0.107   |  0.110   |
+# |       barrier        |  0.374   |  0.429   |  0.452   |
+# |       bicycle        |  0.208   |  0.242   |  0.248   |
+# |         bus          |  0.498   |  0.603   |  0.659   |
+# |         car          |  0.489   |  0.568   |  0.598   |
+# | construction_vehicle |  0.171   |  0.247   |  0.279   |
+# |      motorcycle      |  0.190   |  0.277   |  0.298   |
+# |      pedestrian      |  0.295   |  0.344   |  0.361   |
+# |     traffic_cone     |  0.290   |  0.324   |  0.332   |
+# |       trailer        |  0.207   |  0.292   |  0.368   |
+# |        truck         |  0.411   |  0.507   |  0.551   |
+# |  driveable_surface   |  0.531   |  0.614   |  0.704   |
+# |      other_flat      |  0.286   |  0.325   |  0.357   |
+# |       sidewalk       |  0.234   |  0.280   |  0.328   |
+# |       terrain        |  0.220   |  0.290   |  0.356   |
+# |       manmade        |  0.267   |  0.343   |  0.392   |
+# |      vegetation      |  0.174   |  0.272   |  0.358   |
+# +----------------------+----------+----------+----------+
+# |         MEAN         |  0.291   |  0.357   |  0.397   |
+# +----------------------+----------+----------+----------+
+# 6019it [09:34, 10.48it/s]
+# +----------------------+---------+---------+---------+
+# |     Class Names      | RayPQ@1 | RayPQ@2 | RayPQ@4 |
+# +----------------------+---------+---------+---------+
+# |        others        |  0.017  |  0.024  |  0.025  |
+# |       barrier        |  0.107  |  0.169  |  0.204  |
+# |       bicycle        |  0.069  |  0.086  |  0.088  |
+# |         bus          |  0.244  |  0.350  |  0.408  |
+# |         car          |  0.238  |  0.326  |  0.352  |
+# | construction_vehicle |  0.018  |  0.081  |  0.105  |
+# |      motorcycle      |  0.061  |  0.105  |  0.117  |
+# |      pedestrian      |  0.016  |  0.022  |  0.023  |
+# |     traffic_cone     |  0.030  |  0.049  |  0.052  |
+# |       trailer        |  0.029  |  0.047  |  0.056  |
+# |        truck         |  0.151  |  0.240  |  0.286  |
+# |  driveable_surface   |  0.407  |  0.531  |  0.662  |
+# |      other_flat      |  0.054  |  0.078  |  0.098  |
+# |       sidewalk       |  0.009  |  0.030  |  0.061  |
+# |       terrain        |  0.006  |  0.022  |  0.045  |
+# |       manmade        |  0.044  |  0.091  |  0.128  |
+# |      vegetation      |  0.001  |  0.021  |  0.091  |
+# +----------------------+---------+---------+---------+
+# |         MEAN         |  0.088  |  0.134  |  0.165  |
+# +----------------------+---------+---------+---------+
+# {'RayIoU': 0.34819957391233375, 'RayIoU@1': 0.29065973127346445, 'RayIoU@2': 0.3566749015912661, 'RayIoU@4': 0.39726408887227066, 
+#  'RayPQ': 0.12890890185841564, 'RayPQ@1': 0.08832135839934552, 'RayPQ@2': 0.1336058084882046, 'RayPQ@4': 0.1647995386876968}
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-tiny.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # NOTE(review): no other visible line of this config reads it; grid_config x/y below spans [-40, 40]
+# For nuScenes we usually do 10-class detection
+class_names = [  # passed to the annotation-loading and format-bundle pipeline steps and to the dataset 'classes' field
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+data_config = {
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,
+    'input_size': (256, 704),
+    'src_size': (900, 1600),
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+grid_config = {  # passed to img_view_transformer and to the PointToMultiViewDepth pipeline step below
+    'x': [-40, 40, 0.4],  # presumably [lower, upper, step] — TODO confirm
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],  # third value equals upper - lower (5.4 - (-1) = 6.4)
+    'depth': [1.0, 45.0, 1.0],
+}
+voxel_size = [0.1, 0.1, 0.2]  # NOTE(review): no other visible line of this config reads it
+numC_Trans = 64  # used for the view transformer out_channels and to derive the BEV encoder channel widths
+model = dict(
+    type='BEVDepthOCC',     # single-frame
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVDepth',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,
+        out_channels=numC_Trans,
+        loss_depth_weight=1,
+        depthnet_cfg=dict(use_dcn=False, aspp_mid_channels=96),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,
+        out_channels=128),
+    occ_head=dict(
+        type='BEVOCCHead2D_V2',
+        in_dim=128,
+        out_dim=128,
+        Dz=16,
+        use_mask=False,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=True,
+        loss_occ=dict(
+            type='CustomFocalLoss',
+            use_sigmoid=True,
+            loss_weight=1.0
+        ),
+    )
+)
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])
+]
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])
+        ])
+]
+input_modality = dict(
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+share_data_config = dict(
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=False,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+test_data_config = dict(
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+for key in ['val', 'train', 'test']:  # merge the shared dataset settings into each split's dict
+    data[key].update(share_data_config)  # 'val' and 'test' both reference test_data_config, so that dict is updated twice
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # single milestone equal to max_epochs below; presumably no LR decay takes effect before training ends — TODO confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+load_from = "ckpts/bevdet-r50-4d-depth-cbgs.pth"
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+# use_mask = False
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 10.69
+# ===> barrier - IoU = 39.67
+# ===> bicycle - IoU = 22.01
+# ===> bus - IoU = 39.99
+# ===> car - IoU = 40.46
+# ===> construction_vehicle - IoU = 20.44
+# ===> motorcycle - IoU = 24.52
+# ===> pedestrian - IoU = 22.5
+# ===> traffic_cone - IoU = 23.72
+# ===> trailer - IoU = 25.93
+# ===> truck - IoU = 29.75
+# ===> driveable_surface - IoU = 58.29
+# ===> other_flat - IoU = 31.46
+# ===> sidewalk - IoU = 33.92
+# ===> terrain - IoU = 31.25
+# ===> manmade - IoU = 17.46
+# ===> vegetation - IoU = 17.97
+# ===> mIoU of 6019 samples: 28.83
+# {'mIoU': array([0.1068576 , 0.3967071 , 0.220114  , 0.3998965 , 0.40462457,
+#        0.20442682, 0.24516316, 0.22497209, 0.23719173, 0.25925541,
+#        0.29754347, 0.58293305, 0.31458314, 0.33921965, 0.31254221,
+#        0.17456574, 0.17970859, 0.8315865 ])}
+# Starting Evaluation...
+# 6019it [10:23,  9.65it/s]
+# +----------------------+----------+----------+----------+
+# |     Class Names      | RayIoU@1 | RayIoU@2 | RayIoU@4 |
+# +----------------------+----------+----------+----------+
+# |        others        |  0.094   |  0.107   |  0.111   |
+# |       barrier        |  0.367   |  0.421   |  0.443   |
+# |       bicycle        |  0.209   |  0.251   |  0.261   |
+# |         bus          |  0.498   |  0.601   |  0.665   |
+# |         car          |  0.472   |  0.550   |  0.581   |
+# | construction_vehicle |  0.175   |  0.251   |  0.287   |
+# |      motorcycle      |  0.205   |  0.292   |  0.315   |
+# |      pedestrian      |  0.289   |  0.339   |  0.354   |
+# |     traffic_cone     |  0.276   |  0.302   |  0.314   |
+# |       trailer        |  0.203   |  0.289   |  0.380   |
+# |        truck         |  0.396   |  0.493   |  0.546   |
+# |  driveable_surface   |  0.528   |  0.611   |  0.702   |
+# |      other_flat      |  0.280   |  0.315   |  0.346   |
+# |       sidewalk       |  0.233   |  0.279   |  0.328   |
+# |       terrain        |  0.218   |  0.286   |  0.353   |
+# |       manmade        |  0.268   |  0.347   |  0.398   |
+# |      vegetation      |  0.174   |  0.272   |  0.358   |
+# +----------------------+----------+----------+----------+
+# |         MEAN         |  0.287   |  0.353   |  0.397   |
+# +----------------------+----------+----------+----------+
+# {'RayIoU': 0.34574739050176573, 'RayIoU@1': 0.2873820616941079, 'RayIoU@2': 0.3533573712072785,
+# 'RayIoU@4': 0.39650273860391083}
--- a/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
+++ b/docker-hub/FlashOCC/Flashocc/projects/configs/panoptic-flashocc/panoptic-flashocc-r50-depth-trt.py
+_base_ = ['./flashoccv2-r50-depth.py',  # NOTE(review): the configs visible in this directory are named 'panoptic-flashocc-*'; verify this base file exists
+          ]
+model = dict(  # overrides applied on top of the base config's model settings
+    wocc=True,  # presumably keeps the occupancy output — TODO confirm against the base model class
+    wdet3d=False,  # presumably disables the 3D-detection output — TODO confirm against the base model class
+)