init_0905

3b8d508a · lishj6 · e968ab0f · 3b8d508a · 3b8d508a · 3b8d508a
Commit 3b8d508a authored Sep 05, 2025 by lishj6 🏸
20 changed files
--- a/.gitignore
+++ b/.gitignore
+/ckpts/
+*.o
+*.so
+*.pyc
+work_dirs/*
+projects/mmdet3d_plugin.egg-info/
+projects/flashocc_plugin.egg-info/
+nuscenes
+ckpt
+projects/build
+mmdetection3d
+*debug.py
+launch.json
+ckpts
+mmdeploy
+mmdeploy_study
+data/
+*old*
+build-old
+ppl.cv
+.vscode
+*.npz
+
+vis/
+*.jpg
+*.png
\ No newline at end of file
--- a/LICENSE
+++ b/LICENSE
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright (c) 2024 Institute of Intelligent Control, Dalian University of Technology. All rights reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/doc/cmd.md
+++ b/doc/cmd.md
+# Training cmd
+
+## 1. FlashOcc
+```shell script
+bash tools/dist_train.sh projects/configs/flashocc/flashocc-r50-M0.py 4                             # 31.95
+bash tools/dist_train.sh projects/configs/flashocc/flashocc-r50.py 4                                # 32.08
+bash tools/dist_train.sh projects/configs/flashocc/flashocc-r50-4d-stereo.py 4                      # 37.84
+bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_1e-4.py 4 # 41.80
+bash tools/dist_train.sh projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408_4x4_2e-4.py 4 # 43.52
+```
+
+## 2. Panoptic-FlashOcc
+### for train
+```shell script
+conda activate FlashOcc
+exp_name=panoptic-flashocc-r50-depth-tiny-pano
+exp_name=panoptic-flashocc-r50-depth-pano
+exp_name=panoptic-flashocc-r50-depth4d-pano
+exp_name=panoptic-flashocc-r50-depth4d-longterm8f-pano
+bash tools/dist_train.sh \
+    projects/configs/panoptic-flashocc/${exp_name}.py \
+    4
+```
+
+### for test
+```shell script
+conda activate FlashOcc
+exp_name=panoptic-flashocc-r50-depth-tiny-pano
+exp_name=panoptic-flashocc-r50-depth-pano
+exp_name=panoptic-flashocc-r50-depth4d-pano
+exp_name=panoptic-flashocc-r50-depth4d-longterm8f-pano
+bash tools/dist_test.sh \
+    projects/configs/panoptic-flashocc/${exp_name}.py \
+    work_dirs/${exp_name}/epoch_24_ema.pth \
+    4 \
+    --eval ray-iou
+```
+
+### for vis
+```shell script
+exp_name=panoptic-flashocc-r50-depth-tiny-pano
+exp_name=panoptic-flashocc-r50-depth-pano
+exp_name=panoptic-flashocc-r50-depth4d-pano
+exp_name=panoptic-flashocc-r50-depth4d-longterm8f-pano
+python tools/vis_occ.py --config projects/configs/panoptic-flashocc/${exp_name}.py --weights work_dirs/${exp_name}/epoch_24_ema.pth --viz-dir vis/${exp_name} --draw-gt
+```
+
+### for test inference time
+```shell script
+conda activate FlashOcc
+source activate FlashOcc
+exp_name=panoptic-flashocc-r50-depth-tiny-pano
+exp_name=panoptic-flashocc-r50-depth-pano
+python tools/analysis_tools/benchmark.py \
+    projects/configs/panoptic-flashocc/${exp_name}.py \
+    work_dirs/${exp_name}/epoch_24_ema.pth \
+    --w_pano --w_panoproc
+
+exp_name=panoptic-flashocc-r50-depth4d-pano
+exp_name=panoptic-flashocc-r50-depth4d-longterm8f-pano
+python tools/analysis_tools/benchmark_sequential.py \
+    projects/configs/panoptic-flashocc/${exp_name}.py \
+    work_dirs/${exp_name}/epoch_24_ema.pth \
+    --w_pano --w_panoproc
+
+```
--- a/doc/install.md
+++ b/doc/install.md
+## Environment Setup
+step 1. Install environment for pytorch training
+```
+conda create --name FlashOcc python=3.8.5
+conda activate FlashOcc
+pip install torch==1.10.0+cu111 torchvision==0.11.0+cu111 torchaudio==0.10.0 -f https://download.pytorch.org/whl/torch_stable.html
+pip install mmcv-full==1.5.3
+pip install mmdet==2.25.1
+pip install mmsegmentation==0.25.0
+
+sudo apt-get install python3-dev 
+sudo apt-get install libevent-dev
+sudo apt-get install build-essential
+export PATH=/usr/local/cuda/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+export CUDA_ROOT=/usr/local/cuda
+pip install pycuda
+
+pip install lyft_dataset_sdk
+pip install networkx==2.2
+pip install numba==0.53.0
+pip install numpy==1.23.5
+pip install nuscenes-devkit
+pip install plyfile
+pip install scikit-image
+pip install tensorboard
+pip install trimesh==2.35.39
+pip install setuptools==59.5.0
+pip install yapf==0.40.1
+
+cd Path_to_FlashOcc
+git clone git@github.com:Yzichen/FlashOCC.git FlashOcc
+
+cd Path_to_FlashOcc/FlashOcc
+git clone https://github.com/open-mmlab/mmdetection3d.git
+
+cd Path_to_FlashOcc/FlashOcc/mmdetection3d
+git checkout v1.0.0rc4
+pip install -v -e . 
+
+cd Path_to_FlashOcc/FlashOcc/projects
+pip install -v -e . 
+```
+
+step 3. Prepare nuScenes dataset as introduced in [nuscenes_det.md](nuscenes_det.md) and create the pkl for FlashOCC by running:
+```shell
+python tools/create_data_bevdet.py
+```
+Thus, the folder will be arranged as follows:
+```shell script
+└── Path_to_FlashOcc/
+    └── data
+        └── nuscenes
+            ├── v1.0-trainval (existing)
+            ├── sweeps  (existing)
+            ├── samples (existing)
+            ├── bevdetv2-nuscenes_infos_train.pkl (new)
+            └── bevdetv2-nuscenes_infos_val.pkl (new)
+```
+
+step 4. For Occupancy Prediction task, download (only) the 'gts' from [CVPR2023-3D-Occupancy-Prediction](https://github.com/CVPR2023-3D-Occupancy-Prediction/CVPR2023-3D-Occupancy-Prediction) and arrange the folder as:
+```shell script
+└── Path_to_FlashOcc/
+    └── data
+        └── nuscenes
+            ├── v1.0-trainval (existing)
+            ├── sweeps  (existing)
+            ├── samples (existing)
+            ├── gts (new)
+            ├── bevdetv2-nuscenes_infos_train.pkl (new)
+            └── bevdetv2-nuscenes_infos_val.pkl (new)
+```
+For panoptic occupancy, we follow the data setting in SparseOcc:
+
+(1) Download Occ3D-nuScenes occupancy GT from [gdrive](https://drive.google.com/file/d/1kiXVNSEi3UrNERPMz_CfiJXKkgts_5dY/view?usp=drive_link), unzip it, and save it to `data/nuscenes/occ3d`.
+
+(2) Generate the panoptic occupancy ground truth with `gen_instance_info.py`. The panoptic version of Occ3D will be saved to `data/nuscenes/occ3d_panoptic`.
+
+
+step 5. CKPTS Preparation
+(1) Download [flashocc-r50-256x704.pth](https://drive.google.com/file/d/1k9BzXB2nRyvXhqf7GQx3XNSej6Oq6I-B/view) to `Path_to_FlashOcc/FlashOcc/ckpts/`, then run:
+```shell script
+bash tools/dist_test.sh projects/configs/flashocc/flashocc-r50.py  ckpts/flashocc-r50-256x704.pth 4 --eval map
+```
+
+step 6. (Optional) Install mmdeploy for tensorrt testing
+```shell script
+conda activate FlashOcc
+pip install Cython==0.29.24
+
+### get tensorrt
+wget https://developer.download.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.0/tars/TensorRT-8.4.0.6.Linux.x86_64-gnu.cuda-11.6.cudnn8.3.tar.gz
+export TENSORRT_DIR=Path_to_TensorRT-8.4.0.6
+
+### get onnxruntime
+ONNXRUNTIME_VERSION=1.8.1
+pip install onnxruntime-gpu==${ONNXRUNTIME_VERSION}
+cd Path_to_your_onnxruntime
+wget https://github.com/microsoft/onnxruntime/releases/download/v${ONNXRUNTIME_VERSION}/onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz \
+     && tar -zxvf onnxruntime-linux-x64-${ONNXRUNTIME_VERSION}.tgz
+# export ONNXRUNTIME_DIR=/data01/shuchangyong/pkgs/onnxruntime-linux-x64-1.8.1
+export ONNXRUNTIME_DIR=Path_to_your_onnxruntime/onnxruntime-linux-x64-1.8.1
+cd Path_to_FlashOcc/FlashOcc/
+git clone git@github.com:drilistbox/mmdeploy.git
+cd Path_to_FlashOcc/FlashOcc/mmdeploy
+git submodule update --init --recursive
+mkdir -p build
+cd Path_to_FlashOcc/FlashOcc/mmdeploy/build
+cmake -DMMDEPLOY_TARGET_BACKENDS="ort;trt" ..
+make -j 16
+cd Path_to_FlashOcc/FlashOcc/mmdeploy
+pip install -e .
+
+### build sdk
+cd Path_to_pplcv/
+git clone https://github.com/openppl-public/ppl.cv.git
+cd Path_to_pplcv/ppl.cv
+export PPLCV_VERSION=0.7.0
+git checkout tags/v${PPLCV_VERSION} -b v${PPLCV_VERSION}
+./build.sh cuda
+
+#pip install nvidia-tensorrt==8.4.0.6
+pip install nvidia-tensorrt==8.4.1.5
+pip install tensorrt
+#pip install h5py
+pip install spconv==2.3.6
+
+export PATH=Path_to_TensorRT-8.4.0.6/bin:$PATH
+export LD_LIBRARY_PATH=Path_to_TensorRT-8.4.0.6/lib:$LD_LIBRARY_PATH
+export LIBRARY_PATH=Path_to_TensorRT-8.4.0.6/lib:$LIBRARY_PATH
+```
+
+## The final overall arrangement
+1. TensorRT
+```shell script
+└── Path_to_TensorRT-8.4.0.6
+    └── TensorRT-8.4.0.6
+```
+2. FlashOcc
+```shell script
+└── Path_to_FlashOcc/
+    └── data
+        └── nuscenes
+            ├── v1.0-trainval (existing)
+            ├── sweeps  (existing)
+            ├── samples (existing)
+            ├── gts (new)
+            ├── bevdetv2-nuscenes_infos_train.pkl (new)
+            └── bevdetv2-nuscenes_infos_val.pkl (new)
+    └── doc
+        ├── install.md
+        └── trt_test.md
+    ├── figs
+    ├── mmdeploy (new)
+    ├── mmdetection3d (new)
+    ├── projects
+    ├── requirements
+    ├── tools
+    └── README.md
+```
+3. ppl.cv
+```shell script
+└── Path_to_pplcv
+    └── ppl.cv
+```
--- a/doc/mmdeploy_test.md
+++ b/doc/mmdeploy_test.md
+# trt inference speed
+```shell
+conda activate FlashOcc
+# 1. cmd for M0
+exp_name=flashocc-r50-M0
+fold_name=flashocc
+config=projects/configs/${fold_name}/${exp_name}-trt.py
+checkpoint=ckpts/flashocc-r50-M0-256x704.pth
+work_dir=work_dirs/${exp_name}/onnx_trt/
+
+# 2. cmd for M1
+exp_name=flashocc-r50
+fold_name=flashocc
+config=projects/configs/${fold_name}/${exp_name}-trt.py
+checkpoint=ckpts/flashocc-r50-256x704.pth
+work_dir=work_dirs/${exp_name}/onnx_trt/
+
+
+# int8 test. 
+engine=work_dirs/${exp_name}/onnx_trt/bevdet_int8_fuse.engine
+python tools/convert_bevdet_to_TRT.py $config $checkpoint $work_dir --fuse-conv-bn --int8 --calib_num 256
+python tools/analysis_tools/benchmark_trt.py $config $engine --eval
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 4.58
+# ===> barrier - IoU = 34.13
+# ===> bicycle - IoU = 8.68
+# ===> bus - IoU = 34.9
+# ===> car - IoU = 40.48
+# ===> construction_vehicle - IoU = 15.99
+# ===> motorcycle - IoU = 15.49
+# ===> pedestrian - IoU = 13.58
+# ===> traffic_cone - IoU = 12.83
+# ===> trailer - IoU = 25.31
+# ===> truck - IoU = 28.08
+# ===> driveable_surface - IoU = 76.7
+# ===> other_flat - IoU = 31.5
+# ===> sidewalk - IoU = 45.01
+# ===> terrain - IoU = 49.63
+# ===> manmade - IoU = 35.72
+# ===> vegetation - IoU = 30.39
+# ===> mIoU of 6019 samples: 29.59
+
+# int8+fp16 test. 
+engine=work_dirs/${exp_name}/onnx_trt/bevdet_int8_fp16_fuse.engine
+python tools/convert_bevdet_to_TRT.py $config $checkpoint $work_dir --fuse-conv-bn --fp16 --int8 --calib_num 256
+python tools/analysis_tools/benchmark_trt.py $config $engine --eval
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 4.59
+# ===> barrier - IoU = 34.13
+# ===> bicycle - IoU = 8.71
+# ===> bus - IoU = 34.9
+# ===> car - IoU = 40.49
+# ===> construction_vehicle - IoU = 16.01
+# ===> motorcycle - IoU = 15.55
+# ===> pedestrian - IoU = 13.63
+# ===> traffic_cone - IoU = 12.86
+# ===> trailer - IoU = 25.33
+# ===> truck - IoU = 28.1
+# ===> driveable_surface - IoU = 76.7
+# ===> other_flat - IoU = 31.51
+# ===> sidewalk - IoU = 45.01
+# ===> terrain - IoU = 49.63
+# ===> manmade - IoU = 35.72
+# ===> vegetation - IoU = 30.39
+# ===> mIoU of 6019 samples: 29.6
+
+# fp16 test
+engine=work_dirs/${exp_name}/onnx_trt/bevdet_fp16_fuse.engine
+python tools/convert_bevdet_to_TRT.py $config $checkpoint $work_dir --fuse-conv-bn --fp16
+python tools/analysis_tools/benchmark_trt.py $config $engine
+python tools/analysis_tools/benchmark_trt.py $config $engine --eval
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 5.97
+# ===> barrier - IoU = 36.37
+# ===> bicycle - IoU = 10.14
+# ===> bus - IoU = 35.47
+# ===> car - IoU = 41.57
+# ===> construction_vehicle - IoU = 15.73
+# ===> motorcycle - IoU = 14.8
+# ===> pedestrian - IoU = 15.65
+# ===> traffic_cone - IoU = 14.46
+# ===> trailer - IoU = 27.47
+# ===> truck - IoU = 29.39
+# ===> driveable_surface - IoU = 77.14
+# ===> other_flat - IoU = 34.66
+# ===> sidewalk - IoU = 46.44
+# ===> terrain - IoU = 51.05
+# ===> manmade - IoU = 35.79
+# ===> vegetation - IoU = 31.19
+# ===> mIoU of 6019 samples: 30.78
+
+# fp32 test
+engine=work_dirs/${exp_name}/onnx_trt/bevdet_fuse.engine
+python tools/convert_bevdet_to_TRT.py $config $checkpoint $work_dir --fuse-conv-bn
+python tools/analysis_tools/benchmark_trt.py $config $engine
+python tools/analysis_tools/benchmark_trt.py $config $engine --eval
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 5.97
+# ===> barrier - IoU = 36.37
+# ===> bicycle - IoU = 10.15
+# ===> bus - IoU = 35.46
+# ===> car - IoU = 41.56
+# ===> construction_vehicle - IoU = 15.73
+# ===> motorcycle - IoU = 14.78
+# ===> pedestrian - IoU = 15.64
+# ===> traffic_cone - IoU = 14.44
+# ===> trailer - IoU = 27.46
+# ===> truck - IoU = 29.39
+# ===> driveable_surface - IoU = 77.14
+# ===> other_flat - IoU = 34.68
+# ===> sidewalk - IoU = 46.44
+# ===> terrain - IoU = 51.05
+# ===> manmade - IoU = 35.79
+# ===> vegetation - IoU = 31.18
+# ===> mIoU of 6019 samples: 30.78
+
+```
+
+
+3. cmd for flashoccv2
+```
+exp_name=flashoccv2-r50-depth
+fold_name=flashoccv2
+config=projects/configs/${fold_name}/${exp_name}-trt.py
+checkpoint=work_dirs/${exp_name}/epoch_24_ema.pth
+work_dir=work_dirs/${exp_name}/onnx_trt/
+
+# fp16 test
+engine=work_dirs/${exp_name}/onnx_trt/bevdet_fp16_fuse.engine
+python tools/convert_bevdet_to_TRT.py $config $checkpoint $work_dir --fuse-conv-bn --fp16
+python tools/analysis_tools/benchmark_trt.py $config $engine
+python tools/analysis_tools/benchmark_trt.py $config $engine --eval
+```
+
+# Flops and params
+```shell
+python tools/analysis_tools/get_flops.py projects/configs/bevdet_occ/bevdet-occ-r50.py --modality image --shape 256 704
+python tools/analysis_tools/get_flops.py projects/configs/flashocc/flashocc-r50-M0.py --modality image --shape 256 704
+python tools/analysis_tools/get_flops.py projects/configs/flashocc/flashocc-r50.py --modality image --shape 256 704
+python tools/analysis_tools/get_flops.py projects/configs/flashocc/flashocc-stbase-4d-stereo-512x1408.py --modality image --shape 512 1408
+python tools/analysis_tools/get_flops.py projects/configs/flashoccv2/flashoccv2-r50-depth.py --modality image --shape 256 704
+python tools/analysis_tools/get_flops.py projects/configs/flashoccv2/flashoccv2-r50-depth-tiny.py --modality image --shape 256 704
+```
\ No newline at end of file
--- a/doc/model_training.md
+++ b/doc/model_training.md
+
+#### Train model
+```shell
+# single gpu
+python tools/train.py $config
+# multiple gpu
+./tools/dist_train.sh $config num_gpu
+```
+
+#### Test model
+```shell
+# single gpu
+python tools/test.py $config $checkpoint --eval mAP
+# multiple gpu
+./tools/dist_test.sh $config $checkpoint num_gpu --eval mAP
+# ray-iou metric
+./tools/dist_test.sh $config $checkpoint num_gpu --eval ray-iou
+```
+
+#### FPS for Panoptic-FlashOcc
+```shell
+# for single-frame
+python tools/analysis_tools/benchmark.py  config ckpt 
+python tools/analysis_tools/benchmark.py  config ckpt --w_pano
+
+# for multi-frame
+python tools/analysis_tools/benchmark_sequential.py  config ckpt 
+python tools/analysis_tools/benchmark_sequential.py  config ckpt --w_pano
+```
--- a/doc/nuscenes_det.md
+++ b/doc/nuscenes_det.md
+# NuScenes Dataset for 3D Object Detection
+
+This page provides specific tutorials about the usage of MMDetection3D for nuScenes dataset.
+
+## Before Preparation
+
+You can download nuScenes 3D detection data [HERE](https://www.nuscenes.org/download) and unzip all zip files.
+
+Like the general way to prepare dataset, it is recommended to symlink the dataset root to `$MMDETECTION3D/data`.
+
+The folder structure should be organized as follows before our processing.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── v1.0-test
+│   │   ├── v1.0-trainval
+```
+
+## Dataset Preparation
+
+We typically need to organize the useful data information with a .pkl or .json file in a specific style, e.g., coco-style for organizing images and their annotations.
+To prepare these files for nuScenes, run the following command:
+
+```bash
+python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
+```
+
+The folder structure after processing should be as below.
+
+```
+mmdetection3d
+├── mmdet3d
+├── tools
+├── configs
+├── data
+│   ├── nuscenes
+│   │   ├── maps
+│   │   ├── samples
+│   │   ├── sweeps
+│   │   ├── v1.0-test
+│   │   ├── v1.0-trainval
+│   │   ├── nuscenes_database
+│   │   ├── nuscenes_infos_train.pkl
+│   │   ├── nuscenes_infos_val.pkl
+│   │   ├── nuscenes_infos_test.pkl
+│   │   ├── nuscenes_dbinfos_train.pkl
+│   │   ├── nuscenes_infos_train_mono3d.coco.json
+│   │   ├── nuscenes_infos_val_mono3d.coco.json
+│   │   ├── nuscenes_infos_test_mono3d.coco.json
+```
+
+Here, .pkl files are generally used for methods involving point clouds and coco-style .json files are more suitable for image-based methods, such as image-based 2D and 3D detection.
+Next, we will elaborate on the details recorded in these info files.
+
+- `nuscenes_database/xxxxx.bin`: point cloud data included in each 3D bounding box of the training dataset
+- `nuscenes_infos_train.pkl`: training dataset info, each frame info has two keys: `metadata` and `infos`.
+  `metadata` contains the basic information for the dataset itself, such as `{'version': 'v1.0-trainval'}`, while `infos` contains the detailed information as follows:
+  - info\['lidar_path'\]: The file path of the lidar point cloud data.
+  - info\['token'\]: Sample data token.
+  - info\['sweeps'\]: Sweeps information (`sweeps` in the nuScenes refer to the intermediate frames without annotations, while `samples` refer to those key frames with annotations).
+    - info\['sweeps'\]\[i\]\['data_path'\]: The data path of i-th sweep.
+    - info\['sweeps'\]\[i\]\['type'\]: The sweep data type, e.g., `'lidar'`.
+    - info\['sweeps'\]\[i\]\['sample_data_token'\]: The sweep sample data token.
+    - info\['sweeps'\]\[i\]\['sensor2ego_translation'\]: The translation from the current sensor (for collecting the sweep data) to ego vehicle. (1x3 list)
+    - info\['sweeps'\]\[i\]\['sensor2ego_rotation'\]: The rotation from the current sensor (for collecting the sweep data) to ego vehicle. (1x4 list in the quaternion format)
+    - info\['sweeps'\]\[i\]\['ego2global_translation'\]: The translation from the ego vehicle to global coordinates. (1x3 list)
+    - info\['sweeps'\]\[i\]\['ego2global_rotation'\]: The rotation from the ego vehicle to global coordinates. (1x4 list in the quaternion format)
+    - info\['sweeps'\]\[i\]\['timestamp'\]: Timestamp of the sweep data.
+    - info\['sweeps'\]\[i\]\['sensor2lidar_translation'\]: The translation from the current sensor (for collecting the sweep data) to lidar. (1x3 list)
+    - info\['sweeps'\]\[i\]\['sensor2lidar_rotation'\]: The rotation from the current sensor (for collecting the sweep data) to lidar. (1x4 list in the quaternion format)
+  - info\['cams'\]: Cameras calibration information. It contains six keys corresponding to each camera: `'CAM_FRONT'`, `'CAM_FRONT_RIGHT'`, `'CAM_FRONT_LEFT'`, `'CAM_BACK'`, `'CAM_BACK_LEFT'`, `'CAM_BACK_RIGHT'`.
+    Each dictionary contains detailed information following the above way for each sweep data (has the same keys for each information as above). In addition, each camera has a key `'cam_intrinsic'` for recording the intrinsic parameters when projecting 3D points to each image plane.
+  - info\['lidar2ego_translation'\]: The translation from lidar to ego vehicle. (1x3 list)
+  - info\['lidar2ego_rotation'\]: The rotation from lidar to ego vehicle. (1x4 list in the quaternion format)
+  - info\['ego2global_translation'\]: The translation from the ego vehicle to global coordinates. (1x3 list)
+  - info\['ego2global_rotation'\]: The rotation from the ego vehicle to global coordinates. (1x4 list in the quaternion format)
+  - info\['timestamp'\]: Timestamp of the sample data.
+  - info\['gt_boxes'\]: 7-DoF annotations of 3D bounding boxes, an Nx7 array.
+  - info\['gt_names'\]: Categories of 3D bounding boxes, a 1xN array.
+  - info\['gt_velocity'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), an Nx2 array.
+  - info\['num_lidar_pts'\]: Number of lidar points included in each 3D bounding box.
+  - info\['num_radar_pts'\]: Number of radar points included in each 3D bounding box.
+  - info\['valid_flag'\]: Whether each bounding box is valid. In general, we only take the 3D boxes that include at least one lidar or radar point as valid boxes.
+- `nuscenes_infos_train_mono3d.coco.json`: training dataset coco-style info. This file organizes image-based data into three categories (keys): `'categories'`, `'images'`, `'annotations'`.
+  - info\['categories'\]: A list containing all the category names. Each element follows the dictionary format and consists of two keys: `'id'` and `'name'`.
+  - info\['images'\]: A list containing all the image info.
+    - info\['images'\]\[i\]\['file_name'\]: The file name of the i-th image.
+    - info\['images'\]\[i\]\['id'\]: Sample data token of the i-th image.
+    - info\['images'\]\[i\]\['token'\]: Sample token corresponding to this frame.
+    - info\['images'\]\[i\]\['cam2ego_rotation'\]: The rotation from the camera to ego vehicle. (1x4 list in the quaternion format)
+    - info\['images'\]\[i\]\['cam2ego_translation'\]: The translation from the camera to ego vehicle. (1x3 list)
+    - info\['images'\]\[i\]\['ego2global_rotation'\]: The rotation from the ego vehicle to global coordinates. (1x4 list in the quaternion format)
+    - info\['images'\]\[i\]\['ego2global_translation'\]: The translation from the ego vehicle to global coordinates. (1x3 list)
+    - info\['images'\]\[i\]\['cam_intrinsic'\]: Camera intrinsic matrix. (3x3 list)
+    - info\['images'\]\[i\]\['width'\]: Image width, 1600 by default in nuScenes.
+    - info\['images'\]\[i\]\['height'\]: Image height, 900 by default in nuScenes.
+  - info\['annotations'\]: A list containing all the annotation info.
+    - info\['annotations'\]\[i\]\['file_name'\]: The file name of the corresponding image.
+    - info\['annotations'\]\[i\]\['image_id'\]: The image id (token) of the corresponding image.
+    - info\['annotations'\]\[i\]\['area'\]: Area of the 2D bounding box.
+    - info\['annotations'\]\[i\]\['category_name'\]: Category name.
+    - info\['annotations'\]\[i\]\['category_id'\]: Category id.
+    - info\['annotations'\]\[i\]\['bbox'\]: 2D bounding box annotation (exterior rectangle of the projected 3D box), 1x4 list following \[x1, y1, x2-x1, y2-y1\].
+      x1/y1 are minimum coordinates along horizontal/vertical direction of the image.
+    - info\['annotations'\]\[i\]\['iscrowd'\]: Whether the region is crowded. Defaults to 0.
+    - info\['annotations'\]\[i\]\['bbox_cam3d'\]: 3D bounding box (gravity) center location (3), size (3), (global) yaw angle (1), 1x7 list.
+    - info\['annotations'\]\[i\]\['velo_cam3d'\]: Velocities of 3D bounding boxes (no vertical measurements due to inaccuracy), an Nx2 array.
+    - info\['annotations'\]\[i\]\['center2d'\]: Projected 3D-center containing 2.5D information: projected center location on the image (2) and depth (1), 1x3 list.
+    - info\['annotations'\]\[i\]\['attribute_name'\]: Attribute name.
+    - info\['annotations'\]\[i\]\['attribute_id'\]: Attribute id.
+      We maintain a default attribute collection and mapping for attribute classification.
+      Please refer to [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L53) for more details.
+    - info\['annotations'\]\[i\]\['id'\]: Annotation id. Defaults to `i`.
+
+Here we only explain the data recorded in the training info files. The same applies to validation and testing set.
+
+The core functions to get `nuscenes_infos_xxx.pkl` and `nuscenes_infos_xxx_mono3d.coco.json` are [\_fill_trainval_infos](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/nuscenes_converter.py#L143) and [get_2d_boxes](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/nuscenes_converter.py#L397), respectively.
+Please refer to [nuscenes_converter.py](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/nuscenes_converter.py) for more details.
+
+## Training pipeline
+
+### LiDAR-Based Methods
+
+A typical training pipeline of LiDAR-based 3D detection (including multi-modality methods) on nuScenes is as below.
+
+```python
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        file_client_args=file_client_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.3925, 0.3925],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0, 0, 0]),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+```
+
+Compared to general cases, nuScenes has a specific `'LoadPointsFromMultiSweeps'` pipeline to load point clouds from consecutive frames. This is a common practice used in this setting.
+Please refer to the nuScenes [original paper](https://arxiv.org/abs/1903.11027) for more details.
+The default `use_dim` in `'LoadPointsFromMultiSweeps'` is `[0, 1, 2, 4]`, where the first 3 dimensions refer to point coordinates and the last refers to timestamp differences.
+Intensity is not used by default because it introduces noise when points from different frames are concatenated.
+
+### Vision-Based Methods
+
+A typical training pipeline of image-based 3D detection on nuScenes is as below.
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFileMono3D'),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_attr_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_bbox_depth=True),
+    dict(type='Resize', img_scale=(1600, 900), keep_ratio=True),
+    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D',
+        keys=[
+            'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d',
+            'gt_labels_3d', 'centers2d', 'depths'
+        ]),
+]
+```
+
+It follows the general pipeline of 2D detection but differs in some details:
+
+- It uses monocular pipelines to load images, which includes additional required information like camera intrinsics.
+- It needs to load 3D annotations.
+- Some data augmentation techniques need to be adjusted, such as `RandomFlip3D`.
+  Currently we do not support more augmentation methods, because how to transfer and apply other techniques is still underexplored.
+
+## Evaluation
+
+An example to evaluate PointPillars with 8 GPUs with nuScenes metrics is as follows.
+
+```shell
+bash ./tools/dist_test.sh configs/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py checkpoints/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth 8 --eval bbox
+```
+
+## Metrics
+
+NuScenes proposes a comprehensive metric, namely nuScenes detection score (NDS), to evaluate different methods and set up the benchmark.
+It consists of mean Average Precision (mAP), Average Translation Error (ATE), Average Scale Error (ASE), Average Orientation Error (AOE), Average Velocity Error (AVE) and Average Attribute Error (AAE).
+Please refer to its [official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more details.
+
+We also adopt this approach for evaluation on nuScenes. An example of printed evaluation results is as follows:
+
+```
+mAP: 0.3197
+mATE: 0.7595
+mASE: 0.2700
+mAOE: 0.4918
+mAVE: 1.3307
+mAAE: 0.1724
+NDS: 0.3905
+Eval time: 170.8s
+
+Per-class results:
+Object Class    AP      ATE     ASE     AOE     AVE     AAE
+car     0.503   0.577   0.152   0.111   2.096   0.136
+truck   0.223   0.857   0.224   0.220   1.389   0.179
+bus     0.294   0.855   0.204   0.190   2.689   0.283
+trailer 0.081   1.094   0.243   0.553   0.742   0.167
+construction_vehicle    0.058   1.017   0.450   1.019   0.137   0.341
+pedestrian      0.392   0.687   0.284   0.694   0.876   0.158
+motorcycle      0.317   0.737   0.265   0.580   2.033   0.104
+bicycle 0.308   0.704   0.299   0.892   0.683   0.010
+traffic_cone    0.555   0.486   0.309   nan     nan     nan
+barrier 0.466   0.581   0.269   0.169   nan     nan
+```
+
+## Testing and make a submission
+
+An example to test PointPillars on nuScenes with 8 GPUs and generate a submission to the leaderboard is as follows.
+
+```shell
+./tools/dist_test.sh configs/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py work_dirs/pp-nus/latest.pth 8 --out work_dirs/pp-nus/results_eval.pkl --format-only --eval-options 'jsonfile_prefix=work_dirs/pp-nus/results_eval'
+```
+
+Note that the testing info should be changed to that for testing set instead of validation set [here](https://github.com/open-mmlab/mmdetection3d/blob/master/configs/_base_/datasets/nus-3d.py#L132).
+
+After generating the `work_dirs/pp-nus/results_eval.json`, you can compress it and submit it to nuScenes benchmark. Please refer to the [nuScenes official website](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) for more information.
+
+We can also visualize the prediction results with our developed visualization tools. Please refer to the [visualization doc](https://mmdetection3d.readthedocs.io/en/latest/useful_tools.html#visualization) for more details.
+
+## Notes
+
+### Transformation between `NuScenesBox` and our `CameraInstanceBoxes`.
+
+In general, the main difference between `NuScenesBox` and our `CameraInstanceBoxes` lies in the yaw definition. `NuScenesBox` defines the rotation with a quaternion or three Euler angles, while ours defines only one yaw angle due to the practical scenario. This requires us to add some additional rotations manually in the pre-processing and post-processing, such as [here](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L673).
+
+In addition, please note that the definitions of corners and locations are decoupled in `NuScenesBox`. For example, in monocular 3D detection, the definition of the box location is in its camera coordinate (see its official [illustration](https://www.nuscenes.org/nuscenes#data-collection) for car setup), which is consistent with [ours](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/core/bbox/structures/cam_box3d.py). In contrast, its corners are defined with the [convention](https://github.com/nutonomy/nuscenes-devkit/blob/02e9200218977193a1058dd7234f935834378319/python-sdk/nuscenes/utils/data_classes.py#L527) "x points forward, y to the left, z up". This results in a different philosophy of dimension and rotation definitions from our `CameraInstanceBoxes`. An example to remove similar hacks is PR [#744](https://github.com/open-mmlab/mmdetection3d/pull/744). The same problem also exists in the LiDAR system. To deal with them, we typically add some transformation in the pre-processing and post-processing to guarantee the box will be in our coordinate system during the entire training and inference procedure.
--- a/doc/visualization.md
+++ b/doc/visualization.md
+
+# FlashOcc
+```shell
+# step 1. generate result 
+bash tools/dist_test.sh projects/configs/flashocc/flashocc-r50.py ckpts/flashocc-r50-256x704.pth 4 --eval map --eval-options show_dir=work_dirs/flashocc_r50/results
+# step 2. visualization
+python tools/analysis_tools/vis_occ.py work_dirs/flashocc_r50/results/ --root_path ./data/nuscenes --save_path ./vis
+```
+
+# Panoptic-FlashOcc
+```shell
+
+exp_name=panoptic-flashocc-r50-depth4d-longterm8f-pano
+python tools/vis_occ.py --config projects/configs/panoptic-flashocc/${exp_name}.py --weights work_dirs/${exp_name}/epoch_24_ema.pth --viz-dir vis/${exp_name} --draw-pano-gt
+
+```
+
--- a/flashocc_lts.tar.gz
+++ b/flashocc_lts.tar.gz
--- a/lib/dvr/dvr.cpp
+++ b/lib/dvr/dvr.cpp
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+
+#include <string>
+#include <torch/extension.h>
+#include <vector>
+
+/*
+ * CUDA forward declarations
+ *
+ * These entry points are implemented on the CUDA side (see dvr.cu). The C++
+ * wrappers further down validate their tensor arguments and then forward to
+ * them unchanged.
+ *
+ *   render_forward_cuda : forward-only ray casting; `phase_name` is expected
+ *                         to be "test" or "train".
+ *   render_cuda         : rendering variant that takes a `loss_name`.
+ *                         NOTE(review): presumably also produces gradients
+ *                         w.r.t. sigma - confirm against dvr.cu.
+ *   init_cuda           : NOTE(review): presumably rasterizes `points` into an
+ *                         occupancy grid described by `grid` - confirm against
+ *                         dvr.cu.
+ */
+
+std::vector<torch::Tensor> render_forward_cuda(torch::Tensor sigma,
+                                               torch::Tensor origin,
+                                               torch::Tensor points,
+                                               torch::Tensor tindex,
+                                               const std::vector<int> grid,
+                                               std::string phase_name);
+
+std::vector<torch::Tensor>
+render_cuda(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points,
+            torch::Tensor tindex, std::string loss_name);
+
+torch::Tensor init_cuda(torch::Tensor points, torch::Tensor tindex,
+                        const std::vector<int> grid);
+
+
+/*
+ * C++ interface
+ */
+
+#define CHECK_CUDA(x)                                                          \
+  TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x)                                                    \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x)                                                         \
+  CHECK_CUDA(x);                                                               \
+  CHECK_CONTIGUOUS(x)
+
+// Python-facing wrapper for the forward-only renderer.
+//
+// Verifies that sigma, origin, points and tindex are contiguous CUDA tensors
+// (CHECK_INPUT raises otherwise) and then forwards every argument unchanged to
+// render_forward_cuda, returning its result as-is.
+//
+// phase_name is expected to be "test" or "train"; any other value is rejected
+// by the CUDA implementation.
+std::vector<torch::Tensor>
+render_forward(torch::Tensor sigma, torch::Tensor origin, torch::Tensor points,
+               torch::Tensor tindex, const std::vector<int> grid,
+               std::string phase_name) {
+  CHECK_INPUT(sigma);
+  CHECK_INPUT(origin);
+  CHECK_INPUT(points);
+  CHECK_INPUT(tindex);
+  return render_forward_cuda(sigma, origin, points, tindex, grid, phase_name);
+}
+
+
+// Python-facing wrapper around render_cuda.
+//
+// Verifies that sigma, origin, points and tindex are contiguous CUDA tensors
+// (CHECK_INPUT raises otherwise) and then forwards every argument unchanged to
+// render_cuda, returning its result as-is.
+//
+// NOTE(review): loss_name presumably selects one of the LossType values
+// declared in dvr.cu - confirm against render_cuda.
+std::vector<torch::Tensor> render(torch::Tensor sigma, torch::Tensor origin,
+                                  torch::Tensor points, torch::Tensor tindex,
+                                  std::string loss_name) {
+  CHECK_INPUT(sigma);
+  CHECK_INPUT(origin);
+  CHECK_INPUT(points);
+  CHECK_INPUT(tindex);
+  return render_cuda(sigma, origin, points, tindex, loss_name);
+}
+
+// Python-facing wrapper around init_cuda.
+//
+// Verifies that points and tindex are contiguous CUDA tensors (CHECK_INPUT
+// raises otherwise) and forwards them, together with grid, to init_cuda.
+// grid itself is not validated here.
+torch::Tensor init(torch::Tensor points, torch::Tensor tindex,
+                   const std::vector<int> grid) {
+  CHECK_INPUT(points);
+  CHECK_INPUT(tindex);
+  return init_cuda(points, tindex, grid);
+}
+
+// Python bindings: exposes the three validated wrappers above as module-level
+// functions `init`, `render` and `render_forward`. The module name is taken
+// from TORCH_EXTENSION_NAME, which is supplied by the build.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("init", &init, "Initialize");
+  m.def("render", &render, "Render");
+  m.def("render_forward", &render_forward, "Render (forward pass only)");
+}
--- a/lib/dvr/dvr.cu
+++ b/lib/dvr/dvr.cu
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+
+#include <torch/extension.h>
+#include <stdio.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+#include <string>
+#include <iostream>
+
+// Length of the per-ray scratch arrays declared by render_cuda_kernel.
+// NOTE(review): the trailing note suggests it is the sum of the largest
+// supported grid extents plus one - confirm before changing grid sizes.
+#define MAX_D 1446 // 700 + 700 + 45 + 1
+// Upper bound on voxel-traversal iterations per ray in
+// render_forward_cuda_kernel; the loop stops once its step counter exceeds it.
+#define MAX_STEP 1000
+
+// Loss selector passed to render_cuda_kernel.
+enum LossType {L1, L2, ABSREL};
+// Phase selector for the forward renderer (TEST == 0, TRAIN == 1);
+// render_forward_cuda maps the strings "test" / "train" onto it.
+enum PhaseName {TEST, TRAIN};
+
+/*
+ * Flags the voxel that contains each ray end point as occupied.
+ *
+ * One thread handles one ray: blockIdx.y selects the batch element and the
+ * flattened x index selects the ray within it. The end point is read from
+ * points[batch][ray][0..2] as (x, y, z) and truncated to integer voxel
+ * indices; if the voxel lies inside the grid, occupancy[batch][time][z][y][x]
+ * is set to 1. Rays with a negative time index are padding and are ignored.
+ */
+template <typename scalar_t>
+__global__ void init_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy) {
+
+    const auto batch = blockIdx.y;
+    const auto ray = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // the launch may provide more threads than there are rays
+    const auto num_rays = points.size(1);
+    if (ray >= num_rays) return;
+
+    const auto num_frames = occupancy.size(1);
+    const auto frame = tindex[batch][ray];
+
+    // a time index past the grid's time axis is only tolerated for a static grid
+    assert(num_frames == 1 || frame < num_frames);
+
+    // a negative time index marks a padded point
+    if (frame < 0) return;
+
+    // a static grid (single time step) always uses slot 0
+    const auto slot = (num_frames == 1) ? 0 : frame;
+
+    // spatial extents of the grid
+    const int size_z = occupancy.size(2);
+    const int size_y = occupancy.size(3);
+    const int size_x = occupancy.size(4);
+
+    // voxel containing the end point (truncation towards zero)
+    const int ix = static_cast<int>(points[batch][ray][0]);
+    const int iy = static_cast<int>(points[batch][ray][1]);
+    const int iz = static_cast<int>(points[batch][ray][2]);
+
+    // end points that fall outside the grid leave it untouched
+    const bool outside = ix < 0 || ix >= size_x ||
+                         iy < 0 || iy >= size_y ||
+                         iz < 0 || iz >= size_z;
+    if (outside) return;
+
+    occupancy[batch][slot][iz][iy][ix] = 1;
+}
+
+/*
+ * Forward-only ray casting through the voxel grid `sigma`; one thread per ray.
+ *
+ * Each ray starts at origin[n][t] (t = tindex[n][c]) and points towards
+ * points[n][c]. All coordinates are in voxel units: a position is mapped to
+ * its voxel by truncation to int. The ray is walked voxel by voxel until it
+ * has entered and left the grid again, or until MAX_STEP iterations are
+ * exceeded.
+ *
+ * Outputs, written only for rays that cross at least one in-grid voxel (other
+ * entries keep whatever the caller initialised them to):
+ *   pred_dist[n][c]   : distance from the origin at which the ray leaves the
+ *                       first in-grid voxel with sigma > 0.5; if no voxel
+ *                       passes the threshold, the distance at which it leaves
+ *                       the last in-grid voxel visited (max_d).
+ *   gt_dist[n][c]     : |points - origin|, clipped to max_d when train_phase
+ *                       is TRAIN.
+ *   coord_index[n][c] : (x, y, z) index of the voxel selected for pred_dist.
+ *
+ * Rays whose time index is negative are treated as padding and skipped.
+ *
+ * The traversal is a single pass: the first voxel above the threshold and the
+ * last in-grid voxel are tracked on the fly, so no per-ray scratch arrays are
+ * needed.
+ *
+ * NOTE(review): a ray whose end point coincides with its origin has gt_d == 0,
+ * which makes the direction NaN; that case is not special-cased here.
+ */
+template <typename scalar_t>
+__global__ void render_forward_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> coord_index,
+    PhaseName train_phase) {
+
+    // batch index
+    const auto n = blockIdx.y;
+
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // num of rays / num of time steps in sigma
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+
+    // the launch rounds the thread count up, so surplus threads have no ray
+    if (c >= M) return;
+
+    // time index of this ray
+    const auto t = tindex[n][c];
+
+    // a time index past sigma's time axis is only tolerated for a static sigma
+    assert(T == 1 || t < T);
+
+    // if t < 0, it is a padded point
+    if (t < 0) return;
+
+    // time index for sigma: when T = 1, we have a static sigma
+    const auto ts = (T == 1) ? 0 : t;
+
+    // grid shape
+    const int vzsize = sigma.size(2);
+    const int vysize = sigma.size(3);
+    const int vxsize = sigma.size(4);
+
+    // origin (indexed with the ray's own time index t, not ts)
+    const double xo = origin[n][t][0];
+    const double yo = origin[n][t][1];
+    const double zo = origin[n][t][2];
+
+    // end point
+    const double xe = points[n][c][0];
+    const double ye = points[n][c][1];
+    const double ze = points[n][c][2];
+
+    // current voxel, starting at the voxel where the origin resides
+    int vx = int(xo);
+    int vy = int(yo);
+    int vz = int(zo);
+
+    // origin to end
+    const double rx = xe - xo;
+    const double ry = ye - yo;
+    const double rz = ze - zo;
+    double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+    // directional vector
+    const double dx = rx / gt_d;
+    const double dy = ry / gt_d;
+    const double dz = rz / gt_d;
+
+    // In which direction the voxel ids are incremented.
+    const int stepX = (dx >= 0) ? 1 : -1;
+    const int stepY = (dy >= 0) ? 1 : -1;
+    const int stepZ = (dz >= 0) ? 1 : -1;
+
+    // Coordinate of the first voxel border the ray will cross on each axis.
+    const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+    const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+    const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+    // tMaxX, tMaxY, tMaxZ -- distance along the ray until the next
+    // intersection with a voxel border on that axis; DBL_MAX if the ray never
+    // moves along the axis
+    double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX;
+    double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX;
+    double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX;
+
+    // tDeltaX, tDeltaY, tDeltaZ -- how far along the ray we must move for the
+    // component on that axis to equal the width of one voxel
+    const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+    const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+    const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+    int step = 0;   // total number of voxels traversed
+    int count = 0;  // number of voxels traversed inside the voxel grid
+    bool was_inside = false;
+
+    // first in-grid voxel with sigma > 0.5, if any
+    bool hit = false;
+    double hit_d = 0.0;
+    int hit_x = 0;
+    int hit_y = 0;
+    int hit_z = 0;
+
+    // last in-grid voxel visited and the distance at which the ray leaves it
+    double max_d = 0.0;
+    int last_x = 0;
+    int last_y = 0;
+    int last_z = 0;
+
+    // voxel traversal raycasting
+    while (true) {
+        const bool inside = (0 <= vx && vx < vxsize) &&
+            (0 <= vy && vy < vysize) &&
+            (0 <= vz && vz < vzsize);
+
+        // was inside but no longer: we are not coming back, so terminate
+        if (!inside && was_inside) break;
+
+        // remember the voxel we are in before stepping out of it
+        const int cx = vx;
+        const int cy = vy;
+        const int cz = vz;
+
+        // _d is the distance the ray has traveled when it escapes the
+        // current voxel; advance along the axis whose border comes first
+        double _d = 0.0;
+        if (tMaxX < tMaxY) {
+            if (tMaxX < tMaxZ) {
+                _d = tMaxX;
+                vx += stepX;
+                tMaxX += tDeltaX;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        } else {
+            if (tMaxY < tMaxZ) {
+                _d = tMaxY;
+                vy += stepY;
+                tMaxY += tDeltaY;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        }
+
+        if (inside) {
+            was_inside = true;
+
+            // always track the most recent in-grid voxel
+            max_d = _d;
+            last_x = cx;
+            last_y = cy;
+            last_z = cz;
+
+            // only the first voxel above the threshold counts as the hit
+            if (!hit) {
+                const double occ = sigma[n][ts][cz][cy][cx];
+                if (occ > 0.5) {
+                    hit = true;
+                    hit_d = _d;
+                    hit_x = cx;
+                    hit_y = cy;
+                    hit_z = cz;
+                }
+            }
+
+            count ++;
+        }
+
+        step ++;
+
+        // safety cap for rays that never reach (or never leave) the grid
+        if (step > MAX_STEP) {
+            break;
+        }
+    }
+
+    // the ray never touched the grid: leave the outputs untouched
+    if (count == 0) return;
+
+    // fall back to the last in-grid voxel when nothing passed the threshold
+    const double exp_d = hit ? hit_d : max_d;
+    const int x = hit ? hit_x : last_x;
+    const int y = hit ? hit_y : last_y;
+    const int z = hit ? hit_z : last_z;
+
+    // during training the target cannot lie beyond the grid exit
+    if (train_phase == TRAIN) {
+        gt_d = min(gt_d, max_d);
+    }
+
+    // write the rendered ray distance and the target distance
+    pred_dist[n][c] = exp_d;
+    gt_dist[n][c] = gt_d;
+
+    // write the index of the selected voxel
+    coord_index[n][c][0] = double(x);
+    coord_index[n][c][1] = double(y);
+    coord_index[n][c][2] = double(z);
+}
+
+/*
+ * Host-side launcher for render_forward_cuda_kernel.
+ *
+ * input shape
+ *   sigma    : N x T x H x L x W
+ *   origin   : N x T x 3
+ *   points   : N x M x 4
+ *   tindex   : N x M
+ * output (returned in this order)
+ *   pred_dist   : N x M      (initialised to -1)
+ *   gt_dist     : N x M      (initialised to -1)
+ *   coord_index : N x M x 3  (initialised to 0)
+ *
+ * The kernel only overwrites entries of rays that cross the grid, so the
+ * initial values remain for every other ray.
+ *
+ * phase_name must be "test" or "train"; anything else raises an error.
+ * grid is accepted for interface compatibility but is not needed here: the
+ * kernel reads the grid extents from sigma itself.
+ *
+ * The outputs are allocated with sigma's dtype and device because the kernel
+ * is dispatched on sigma's dtype and accesses every tensor as that type.
+ */
+std::vector<torch::Tensor> render_forward_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid,
+    std::string phase_name) {
+
+    (void)grid;  // unused, see the note above
+
+    // resolve the phase first so that a bad argument fails before any
+    // allocation; raising (instead of exiting) keeps the Python process alive
+    PhaseName train_phase = TEST;
+    if (phase_name.compare("test") == 0) {
+        train_phase = TEST;
+    } else if (phase_name.compare("train") == 0) {
+        train_phase = TRAIN;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN PHASE NAME: ", phase_name);
+    }
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    // outputs share sigma's dtype and device
+    const auto options = sigma.options();
+    auto gt_dist = -torch::ones({N, M}, options);
+    auto pred_dist = -torch::ones({N, M}, options);
+    auto coord_index = torch::zeros({N, M, 3}, options);
+
+    // nothing to render; a zero-sized grid would be an invalid launch
+    if (N == 0 || M == 0) {
+        return {pred_dist, gt_dist, coord_index};
+    }
+
+    // one thread per ray, one row of blocks per batch element
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    // perform rendering
+    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_forward_cuda", ([&] {
+                render_forward_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                    sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    coord_index.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    train_phase);
+            }));
+
+    // surface launch-configuration errors (e.g. too many resources requested)
+    const cudaError_t launch_err = cudaGetLastError();
+    TORCH_CHECK(launch_err == cudaSuccess,
+                "render_forward_cuda kernel launch failed: ",
+                cudaGetErrorString(launch_err));
+
+    // wait for the kernel and surface errors raised while it was running
+    const cudaError_t sync_err = cudaDeviceSynchronize();
+    TORCH_CHECK(sync_err == cudaSuccess,
+                "render_forward_cuda kernel execution failed: ",
+                cudaGetErrorString(sync_err));
+
+    return {pred_dist, gt_dist, coord_index};
+}
+
+template <typename scalar_t>
+__global__ void render_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    // const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma,
+    // torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma_count,
+    LossType loss_type) {
+
+    // batch index
+    const auto n = blockIdx.y;
+
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // num of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+
+    // we allocated more threads than num_rays
+    if (c < M) {
+        // ray end point
+        const auto t = tindex[n][c];
+
+        // invalid points
+        // assert(t < T);
+        assert(T == 1 || t < T);
+
+        // time index for sigma
+        // when T = 1, we have a static sigma
+        const auto ts = (T == 1) ? 0 : t;
+
+        // if t < 0, it is a padded point
+        if (t < 0) return;
+
+        // grid shape
+        const int vzsize = sigma.size(2);
+        const int vysize = sigma.size(3);
+        const int vxsize = sigma.size(4);
+        // assert(vzsize + vysize + vxsize <= MAX_D);
+
+        // origin
+        const double xo = origin[n][t][0];
+        const double yo = origin[n][t][1];
+        const double zo = origin[n][t][2];
+
+        // end point
+        const double xe = points[n][c][0];
+        const double ye = points[n][c][1];
+        const double ze = points[n][c][2];
+
+        // locate the voxel where the origin resides
+        const int vxo = int(xo);
+        const int vyo = int(yo);
+        const int vzo = int(zo);
+
+        //
+        const int vxe = int(xe);
+        const int vye = int(ye);
+        const int vze = int(ze);
+
+        // NOTE: new
+        int vx = vxo;
+        int vy = vyo;
+        int vz = vzo;
+
+        // origin to end
+        const double rx = xe - xo;
+        const double ry = ye - yo;
+        const double rz = ze - zo;
+        double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+        // directional vector
+        const double dx = rx / gt_d;
+        const double dy = ry / gt_d;
+        const double dz = rz / gt_d;
+
+        // In which direction the voxel ids are incremented.
+        const int stepX = (dx >= 0) ? 1 : -1;
+        const int stepY = (dy >= 0) ? 1 : -1;
+        const int stepZ = (dz >= 0) ? 1 : -1;
+
+        // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
+        const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+        const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+        const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+        // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
+        // the value of t at which the ray crosses the first vertical voxel boundary
+        double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
+        double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
+        double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
+
+        // tDeltaX, tDeltaY, tDeltaZ --
+        // how far along the ray we must move for the horizontal component to equal the width of a voxel
+        // the direction in which we traverse the grid
+        // can only be FLT_MAX if we never go in that direction
+        const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+        const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+        const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+        int3 path[MAX_D];
+        double csd[MAX_D];  // cumulative sum of sigma times delta
+        double p[MAX_D];  // alpha
+        double d[MAX_D];
+        double dt[MAX_D];
+
+        // forward raymarching with voxel traversal
+        int step = 0;  // total number of voxels traversed
+        int count = 0;  // number of voxels traversed inside the voxel grid
+        double last_d = 0.0;  // correct initialization
+
+        // voxel traversal raycasting
+        bool was_inside = false;
+        while (true) {
+            bool inside = (0 <= vx && vx < vxsize) &&
+                (0 <= vy && vy < vysize) &&
+                (0 <= vz && vz < vzsize);
+            if (inside) { // now inside
+                was_inside = true;
+                path[count] = make_int3(vx, vy, vz);
+            } else if (was_inside) { // was inside but no longer
+                // we know we are not coming back so terminate
+                break;
+            } else if (last_d > gt_d) {
+                break;
+            } /* else { // has not gone inside yet
+                // assert(count == 0);
+                // (1) when we have hit the destination but haven't gone inside the voxel grid
+                // (2) when we have traveled MAX_D voxels but haven't found one valid voxel
+                //     handle intersection corner cases in case of infinite loop
+                // bool hit = (vx == vxe && vy == vye && vz == vze);
+                // if (hit || step >= MAX_D)
+                //     break;
+                if (last_d >= gt_d || step >= MAX_D) break;
+            } */
+            // _d represents the ray distance has traveled before escaping the current voxel cell
+            double _d = 0.0;
+            // voxel traversal
+            if (tMaxX < tMaxY) {
+                if (tMaxX < tMaxZ) {
+                    _d = tMaxX;
+                    vx += stepX;
+                    tMaxX += tDeltaX;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            } else {
+                if (tMaxY < tMaxZ) {
+                    _d = tMaxY;
+                    vy += stepY;
+                    tMaxY += tDeltaY;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            }
+            if (inside) {
+                // get sigma at the current voxel
+                const int3 &v = path[count];  // use the recorded index
+                const double _sigma = sigma[n][ts][v.z][v.y][v.x];
+                const double _delta = max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                const double sd = _sigma * _delta;
+                if (count == 0) { // the first voxel inside
+                    csd[count] = sd;
+                    p[count] = 1 - exp(-sd);
+                } else {
+                    csd[count] = csd[count-1] + sd;
+                    p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                }
+                // record the traveled distance
+                d[count] = _d;
+                dt[count] = _delta;
+                // count the number of voxels we have escaped
+                count ++;
+            }
+            last_d = _d;
+            step ++;
+
+            if (step > MAX_STEP) {
+                break;
+            }
+        }
+
+        // the total number of voxels visited should not exceed this number
+        assert(count <= MAX_D);
+
+        // WHEN THERE IS AN INTERSECTION BETWEEN THE RAY AND THE VOXEL GRID
+        if (count > 0) {
+            // compute the expected ray distance
+            double exp_d = 0.0;
+            for (int i = 0; i < count; i ++)
+                exp_d += p[i] * d[i];
+
+            // add an imaginary sample at the end point should gt_d exceeds max_d
+            double p_out = exp(-csd[count-1]);
+            double max_d = d[count-1];
+
+            exp_d += (p_out * max_d);
+            gt_d = min(gt_d, max_d);
+
+            // write the rendered ray distance (max_d)
+            pred_dist[n][c] = exp_d;
+            gt_dist[n][c] = gt_d;
+
+            /* backward raymarching */
+            double dd_dsigma[MAX_D];
+            for (int i = count - 1; i >= 0; i --) {
+                // NOTE: probably need to double check again
+                if (i == count - 1)
+                    dd_dsigma[i] = p_out * max_d;
+                else
+                    dd_dsigma[i] = dd_dsigma[i+1] - exp(-csd[i]) * (d[i+1] - d[i]);
+            }
+
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] *= dt[i];
+
+            // option 2: cap at the boundary
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] -= dt[i] * p_out * max_d;
+
+            double dl_dd = 1.0;
+            if (loss_type == L1)
+                dl_dd = (exp_d >= gt_d) ? 1 : -1;
+            else if (loss_type == L2)
+                dl_dd = (exp_d - gt_d);
+            else if (loss_type == ABSREL)
+                dl_dd = (exp_d >= gt_d) ? (1.0/gt_d) : -(1.0/gt_d);
+
+            // apply chain rule
+            for (int i = 0; i < count; i ++) {
+                const int3 &v = path[i];
+                // NOTE: potential race conditions when writing gradients
+                grad_sigma[n][ts][v.z][v.y][v.x] += dl_dd * dd_dsigma[i];
+                // grad_sigma_count[n][ts][v.z][v.y][v.x] += 1;
+            }
+        }
+    }
+}
+
+/*
+ * Renders the expected depth of every ray through the sigma grid and
+ * accumulates d(loss)/d(sigma) for the requested depth loss.
+ *
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin     : N x T x 3
+ *   points     : N x M x C (the kernel reads only the first three components)
+ *   tindex     : N x M, accessed with sigma's scalar type
+ *   loss_name  : "l1", "l2", "absrel" or "bce" ("bce" uses the L1 gradient)
+ * output shape
+ *   pred_dist  : N x M (stays -1 for rays the kernel does not write)
+ *   gt_dist    : N x M (stays -1 for rays the kernel does not write)
+ *   grad_sigma : N x T x H x L x W
+ *
+ * Raises a c10::Error for an unknown loss name or a failed kernel launch.
+ */
+std::vector<torch::Tensor> render_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    std::string loss_name) {
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    // resolve the loss first so that a bad name is reported as an exception
+    // instead of terminating the whole host process
+    LossType loss_type = L1;
+    if (loss_name == "l1" || loss_name == "bce") {
+        loss_type = L1;
+    } else if (loss_name == "l2") {
+        loss_type = L2;
+    } else if (loss_name == "absrel") {
+        loss_type = ABSREL;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN LOSS TYPE: ", loss_name);
+    }
+
+    // allocate the outputs with sigma's dtype and device so that the
+    // scalar_t accessors below also match when sigma is double
+    const auto options = sigma.options().requires_grad(false);
+    auto gt_dist = torch::full({N, M}, -1, options);
+    auto pred_dist = torch::full({N, M}, -1, options);
+    auto grad_sigma = torch::zeros_like(sigma);
+
+    // a zero-sized grid dimension is an invalid launch configuration
+    if (N == 0 || M == 0) {
+        return {pred_dist, gt_dist, grad_sigma};
+    }
+
+    // one thread per ray, one row of blocks per batch element
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_cuda", ([&] {
+                render_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                    sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    grad_sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    loss_type);
+            }));
+
+    // surface launch and execution failures instead of silently dropping them
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    TORCH_CHECK(err == cudaSuccess, "render_cuda failed: ", cudaGetErrorString(err));
+
+    return {pred_dist, gt_dist, grad_sigma};
+}
+
+
+/*
+ * Builds an occupancy grid in which every voxel that contains at least one
+ * ray end point is marked by the init kernel.
+ *
+ * input shape
+ *   points   : N x M x 3
+ *   tindex   : N x M
+ *   grid     : {T, H, L, W}
+ * output shape
+ *   occupancy: N x T x H x L x W, same dtype and device as points
+ *
+ * Raises a c10::Error when grid has fewer than four entries or the kernel
+ * launch fails.
+ */
+torch::Tensor init_cuda(
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid) {
+
+    // reading grid[0..3] from a shorter vector would be undefined behaviour
+    TORCH_CHECK(grid.size() >= 4,
+                "init_cuda: grid must provide {T, H, L, W}, got ",
+                grid.size(), " value(s)");
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+
+    const auto dtype = points.dtype();
+    const auto device = points.device();
+    const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
+    auto occupancy = torch::zeros({N, T, H, L, W}, options);
+
+    // a zero-sized grid dimension is an invalid launch configuration
+    if (N == 0 || M == 0) {
+        return occupancy;
+    }
+
+    // one thread per ray, one row of blocks per batch element
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    // initialize occupancy such that every voxel with one or more points is occupied
+    AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "init_cuda", ([&] {
+                init_cuda_kernel<scalar_t><<<blocks, threads>>>(
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>());
+            }));
+
+    // surface launch and execution failures instead of silently dropping them
+    cudaError_t err = cudaGetLastError();
+    if (err == cudaSuccess) {
+        err = cudaDeviceSynchronize();
+    }
+    TORCH_CHECK(err == cudaSuccess, "init_cuda failed: ", cudaGetErrorString(err));
+
+    return occupancy;
+}
\ No newline at end of file
--- a/lib/dvr/dvr.hip
+++ b/lib/dvr/dvr.hip
+// !!! This is a file automatically generated by hipify!!!
+#include <ATen/dtk_macros.h>
+// Acknowledgments: https://github.com/tarashakhurana/4d-occ-forecasting
+// Modified by Haisong Liu
+
+#include <torch/extension.h>
+#include <stdio.h>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime.h>
+#include <vector>
+#include <string>
+#include <iostream>
+
+#define MAX_D 1446 // 700 + 700 + 45 + 1; capacity of the per-ray scratch arrays (path, csd, p, d, ...) in the render kernels
+#define MAX_STEP 1000 // the voxel-traversal loops in the render kernels break once `step` exceeds this value
+// LossType selects the dl_dd expression in render_cuda_kernel; PhaseName is passed to render_forward_cuda_kernel, which clamps gt_d only for TRAIN.
+enum LossType {L1, L2, ABSREL};
+enum PhaseName {TEST, TRAIN};
+
+// Marks the voxel that contains each valid ray end point as occupied.
+// Each thread handles one ray; blockIdx.y selects the batch element.
+template <typename scalar_t>
+__global__ void init_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy) {
+
+    const auto batch = blockIdx.y;
+    const auto ray = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const auto num_rays = points.size(1);
+    const auto num_frames = occupancy.size(1);
+
+    // the launch rounds the thread count up, so surplus threads have no ray
+    if (!(ray < num_rays)) {
+        return;
+    }
+
+    // time index of this ray's end point
+    const auto frame = tindex[batch][ray];
+
+    // an index past the last frame is only tolerated for a single-frame grid
+    assert(num_frames == 1 || frame < num_frames);
+
+    // a negative time index marks a padded point
+    if (frame < 0) {
+        return;
+    }
+
+    // a single-frame grid collects every point in frame 0
+    const auto slot = (num_frames == 1) ? 0 : frame;
+
+    // extent of the grid along z, y and x
+    const int depth = occupancy.size(2);
+    const int height = occupancy.size(3);
+    const int width = occupancy.size(4);
+
+    // voxel index of the end point (conversion truncates toward zero)
+    const int ix = int(points[batch][ray][0]);
+    const int iy = int(points[batch][ray][1]);
+    const int iz = int(points[batch][ray][2]);
+
+    const bool x_ok = (0 <= ix) && (ix < width);
+    const bool y_ok = (0 <= iy) && (iy < height);
+    const bool z_ok = (0 <= iz) && (iz < depth);
+
+    // end points outside the grid are ignored
+    if (x_ok && y_ok && z_ok) {
+        occupancy[batch][slot][iz][iy][ix] = 1;
+    }
+}
+
+// Casts one ray per thread from origin[n][t] towards points[n][c] through the
+// sigma grid and reports the first voxel on the ray whose sigma exceeds 0.5.
+//
+// Outputs (written only when tindex[n][c] >= 0 and the ray touches the grid):
+//   pred_dist[n][c]   distance along the ray at which it leaves the first
+//                     voxel with sigma > 0.5; if there is none, the distance
+//                     at which it leaves the last traversed in-grid voxel
+//   gt_dist[n][c]     origin-to-end-point distance; clamped to the exit
+//                     distance of the last traversed in-grid voxel when
+//                     train_phase is TRAIN
+//   coord_index[n][c] (x, y, z) index of the voxel used for pred_dist
+//
+// Distances are in grid coordinates: voxel boundaries sit at integer values.
+// NOTE(review): a ray whose end point coincides with its origin has gt_d == 0,
+// which makes the direction NaN; callers presumably never pass such rays.
+// NOTE(review): origin is indexed with t even when sigma is static (T == 1);
+// confirm that origin always has a slot for every time index in tindex.
+template <typename scalar_t>
+__global__ void render_forward_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> coord_index,
+    PhaseName train_phase) {
+
+    // the traversal runs at most MAX_STEP + 1 iterations and records at most
+    // one voxel per iteration, so the scratch arrays below cannot overflow
+    static_assert(MAX_STEP < MAX_D, "scratch arrays must hold every traversed voxel");
+
+    // batch index
+    const auto n = blockIdx.y;
+
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+
+    const auto M = points.size(1);  // num of rays
+    const auto T = sigma.size(1);   // num of time steps in sigma
+
+    // we allocated more threads than num_rays
+    if (!(c < M)) return;
+
+    // time index of the ray
+    const auto t = tindex[n][c];
+
+    // invalid points
+    assert(T == 1 || t < T);
+
+    // if t < 0, it is a padded point
+    if (t < 0) return;
+
+    // time index for sigma; when T = 1, we have a static sigma
+    const auto ts = (T == 1) ? 0 : t;
+
+    // grid shape
+    const int vzsize = sigma.size(2);
+    const int vysize = sigma.size(3);
+    const int vxsize = sigma.size(4);
+
+    // origin
+    const double xo = origin[n][t][0];
+    const double yo = origin[n][t][1];
+    const double zo = origin[n][t][2];
+
+    // end point
+    const double xe = points[n][c][0];
+    const double ye = points[n][c][1];
+    const double ze = points[n][c][2];
+
+    // start the traversal at the voxel where the origin resides
+    int vx = int(xo);
+    int vy = int(yo);
+    int vz = int(zo);
+
+    // origin to end
+    const double rx = xe - xo;
+    const double ry = ye - yo;
+    const double rz = ze - zo;
+    double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+    // directional vector
+    const double dx = rx / gt_d;
+    const double dy = ry / gt_d;
+    const double dz = rz / gt_d;
+
+    // In which direction the voxel ids are incremented.
+    const int stepX = (dx >= 0) ? 1 : -1;
+    const int stepY = (dy >= 0) ? 1 : -1;
+    const int stepZ = (dz >= 0) ? 1 : -1;
+
+    // coordinate of the first voxel boundary the ray meets along each axis
+    const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+    const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+    const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+    // tMaxX, tMaxY, tMaxZ -- distance along the ray until the next
+    // intersection with a voxel boundary of that axis
+    double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX;
+    double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX;
+    double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX;
+
+    // tDeltaX, tDeltaY, tDeltaZ -- distance along the ray between two
+    // consecutive boundaries of that axis; DBL_MAX if the ray never moves
+    // along the axis
+    const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+    const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+    const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+    int3 path[MAX_D];  // in-grid voxels in traversal order
+    double d[MAX_D];   // ray distance at which each of them is left
+
+    // forward raymarching with voxel traversal
+    int step = 0;   // total number of voxels traversed
+    int count = 0;  // number of voxels traversed inside the voxel grid
+
+    bool was_inside = false;
+    while (true) {
+        const bool inside = (0 <= vx && vx < vxsize) &&
+            (0 <= vy && vy < vysize) &&
+            (0 <= vz && vz < vzsize);
+        if (inside) {
+            was_inside = true;
+            path[count] = make_int3(vx, vy, vz);
+        } else if (was_inside) {
+            // was but no longer inside: we are not coming back, so terminate
+            break;
+        }
+
+        // _d is the ray distance traveled when escaping the current voxel
+        double _d = 0.0;
+        // advance along the axis whose boundary is closest
+        if (tMaxX < tMaxY) {
+            if (tMaxX < tMaxZ) {
+                _d = tMaxX;
+                vx += stepX;
+                tMaxX += tDeltaX;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        } else {
+            if (tMaxY < tMaxZ) {
+                _d = tMaxY;
+                vy += stepY;
+                tMaxY += tDeltaY;
+            } else {
+                _d = tMaxZ;
+                vz += stepZ;
+                tMaxZ += tDeltaZ;
+            }
+        }
+
+        if (inside) {
+            // record the traveled distance for the voxel stored above
+            d[count] = _d;
+            count ++;
+        }
+        step ++;
+
+        // bounds the walk of rays that never reach the grid
+        if (step > MAX_STEP) {
+            break;
+        }
+    }
+
+    // the total number of voxels visited should not exceed this number
+    assert(count <= MAX_D);
+
+    // the ray never touched the grid: leave the outputs untouched
+    if (count == 0) return;
+
+    // default to the last in-grid voxel when no voxel is occupied
+    const double max_d = d[count-1];
+    double exp_d = max_d;
+    int3 hit = path[count-1];
+
+    // the first voxel along the ray with sigma above 0.5 wins
+    for (int i = 0; i < count; i++) {
+        const int3 &v = path[i];
+        const double occ = sigma[n][ts][v.z][v.y][v.x];
+        if (occ > 0.5) {
+            exp_d = d[i];
+            hit = v;
+            break;
+        }
+    }
+
+    // in training the target cannot lie beyond the point where the ray
+    // leaves the grid
+    if (train_phase == TRAIN) {
+        gt_d = min(gt_d, max_d);
+    }
+
+    // write the rendered ray distance and the voxel it belongs to
+    pred_dist[n][c] = exp_d;
+    gt_dist[n][c] = gt_d;
+
+    coord_index[n][c][0] = double(hit.x);
+    coord_index[n][c][1] = double(hit.y);
+    coord_index[n][c][2] = double(hit.z);
+}
+
+/*
+ * Casts every ray through the sigma grid and returns, per ray, the distance
+ * and voxel index reported by render_forward_cuda_kernel.
+ *
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin     : N x T x 3
+ *   points     : N x M x C (the kernel reads only the first three components)
+ *   tindex     : N x M, accessed with sigma's scalar type
+ *   grid       : unused; the kernel takes the grid shape from sigma
+ *   phase_name : "test" or "train"
+ * output shape
+ *   pred_dist   : N x M     (stays -1 for rays the kernel does not write)
+ *   gt_dist     : N x M     (stays -1 for rays the kernel does not write)
+ *   coord_index : N x M x 3 (stays 0 for rays the kernel does not write)
+ *
+ * Raises a c10::Error for an unknown phase name or a failed kernel launch.
+ */
+std::vector<torch::Tensor> render_forward_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid,
+    std::string phase_name) {
+
+    // kept for interface compatibility only; it is deliberately not indexed
+    // so that a short vector cannot cause an out-of-bounds read
+    (void)grid;
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    // resolve the phase first so that a bad name is reported as an exception
+    // instead of terminating the whole host process
+    PhaseName train_phase = TEST;
+    if (phase_name == "test") {
+        train_phase = TEST;
+    } else if (phase_name == "train") {
+        train_phase = TRAIN;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN PHASE NAME: ", phase_name);
+    }
+
+    // allocate the outputs with sigma's dtype and device so that the
+    // scalar_t accessors below also match when sigma is double
+    const auto options = sigma.options().requires_grad(false);
+    auto gt_dist = torch::full({N, M}, -1, options);
+    auto pred_dist = torch::full({N, M}, -1, options);
+    auto coord_index = torch::zeros({N, M, 3}, options);
+
+    // a zero-sized grid dimension is an invalid launch configuration
+    if (N == 0 || M == 0) {
+        return {pred_dist, gt_dist, coord_index};
+    }
+
+    // one thread per ray, one row of blocks per batch element
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_forward_cuda", ([&] {
+               hipLaunchKernelGGL(( render_forward_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
+                    sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    coord_index.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    train_phase);
+            }));
+
+    // surface launch and execution failures instead of silently dropping them
+    hipError_t err = hipGetLastError();
+    if (err == hipSuccess) {
+        err = hipDeviceSynchronize();
+    }
+    TORCH_CHECK(err == hipSuccess, "render_forward_cuda failed: ", hipGetErrorString(err));
+
+    return {pred_dist, gt_dist, coord_index};
+}
+
+template <typename scalar_t>
+__global__ void render_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> sigma,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> origin,
+    const torch::PackedTensorAccessor32<scalar_t,3,torch::RestrictPtrTraits> points,
+    const torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> tindex,
+    // const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> occupancy,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> pred_dist,
+    torch::PackedTensorAccessor32<scalar_t,2,torch::RestrictPtrTraits> gt_dist,
+    torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma,
+    // torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> grad_sigma_count,
+    LossType loss_type) {
+
+    // batch index
+    const auto n = blockIdx.y;
+
+    // ray index
+    const auto c = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // num of rays
+    const auto M = points.size(1);
+    const auto T = sigma.size(1);
+
+    // we allocated more threads than num_rays
+    if (c < M) {
+        // ray end point
+        const auto t = tindex[n][c];
+
+        // invalid points
+        // assert(t < T);
+        assert(T == 1 || t < T);
+
+        // time index for sigma
+        // when T = 1, we have a static sigma
+        const auto ts = (T == 1) ? 0 : t;
+
+        // if t < 0, it is a padded point
+        if (t < 0) return;
+
+        // grid shape
+        const int vzsize = sigma.size(2);
+        const int vysize = sigma.size(3);
+        const int vxsize = sigma.size(4);
+        // assert(vzsize + vysize + vxsize <= MAX_D);
+
+        // origin
+        const double xo = origin[n][t][0];
+        const double yo = origin[n][t][1];
+        const double zo = origin[n][t][2];
+
+        // end point
+        const double xe = points[n][c][0];
+        const double ye = points[n][c][1];
+        const double ze = points[n][c][2];
+
+        // locate the voxel where the origin resides
+        const int vxo = int(xo);
+        const int vyo = int(yo);
+        const int vzo = int(zo);
+
+        //
+        const int vxe = int(xe);
+        const int vye = int(ye);
+        const int vze = int(ze);
+
+        // NOTE: new
+        int vx = vxo;
+        int vy = vyo;
+        int vz = vzo;
+
+        // origin to end
+        const double rx = xe - xo;
+        const double ry = ye - yo;
+        const double rz = ze - zo;
+        double gt_d = sqrt(rx * rx + ry * ry + rz * rz);
+
+        // directional vector
+        const double dx = rx / gt_d;
+        const double dy = ry / gt_d;
+        const double dz = rz / gt_d;
+
+        // In which direction the voxel ids are incremented.
+        const int stepX = (dx >= 0) ? 1 : -1;
+        const int stepY = (dy >= 0) ? 1 : -1;
+        const int stepZ = (dz >= 0) ? 1 : -1;
+
+        // Distance along the ray to the next voxel border from the current position (tMaxX, tMaxY, tMaxZ).
+        const double next_voxel_boundary_x = vx + (stepX < 0 ? 0 : 1);
+        const double next_voxel_boundary_y = vy + (stepY < 0 ? 0 : 1);
+        const double next_voxel_boundary_z = vz + (stepZ < 0 ? 0 : 1);
+
+        // tMaxX, tMaxY, tMaxZ -- distance until next intersection with voxel-border
+        // the value of t at which the ray crosses the first vertical voxel boundary
+        double tMaxX = (dx!=0) ? (next_voxel_boundary_x - xo)/dx : DBL_MAX; //
+        double tMaxY = (dy!=0) ? (next_voxel_boundary_y - yo)/dy : DBL_MAX; //
+        double tMaxZ = (dz!=0) ? (next_voxel_boundary_z - zo)/dz : DBL_MAX; //
+
+        // tDeltaX, tDeltaY, tDeltaZ --
+        // how far along the ray we must move for the horizontal component to equal the width of a voxel
+        // the direction in which we traverse the grid
+        // can only be FLT_MAX if we never go in that direction
+        const double tDeltaX = (dx!=0) ? stepX/dx : DBL_MAX;
+        const double tDeltaY = (dy!=0) ? stepY/dy : DBL_MAX;
+        const double tDeltaZ = (dz!=0) ? stepZ/dz : DBL_MAX;
+
+        int3 path[MAX_D];
+        double csd[MAX_D];  // cumulative sum of sigma times delta
+        double p[MAX_D];  // alpha
+        double d[MAX_D];
+        double dt[MAX_D];
+
+        // forward raymarching with voxel traversal
+        int step = 0;  // total number of voxels traversed
+        int count = 0;  // number of voxels traversed inside the voxel grid
+        double last_d = 0.0;  // correct initialization
+
+        // voxel traversal raycasting
+        bool was_inside = false;
+        while (true) {
+            bool inside = (0 <= vx && vx < vxsize) &&
+                (0 <= vy && vy < vysize) &&
+                (0 <= vz && vz < vzsize);
+            if (inside) { // now inside
+                was_inside = true;
+                path[count] = make_int3(vx, vy, vz);
+            } else if (was_inside) { // was inside but no longer
+                // we know we are not coming back so terminate
+                break;
+            } else if (last_d > gt_d) {
+                break;
+            } /* else { // has not gone inside yet
+                // assert(count == 0);
+                // (1) when we have hit the destination but haven't gone inside the voxel grid
+                // (2) when we have traveled MAX_D voxels but haven't found one valid voxel
+                //     handle intersection corner cases in case of infinite loop
+                // bool hit = (vx == vxe && vy == vye && vz == vze);
+                // if (hit || step >= MAX_D)
+                //     break;
+                if (last_d >= gt_d || step >= MAX_D) break;
+            } */
+            // _d represents the ray distance has traveled before escaping the current voxel cell
+            double _d = 0.0;
+            // voxel traversal
+            if (tMaxX < tMaxY) {
+                if (tMaxX < tMaxZ) {
+                    _d = tMaxX;
+                    vx += stepX;
+                    tMaxX += tDeltaX;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            } else {
+                if (tMaxY < tMaxZ) {
+                    _d = tMaxY;
+                    vy += stepY;
+                    tMaxY += tDeltaY;
+                } else {
+                    _d = tMaxZ;
+                    vz += stepZ;
+                    tMaxZ += tDeltaZ;
+                }
+            }
+            if (inside) {
+                // get sigma at the current voxel
+                const int3 &v = path[count];  // use the recorded index
+                const double _sigma = sigma[n][ts][v.z][v.y][v.x];
+                const double _delta = max(0.0, _d - last_d);  // THIS TURNS OUT IMPORTANT
+                const double sd = _sigma * _delta;
+                if (count == 0) { // the first voxel inside
+                    csd[count] = sd;
+                    p[count] = 1 - exp(-sd);
+                } else {
+                    csd[count] = csd[count-1] + sd;
+                    p[count] = exp(-csd[count-1]) - exp(-csd[count]);
+                }
+                // record the traveled distance
+                d[count] = _d;
+                dt[count] = _delta;
+                // count the number of voxels we have escaped
+                count ++;
+            }
+            last_d = _d;
+            step ++;
+
+            if (step > MAX_STEP) {
+                break;
+            }
+        }
+
+        // the total number of voxels visited should not exceed this number
+        assert(count <= MAX_D);
+
+        // WHEN THERE IS AN INTERSECTION BETWEEN THE RAY AND THE VOXEL GRID
+        if (count > 0) {
+            // compute the expected ray distance
+            double exp_d = 0.0;
+            for (int i = 0; i < count; i ++)
+                exp_d += p[i] * d[i];
+
+            // add an imaginary sample at the end point should gt_d exceeds max_d
+            double p_out = exp(-csd[count-1]);
+            double max_d = d[count-1];
+
+            exp_d += (p_out * max_d);
+            gt_d = min(gt_d, max_d);
+
+            // write the rendered ray distance (max_d)
+            pred_dist[n][c] = exp_d;
+            gt_dist[n][c] = gt_d;
+
+            /* backward raymarching */
+            double dd_dsigma[MAX_D];
+            for (int i = count - 1; i >= 0; i --) {
+                // NOTE: probably need to double check again
+                if (i == count - 1)
+                    dd_dsigma[i] = p_out * max_d;
+                else
+                    dd_dsigma[i] = dd_dsigma[i+1] - exp(-csd[i]) * (d[i+1] - d[i]);
+            }
+
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] *= dt[i];
+
+            // option 2: cap at the boundary
+            for (int i = count - 1; i >= 0; i --)
+                dd_dsigma[i] -= dt[i] * p_out * max_d;
+
+            double dl_dd = 1.0;
+            if (loss_type == L1)
+                dl_dd = (exp_d >= gt_d) ? 1 : -1;
+            else if (loss_type == L2)
+                dl_dd = (exp_d - gt_d);
+            else if (loss_type == ABSREL)
+                dl_dd = (exp_d >= gt_d) ? (1.0/gt_d) : -(1.0/gt_d);
+
+            // apply chain rule
+            for (int i = 0; i < count; i ++) {
+                const int3 &v = path[i];
+                // NOTE: potential race conditions when writing gradients
+                grad_sigma[n][ts][v.z][v.y][v.x] += dl_dd * dd_dsigma[i];
+                // grad_sigma_count[n][ts][v.z][v.y][v.x] += 1;
+            }
+        }
+    }
+}
+
+/*
+ * Host entry point for differentiable ray rendering: launches
+ * render_cuda_kernel on a grid sized to cover N x M rays and returns the
+ * rendered distances together with the loss gradient w.r.t. sigma.
+ *
+ * input shape
+ *   sigma      : N x T x H x L x W
+ *   origin     : N x T x 3
+ *   points     : N x M x 4
+ *   tindex     : N x M
+ *   loss_name  : "l1", "l2", "absrel" or "bce" ("bce" is mapped to L1 here)
+ * output (returned in this order)
+ *   pred_dist  : N x M, initialised to -1
+ *   gt_dist    : N x M, initialised to -1
+ *   grad_sigma : N x T x H x L x W, initialised to 0
+ *
+ * All tensors are accessed with sigma's scalar type, so origin, points and
+ * tindex must share sigma's dtype. Raises (via TORCH_CHECK) on an unknown
+ * loss name or when the kernel launch / execution reports an error.
+ */
+std::vector<torch::Tensor> render_cuda(
+    torch::Tensor sigma,
+    torch::Tensor origin,
+    torch::Tensor points,
+    torch::Tensor tindex,
+    std::string loss_name) {
+
+    // Resolve the loss first so that a bad name fails before anything is
+    // allocated or launched. Raising instead of calling exit() keeps the host
+    // process (e.g. a Python interpreter) alive and lets callers handle it.
+    LossType loss_type = L1;
+    if (loss_name.compare("l1") == 0) {
+        loss_type = L1;
+    } else if (loss_name.compare("l2") == 0) {
+        loss_type = L2;
+    } else if (loss_name.compare("absrel") == 0) {
+        loss_type = ABSREL;
+    } else if (loss_name.compare("bce") == 0) {
+        // "bce" reuses the L1 gradient path
+        loss_type = L1;
+    } else {
+        TORCH_CHECK(false, "UNKNOWN LOSS TYPE: ", loss_name);
+    }
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    // Outputs must have sigma's dtype: they are accessed below through
+    // packed_accessor32<scalar_t, ...>, where scalar_t is dispatched on sigma.
+    // Allocating them with only a device (default float32) breaks double input.
+    const auto options = sigma.options();
+    auto gt_dist = torch::full({N, M}, -1, options);
+    auto pred_dist = torch::full({N, M}, -1, options);
+    auto grad_sigma = torch::zeros_like(sigma);
+
+    // perform rendering
+    AT_DISPATCH_FLOATING_TYPES(sigma.scalar_type(), "render_cuda", ([&] {
+               hipLaunchKernelGGL(( render_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
+                    sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    origin.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    pred_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    gt_dist.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    grad_sigma.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>(),
+                    loss_type);
+            }));
+
+    // A rejected launch would otherwise silently leave the outputs at their
+    // initial values, so report launch errors first, then execution errors.
+    const hipError_t launch_err = hipGetLastError();
+    TORCH_CHECK(launch_err == hipSuccess,
+                "render_cuda: kernel launch failed: ", hipGetErrorString(launch_err));
+    const hipError_t sync_err = hipDeviceSynchronize();
+    TORCH_CHECK(sync_err == hipSuccess,
+                "render_cuda: kernel execution failed: ", hipGetErrorString(sync_err));
+
+    return {pred_dist, gt_dist, grad_sigma};
+}
+
+
+/*
+ * Host entry point that builds an occupancy grid from a point set by
+ * launching init_cuda_kernel on a grid sized to cover N x M points.
+ *
+ * input shape
+ *   points   : N x M x 3
+ *   tindex   : N x M
+ *   grid     : {T, H, L, W} (only the first four entries are read)
+ * output shape
+ *   occupancy: N x T x H x L x W, same dtype/device as points, starts at 0
+ *
+ * points and tindex are both accessed with points' scalar type, so they must
+ * share a dtype. Raises (via TORCH_CHECK) when grid has fewer than four
+ * entries or when the kernel launch / execution reports an error.
+ */
+torch::Tensor init_cuda(
+    torch::Tensor points,
+    torch::Tensor tindex,
+    const std::vector<int> grid) {
+
+    // grid[0..3] are read below; indexing a shorter vector is undefined behaviour
+    TORCH_CHECK(grid.size() >= 4,
+                "init_cuda: grid must hold {T, H, L, W}, got ", grid.size(), " entries");
+
+    const auto N = points.size(0); // batch size
+    const auto M = points.size(1); // num of rays
+
+    const auto T = grid[0];
+    const auto H = grid[1];
+    const auto L = grid[2];
+    const auto W = grid[3];
+
+    const auto dtype = points.dtype();
+    const auto device = points.device();
+    const auto options = torch::TensorOptions().dtype(dtype).device(device).requires_grad(false);
+    auto occupancy = torch::zeros({N, T, H, L, W}, options);
+
+    const int threads = 1024;
+    const dim3 blocks((M + threads - 1) / threads, N);
+
+    // initialize occupancy such that every voxel with one or more points is occupied
+    AT_DISPATCH_FLOATING_TYPES(points.scalar_type(), "init_cuda", ([&] {
+               hipLaunchKernelGGL(( init_cuda_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, 0,
+                    points.packed_accessor32<scalar_t,3,torch::RestrictPtrTraits>(),
+                    tindex.packed_accessor32<scalar_t,2,torch::RestrictPtrTraits>(),
+                    occupancy.packed_accessor32<scalar_t,5,torch::RestrictPtrTraits>());
+            }));
+
+    // report launch errors first, then wait for the kernel and report
+    // execution errors instead of returning a silently untouched grid
+    const hipError_t launch_err = hipGetLastError();
+    TORCH_CHECK(launch_err == hipSuccess,
+                "init_cuda: kernel launch failed: ", hipGetErrorString(launch_err));
+    const hipError_t sync_err = hipDeviceSynchronize();
+    TORCH_CHECK(sync_err == hipSuccess,
+                "init_cuda: kernel execution failed: ", hipGetErrorString(sync_err));
+
+    return occupancy;
+}
\ No newline at end of file
--- a/projects/__init__.py
+++ b/projects/__init__.py
--- a/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
+++ b/projects/configs/bevdet_occ/bevdet-occ-r50-4d-stereo.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # not referenced again within this file
+# For nuScenes we usually do 10-class detection (the list below has 10 names)
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {  # shared by the PrepareImageInputs steps of both pipelines
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,  # equals the number of entries in 'cams'
+    'input_size': (256, 704),  # presumably (height, width) of the network input - TODO confirm
+    'src_size': (900, 1600),  # presumably (height, width) of the raw camera images - TODO confirm
+
+    # Augmentation parameters (exact semantics are defined by the PrepareImageInputs transform)
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+grid_config = {  # each entry is presumably [lower, upper, step] - TODO confirm against the view transformer
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 0.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]  # not referenced again within this file
+
+numC_Trans = 32  # base channel count; the model section below derives its channel sizes from it
+multi_adj_frame_id_cfg = (1, 1+1, 1)  # unpacked into range() below: range(1, 2, 1) has a single element
+
+
+model = dict(  # model definition; each nested dict configures one sub-module selected by its 'type'
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,  # key spelling kept as-is; presumably it must match the model's argument name
+    num_adj=len(range(*multi_adj_frame_id_cfg)),  # = 1 for (1, 2, 1)
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,  # same value as img_neck out_channels above
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),  # 32 * (1 + 1) = 64
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        num_channels=[numC_Trans, numC_Trans*2, numC_Trans*4],
+        stride=[1, 2, 2],
+        backbone_output_ids=[0, 1, 2]),
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,  # 7 = 1 + 2 + 4, the sum of num_channels above
+                              out_channels=numC_Trans),
+    pre_process=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        with_cp=False,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        num_classes=18,  # the result table at the end of this file lists 17 named classes; presumably +1 for free space - confirm
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data: dataset type, data root and the train/test processing pipelines
+dataset_type = 'NuScenesDatasetOccpancy'  # spelling kept as-is; presumably the registered dataset name - verify before changing
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),  # lower == upper
+    scale_lim=(1., 1.),  # lower == upper
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),  # only present in the training pipeline
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])  # keys handed to the model during training
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),  # is_train is not set here
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])  # no occupancy ground-truth keys at test time
+        ])
+]
+
+
+input_modality = dict(  # only the camera modality is enabled
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+share_data_config = dict(  # settings merged into every split by the loop at the end of this section
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+
+test_data_config = dict(  # used for both 'val' and 'test' below
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,  # same dict object as 'test'
+    test=test_data_config)
+
+for key in ['val', 'train', 'test']:  # copy the shared settings into each split
+    data[key].update(share_data_config)  # 'val' and 'test' alias one dict, so it is updated twice with identical values
+
+# Optimizer, learning-rate schedule, hooks and checkpoint/evaluation settings
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # the only milestone equals max_epochs below - presumably no decay happens during training; confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+
+load_from = "ckpts/bevdet-r50-4d-stereo-cbgs.pth"  # path is relative; matches the ckpts entries in .gitignore
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+
+
+# with_pretrain:
+# align_after_view_transfromation=False
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 8.22
+# ===> barrier - IoU = 44.21
+# ===> bicycle - IoU = 10.34
+# ===> bus - IoU = 42.08
+# ===> car - IoU = 49.63
+# ===> construction_vehicle - IoU = 23.37
+# ===> motorcycle - IoU = 17.41
+# ===> pedestrian - IoU = 21.49
+# ===> traffic_cone - IoU = 19.7
+# ===> trailer - IoU = 31.33
+# ===> truck - IoU = 37.09
+# ===> driveable_surface - IoU = 80.13
+# ===> other_flat - IoU = 37.37
+# ===> sidewalk - IoU = 50.41
+# ===> terrain - IoU = 54.29
+# ===> manmade - IoU = 45.56
+# ===> vegetation - IoU = 39.59
+# ===> mIoU of 6019 samples: 36.01
--- a/projects/configs/bevdet_occ/bevdet-occ-r50.py
+++ b/projects/configs/bevdet_occ/bevdet-occ-r50.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # not referenced again within this file
+# For nuScenes we usually do 10-class detection (the list below has 10 names)
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {  # shared by the PrepareImageInputs steps of both pipelines
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,  # equals the number of entries in 'cams'
+    'input_size': (256, 704),  # presumably (height, width) of the network input - TODO confirm
+    'src_size': (900, 1600),  # presumably (height, width) of the raw camera images - TODO confirm
+
+    # Augmentation parameters (exact semantics are defined by the PrepareImageInputs transform)
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+grid_config = {  # each entry is presumably [lower, upper, step] - TODO confirm against the view transformer
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 0.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]  # not referenced again within this file
+
+numC_Trans = 32  # base channel count; the model section below derives its channel sizes from it
+
+model = dict(  # model definition; each nested dict configures one sub-module selected by its 'type'
+    type='BEVDetOCC',
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,  # same value as img_neck out_channels above
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,  # same value as the view transformer's out_channels
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        num_channels=[numC_Trans, numC_Trans*2, numC_Trans*4],
+        stride=[1, 2, 2],
+        backbone_output_ids=[0, 1, 2]),
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,  # 7 = 1 + 2 + 4, the sum of num_channels above
+                              out_channels=numC_Trans),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        num_classes=18,  # the result tables at the end of this file list 17 named classes; presumably +1 for free space - confirm
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data: dataset type, data root and the train/test processing pipelines
+dataset_type = 'NuScenesDatasetOccpancy'  # spelling kept as-is; presumably the registered dataset name - verify before changing
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),  # lower == upper
+    scale_lim=(1., 1.),  # lower == upper
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),  # only present in the training pipeline
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar', 'mask_camera'])  # keys handed to the model during training
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),  # is_train is not set here
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])  # no occupancy ground-truth keys at test time
+        ])
+]
+
+
+input_modality = dict(  # only the camera modality is enabled
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+share_data_config = dict(  # settings merged into every split by the loop at the end of this section
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,  # NOTE(review): set although this config uses the non-stereo LSSViewTransformer - confirm it is intended
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+
+test_data_config = dict(  # used for both 'val' and 'test' below
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,  # same dict object as 'test'
+    test=test_data_config)
+
+for key in ['val', 'train', 'test']:  # copy the shared settings into each split
+    data[key].update(share_data_config)  # 'val' and 'test' alias one dict, so it is updated twice with identical values
+
+# Optimizer, learning-rate schedule, hooks and checkpoint/evaluation settings
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # the only milestone equals max_epochs below - presumably no decay happens during training; confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+
+load_from = "ckpts/bevdet-r50-cbgs.pth"  # path is relative; matches the ckpts entries in .gitignore
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+
+
+# with pretrain
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 6.65
+# ===> barrier - IoU = 36.97
+# ===> bicycle - IoU = 8.33
+# ===> bus - IoU = 38.69
+# ===> car - IoU = 44.46
+# ===> construction_vehicle - IoU = 15.21
+# ===> motorcycle - IoU = 13.67
+# ===> pedestrian - IoU = 16.39
+# ===> traffic_cone - IoU = 15.27
+# ===> trailer - IoU = 27.11
+# ===> truck - IoU = 31.04
+# ===> driveable_surface - IoU = 78.7
+# ===> other_flat - IoU = 36.45
+# ===> sidewalk - IoU = 48.27
+# ===> terrain - IoU = 51.68
+# ===> manmade - IoU = 36.82
+# ===> vegetation - IoU = 32.09
+# ===> mIoU of 6019 samples: 31.64
+
+
+# with det pretrain; use_mask=False; class_balance=True
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 4.36
+# ===> barrier - IoU = 28.87
+# ===> bicycle - IoU = 2.86
+# ===> bus - IoU = 29.27
+# ===> car - IoU = 32.45
+# ===> construction_vehicle - IoU = 11.05
+# ===> motorcycle - IoU = 12.82
+# ===> pedestrian - IoU = 10.11
+# ===> traffic_cone - IoU = 9.47
+# ===> trailer - IoU = 7.93
+# ===> truck - IoU = 21.58
+# ===> driveable_surface - IoU = 49.85
+# ===> other_flat - IoU = 25.5
+# ===> sidewalk - IoU = 26.78
+# ===> terrain - IoU = 21.14
+# ===> manmade - IoU = 5.76
+# ===> vegetation - IoU = 7.09
+# ===> mIoU of 6019 samples: 18.05
\ No newline at end of file
--- a/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
+++ b/projects/configs/bevdet_occ/bevdet-occ-stbase-4d-stereo-512x1408.py
+# Copyright (c) Phigent Robotics. All rights reserved.
+# align_after_view_transfromation=True
+
+# align_after_view_transfromation=False
+# 1x/12epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 10.12
+# ===> barrier - IoU = 48.06
+# ===> bicycle - IoU = 0.0
+# ===> bus - IoU = 51.19
+# ===> car - IoU = 53.61
+# ===> construction_vehicle - IoU = 27.15
+# ===> motorcycle - IoU = 2.74
+# ===> pedestrian - IoU = 28.3
+# ===> traffic_cone - IoU = 23.33
+# ===> trailer - IoU = 36.24
+# ===> truck - IoU = 42.13
+# ===> driveable_surface - IoU = 81.77
+# ===> other_flat - IoU = 42.43
+# ===> sidewalk - IoU = 53.67
+# ===> terrain - IoU = 57.31
+# ===> manmade - IoU = 48.27
+# ===> vegetation - IoU = 43.31
+# ===> mIoU of 6019 samples: 38.21
+
+# 2x/24epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 12.15
+# ===> barrier - IoU = 49.63
+# ===> bicycle - IoU = 25.1
+# ===> bus - IoU = 52.02
+# ===> car - IoU = 54.46
+# ===> construction_vehicle - IoU = 27.87
+# ===> motorcycle - IoU = 27.99
+# ===> pedestrian - IoU = 28.94
+# ===> traffic_cone - IoU = 27.23
+# ===> trailer - IoU = 36.43
+# ===> truck - IoU = 42.22
+# ===> driveable_surface - IoU = 82.31
+# ===> other_flat - IoU = 43.29
+# ===> sidewalk - IoU = 54.62
+# ===> terrain - IoU = 57.9
+# ===> manmade - IoU = 48.61
+# ===> vegetation - IoU = 43.55
+# ===> mIoU of 6019 samples: 42.02
+
+# 3x/36epoch
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 12.37
+# ===> barrier - IoU = 50.15
+# ===> bicycle - IoU = 26.97
+# ===> bus - IoU = 51.86
+# ===> car - IoU = 54.65
+# ===> construction_vehicle - IoU = 28.38
+# ===> motorcycle - IoU = 28.96
+# ===> pedestrian - IoU = 29.02
+# ===> traffic_cone - IoU = 28.28
+# ===> trailer - IoU = 37.05
+# ===> truck - IoU = 42.52
+# ===> driveable_surface - IoU = 82.55
+# ===> other_flat - IoU = 43.15
+# ===> sidewalk - IoU = 54.87
+# ===> terrain - IoU = 58.33
+# ===> manmade - IoU = 48.78
+# ===> vegetation - IoU = 43.79
+# ===> mIoU of 6019 samples: 42.45
+
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+
+# For nuScenes we usually do 10-class detection (the list below has 10 names)
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {  # shared by the PrepareImageInputs steps of both pipelines
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,  # equals the number of entries in 'cams'
+    'input_size': (512, 1408),  # presumably (height, width) of the network input - TODO confirm
+    'src_size': (900, 1600),  # presumably (height, width) of the raw camera images - TODO confirm
+
+    # Augmentation parameters (exact semantics are defined by the PrepareImageInputs transform)
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+# Model: grid definition and channel settings consumed by the model dict below
+grid_config = {  # each entry is presumably [lower, upper, step] - TODO confirm against the view transformer
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 0.4],
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]  # not referenced again within this file
+
+numC_Trans = 32  # base channel count; the model section below derives its channel sizes from it
+
+multi_adj_frame_id_cfg = (1, 1+1, 1)  # unpacked into range() below: range(1, 2, 1) has a single element
+
+model = dict(  # model definition; each nested dict configures one sub-module selected by its 'type'
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,  # key spelling kept as-is; presumably it must match the model's argument name
+    num_adj=len(range(*multi_adj_frame_id_cfg)),  # = 1 for (1, 2, 1)
+    img_backbone=dict(
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        patch_size=4,
+        window_size=12,
+        mlp_ratio=4,
+        embed_dims=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        strides=(4, 2, 2, 2),
+        out_indices=(2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.1,
+        use_abs_pos_embed=False,
+        return_stereo_feat=True,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=dict(type='LN', requires_grad=True),
+        pretrain_style='official',
+        output_missing_index_as_none=False),
+    img_neck=dict(
+        type='FPN_LSS',
+        in_channels=512 + 1024,  # = 1536; presumably the widths of the two stages picked by out_indices=(2, 3) - confirm
+        out_channels=512,
+        # with_cp=False,
+        extra_upsample=None,
+        input_feature_index=(0, 1),
+        scale_factor=2),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=512,  # same value as img_neck out_channels above
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=False,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),  # 32 * (1 + 1) = 64
+        num_layer=[1, 2, 4],
+        with_cp=False,
+        num_channels=[numC_Trans,numC_Trans*2,numC_Trans*4],
+        stride=[1,2,2],
+        backbone_output_ids=[0,1,2]),
+    img_bev_encoder_neck=dict(type='LSSFPN3D',
+                              in_channels=numC_Trans*7,  # 7 = 1 + 2 + 4, the sum of num_channels above
+                              out_channels=numC_Trans),
+    pre_process=dict(
+        type='CustomResNet3D',
+        numC_input=numC_Trans,
+        with_cp=False,
+        num_layer=[1,],
+        num_channels=[numC_Trans,],
+        stride=[1,],
+        backbone_output_ids=[0,]),
+    occ_head=dict(
+        type='BEVOCCHead3D',
+        in_dim=numC_Trans,
+        out_dim=32,
+        use_mask=True,
+        num_classes=18,  # the result tables at the top of this file list 17 named classes; presumably +1 for free space - confirm
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data: dataset type, data root and the train/test processing pipelines
+dataset_type = 'NuScenesDatasetOccpancy'  # spelling kept as-is; presumably the registered dataset name - verify before changing
+data_root = 'data/nuscenes/'
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(
+    rot_lim=(-0., 0.),  # lower == upper
+    scale_lim=(1., 1.),  # lower == upper
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),  # only present in the training pipeline
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',
+                                'mask_lidar','mask_camera'])  # keys handed to the model during training
+]
+
+test_pipeline = [
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),  # is_train is not set here
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])  # no occupancy ground-truth keys at test time
+        ])
+]
+
+input_modality = dict(  # only the camera modality is enabled
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+share_data_config = dict(  # merged into every split below; NOTE(review): carries no data_root, so val/test rely on the _base_ dataset config for it - confirm
+    type=dataset_type,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+
+test_data_config = dict(  # used for both 'val' and 'test' below
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=1,  # with 32 GPU
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,  # same dict object as 'test'
+    test=test_data_config)
+
+for key in ['val', 'train', 'test']:  # copy the shared settings into each split
+    data[key].update(share_data_config)  # 'val' and 'test' alias one dict, so it is updated twice with identical values
+
+# Optimizer, learning-rate schedule, hooks and the initial checkpoint
+optimizer = dict(type='AdamW', lr=2e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24,])  # the only milestone equals max_epochs below - presumably no decay happens during training; confirm
+runner = dict(type='EpochBasedRunner', max_epochs=24)  # the header of this file also reports 12- and 36-epoch results
+
+custom_hooks = [
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+    dict(
+        type='SyncbnControlHook',
+        syncbn_start_epoch=0,
+    ),
+]
+
+load_from="ckpts/bevdet-stbase-4d-stereo-512x1408-cbgs.pth"  # path is relative; matches the ckpts entries in .gitignore
+# fp16 = dict(loss_scale='dynamic')
--- a/projects/configs/flashocc/flashocc-r50-4d-stereo.py
+++ b/projects/configs/flashocc/flashocc-r50-4d-stereo.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {  # camera list, image sizes and image-level augmentation ranges; passed to PrepareImageInputs below
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams': 6,  # equals the number of entries in 'cams'
+    'input_size': (256, 704),  # presumably (height, width) -- TODO confirm
+    'src_size': (900, 1600),  # presumably (height, width) of the raw images -- TODO confirm
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+grid_config = {  # each entry is presumably [lower bound, upper bound, interval] -- TODO confirm
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],  # third value equals upper - lower (5.4 - (-1) = 6.4)
+    'depth': [1.0, 45.0, 0.5],
+}
+
+voxel_size = [0.1, 0.1, 0.2]  # not referenced elsewhere in this config file
+
+numC_Trans = 80  # channel count reused by the view transformer, BEV encoder and pre_process configs below
+multi_adj_frame_id_cfg = (1, 1+1, 1)  # unpacked into range() below: range(1, 2, 1) yields the single value 1
+
+
+
+model = dict(  # model config; 'type' selects BEVStereo4DOCC
+    type='BEVStereo4DOCC',
+    align_after_view_transfromation=False,  # NOTE(review): spelled "transfromation"; presumably must match the model's argument name -- confirm before renaming
+    num_adj=len(range(*multi_adj_frame_id_cfg)),  # len(range(1, 2, 1)) == 1
+    img_backbone=dict(
+        pretrained='torchvision://resnet50',
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 2, 3),  # NOTE(review): three outputs requested while img_neck lists two in_channels -- presumably one output is used elsewhere; confirm
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch'),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformerBEVStereo',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,  # same value as img_neck out_channels
+        out_channels=numC_Trans,
+        sid=True,
+        loss_depth_weight=0.05,
+        depthnet_cfg=dict(use_dcn=False,
+                          aspp_mid_channels=96,
+                          stereo=True,
+                          bias=5.),
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans * (len(range(*multi_adj_frame_id_cfg))+1),  # 80 * (1 + 1) = 160
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),  # [160, 320, 640]
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,  # 640 + 160 = 800
+        out_channels=256),
+    pre_process=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,
+        num_layer=[1, ],
+        num_channels=[numC_Trans, ],
+        stride=[1, ],
+        backbone_output_ids=[0, ]),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=256,  # same value as img_bev_encoder_neck out_channels
+        out_dim=256,
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'  # NOTE(review): spelled "Occpancy"; presumably the registered dataset name -- confirm before renaming
+data_root = 'data/nuscenes/'  # prefix for the annotation file paths built below
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(  # rotation and scale limits are degenerate ranges here; only the two flip ratios are non-trivial
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5)
+
+train_pipeline = [  # ordered list of transform configs for the training split
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',  # keys collected for training
+                                'mask_lidar', 'mask_camera'])
+]
+
+test_pipeline = [  # ordered list of transform configs for val/test; also passed to 'evaluation' below
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=True),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])  # keys collected at test time
+        ])
+]
+
+
+input_modality = dict(  # only the camera modality is enabled
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+share_data_config = dict(  # settings merged into every split by the loop at the end of this section
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=True,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet4d',
+    multi_adj_frame_id_cfg=multi_adj_frame_id_cfg,
+)
+
+test_data_config = dict(  # a single dict object, used below for both 'val' and 'test'
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,  # also set (to the same value) by share_data_config
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,  # also set (to the same value) by share_data_config
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+
+for key in ['val', 'train', 'test']:  # 'val' and 'test' alias one dict, so it is updated twice with identical values
+    data[key].update(share_data_config)
+
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))  # gradient clipping: max_norm=5 with norm_type=2
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # NOTE(review): the only milestone equals max_epochs below -- confirm that no LR decay is intended
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+custom_hooks = [  # extra hook configs; each entry names its hook through 'type'
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+
+load_from = "./ckpts/bevdet-r50-4d-stereo-cbgs.pth"  # relative checkpoint path; 'ckpts' is git-ignored, so the file must be provided locally
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)  # reuses test_pipeline defined above
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+
+
+# with_pretrain:
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 9.08
+# ===> barrier - IoU = 46.32
+# ===> bicycle - IoU = 17.71
+# ===> bus - IoU = 42.7
+# ===> car - IoU = 50.64
+# ===> construction_vehicle - IoU = 23.72
+# ===> motorcycle - IoU = 20.13
+# ===> pedestrian - IoU = 22.34
+# ===> traffic_cone - IoU = 24.09
+# ===> trailer - IoU = 30.26
+# ===> truck - IoU = 37.39
+# ===> driveable_surface - IoU = 81.68
+# ===> other_flat - IoU = 40.13
+# ===> sidewalk - IoU = 52.34
+# ===> terrain - IoU = 56.46
+# ===> manmade - IoU = 47.69
+# ===> vegetation - IoU = 40.6
+# ===> mIoU of 6019 samples: 37.84
--- a/projects/configs/flashocc/flashocc-r50-M0-trt.py
+++ b/projects/configs/flashocc/flashocc-r50-M0-trt.py
+_base_ = ['./flashocc-r50-M0.py',  # start from the sibling flashocc-r50-M0 config
+          ]
+
+model = dict(  # only these two model flags are set here on top of the base config
+    wocc=True,
+    wdet3d=False,
+)
--- a/projects/configs/flashocc/flashocc-r50-M0.py
+++ b/projects/configs/flashocc/flashocc-r50-M0.py
+_base_ = ['../../../mmdetection3d/configs/_base_/datasets/nus-3d.py',  # base configs; paths point into an 'mmdetection3d' directory that is git-ignored in this repo
+          '../../../mmdetection3d/configs/_base_/default_runtime.py']
+
+plugin = True
+plugin_dir = 'projects/mmdet3d_plugin/'
+point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]  # presumably [x_min, y_min, z_min, x_max, y_max, z_max] -- TODO confirm
+# For nuScenes we usually do 10-class detection
+class_names = [  # ten class names; passed to the pipelines and dataset configs below
+    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
+    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
+]
+
+data_config = {  # camera list, image sizes and image-level augmentation ranges; passed to PrepareImageInputs below
+    'cams': [
+        'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT',
+        'CAM_BACK', 'CAM_BACK_RIGHT'
+    ],
+    'Ncams':
+    6,  # equals the number of entries in 'cams'
+    'input_size': (256, 704),  # presumably (height, width) -- TODO confirm
+    'src_size': (900, 1600),  # presumably (height, width) of the raw images -- TODO confirm
+
+    # Augmentation
+    'resize': (-0.06, 0.11),
+    'rot': (-5.4, 5.4),
+    'flip': True,
+    'crop_h': (0.0, 0.0),
+    'resize_test': 0.00,
+}
+
+grid_config = {  # each entry is presumably [lower bound, upper bound, interval] -- TODO confirm
+    'x': [-40, 40, 0.4],
+    'y': [-40, 40, 0.4],
+    'z': [-1, 5.4, 6.4],  # third value equals upper - lower (5.4 - (-1) = 6.4)
+    'depth': [1.0, 45.0, 1.0],
+}
+
+voxel_size = [0.1, 0.1, 0.2]  # not referenced elsewhere in this config file
+
+numC_Trans = 64  # channel count reused by the view transformer and BEV encoder configs below
+
+model = dict(  # model config; 'type' selects BEVDetOCC
+    type='BEVDetOCC',
+    img_backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(2, 3),  # two outputs, matching the two img_neck in_channels entries
+        frozen_stages=-1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=False,
+        with_cp=True,
+        style='pytorch',
+        pretrained='torchvision://resnet50',
+    ),
+    img_neck=dict(
+        type='CustomFPN',
+        in_channels=[1024, 2048],
+        out_channels=256,
+        num_outs=1,
+        start_level=0,
+        out_ids=[0]),
+    img_view_transformer=dict(
+        type='LSSViewTransformer',
+        grid_config=grid_config,
+        input_size=data_config['input_size'],
+        in_channels=256,  # same value as img_neck out_channels
+        out_channels=numC_Trans,
+        sid=False,
+        collapse_z=True,
+        downsample=16),
+    img_bev_encoder_backbone=dict(
+        type='CustomResNet',
+        numC_input=numC_Trans,  # 64
+        num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]),  # [128, 256, 512]
+    img_bev_encoder_neck=dict(
+        type='FPN_LSS',
+        in_channels=numC_Trans * 8 + numC_Trans * 2,  # 512 + 128 = 640
+        out_channels=128),
+    occ_head=dict(
+        type='BEVOCCHead2D',
+        in_dim=128,  # same value as img_bev_encoder_neck out_channels
+        out_dim=128,
+        Dz=16,
+        use_mask=True,
+        num_classes=18,
+        use_predicter=True,
+        class_balance=False,
+        loss_occ=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            ignore_index=255,
+            loss_weight=1.0
+        ),
+    )
+)
+
+# Data
+dataset_type = 'NuScenesDatasetOccpancy'  # NOTE(review): spelled "Occpancy"; presumably the registered dataset name -- confirm before renaming
+data_root = 'data/nuscenes/'  # prefix for the annotation file paths built below
+file_client_args = dict(backend='disk')
+
+bda_aug_conf = dict(  # rotation and scale limits are degenerate ranges here; only the two flip ratios are non-trivial
+    rot_lim=(-0., 0.),
+    scale_lim=(1., 1.),
+    flip_dx_ratio=0.5,
+    flip_dy_ratio=0.5
+)
+
+train_pipeline = [  # ordered list of transform configs for the training split
+    dict(
+        type='PrepareImageInputs',
+        is_train=True,
+        data_config=data_config,
+        sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=True),
+    dict(type='LoadOccGTFromFile'),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config),
+    dict(type='DefaultFormatBundle3D', class_names=class_names),
+    dict(
+        type='Collect3D', keys=['img_inputs', 'gt_depth', 'voxel_semantics',  # keys collected for training
+                                'mask_lidar', 'mask_camera'])
+]
+
+test_pipeline = [  # ordered list of transform configs for val/test; also passed to 'evaluation' below
+    dict(type='PrepareImageInputs', data_config=data_config, sequential=False),
+    dict(
+        type='LoadAnnotationsBEVDepth',
+        bda_aug_conf=bda_aug_conf,
+        classes=class_names,
+        is_train=False),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        file_client_args=file_client_args),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='DefaultFormatBundle3D',
+                class_names=class_names,
+                with_label=False),
+            dict(type='Collect3D', keys=['points', 'img_inputs'])  # keys collected at test time
+        ])
+]
+
+
+input_modality = dict(  # only the camera modality is enabled
+    use_lidar=False,
+    use_camera=True,
+    use_radar=False,
+    use_map=False,
+    use_external=False)
+
+share_data_config = dict(  # settings merged into every split by the loop at the end of this section
+    type=dataset_type,
+    data_root=data_root,
+    classes=class_names,
+    modality=input_modality,
+    stereo=False,
+    filter_empty_gt=False,
+    img_info_prototype='bevdet',
+)
+
+test_data_config = dict(  # a single dict object, used below for both 'val' and 'test'
+    pipeline=test_pipeline,
+    ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl')
+
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        data_root=data_root,  # also set (to the same value) by share_data_config
+        ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl',
+        pipeline=train_pipeline,
+        classes=class_names,  # also set (to the same value) by share_data_config
+        test_mode=False,
+        use_valid_flag=True,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR'),
+    val=test_data_config,
+    test=test_data_config)
+
+for key in ['val', 'train', 'test']:  # 'val' and 'test' alias one dict, so it is updated twice with identical values
+    data[key].update(share_data_config)
+
+# Optimizer
+optimizer = dict(type='AdamW', lr=1e-4, weight_decay=1e-2)
+optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2))  # gradient clipping: max_norm=5 with norm_type=2
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=200,
+    warmup_ratio=0.001,
+    step=[24, ])  # NOTE(review): the only milestone equals max_epochs below -- confirm that no LR decay is intended
+runner = dict(type='EpochBasedRunner', max_epochs=24)
+
+custom_hooks = [  # extra hook configs; each entry names its hook through 'type'
+    dict(
+        type='MEGVIIEMAHook',
+        init_updates=10560,
+        priority='NORMAL',
+    ),
+]
+
+load_from = "ckpts/bevdet-r50-cbgs.pth"  # relative checkpoint path; 'ckpts' is git-ignored, so the file must be provided locally
+# fp16 = dict(loss_scale='dynamic')
+evaluation = dict(interval=1, start=20, pipeline=test_pipeline)  # reuses test_pipeline defined above
+checkpoint_config = dict(interval=1, max_keep_ckpts=5)
+
+
+# with det pretrain; use_mask=True; out_dim=256 (NOTE: occ_head in this config sets out_dim=128 -- confirm which setting produced these numbers)
+# ===> per class IoU of 6019 samples:
+# ===> others - IoU = 6.21
+# ===> barrier - IoU = 39.56
+# ===> bicycle - IoU = 11.27
+# ===> bus - IoU = 36.31
+# ===> car - IoU = 43.96
+# ===> construction_vehicle - IoU = 16.25
+# ===> motorcycle - IoU = 14.74
+# ===> pedestrian - IoU = 16.89
+# ===> traffic_cone - IoU = 15.76
+# ===> trailer - IoU = 28.56
+# ===> truck - IoU = 30.91
+# ===> driveable_surface - IoU = 78.16
+# ===> other_flat - IoU = 37.52
+# ===> sidewalk - IoU = 47.42
+# ===> terrain - IoU = 51.35
+# ===> manmade - IoU = 36.79
+# ===> vegetation - IoU = 31.42
+# ===> mIoU of 6019 samples: 31.95
+# {'mIoU': array([0.06207982, 0.39564533, 0.11270112, 0.36311426, 0.43955401,
+#        0.16252583, 0.14739984, 0.16885096, 0.15757262, 0.28564777,
+#        0.30909029, 0.7815907 , 0.37523904, 0.47420705, 0.51351759,
+#        0.36789645, 0.31420157, 0.87802724])}
--- a/projects/configs/flashocc/flashocc-r50-trt.py
+++ b/projects/configs/flashocc/flashocc-r50-trt.py
+_base_ = ['./flashocc-r50.py',  # start from the sibling flashocc-r50 config
+          ]
+
+model = dict(  # only these two model flags are set here on top of the base config
+    wocc=True,
+    wdet3d=False,
+)