Adjust the file structure.

Signed-off-by: lijian <lijian6@sugon.com>

Adjust the file structure.
Signed-off-by: lijian <lijian6@sugon.com>
447238de · lijian6 · c1d9c169 · 447238de · 447238de · c1d9c169
Commit 447238de authored Oct 21, 2025 by lijian6
18 changed files
--- a/build.sh
+++ b/build.sh
@@ -8,12 +8,12 @@ fi
 PYTHON_INCLUDE=$(python3 -c "from sysconfig import get_paths; print(get_paths()['include'])")
 PYTHON_PLATLIB=$(python3 -c "from sysconfig import get_paths; print(get_paths()['platlib'])")
-/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/intranode.hip -o build_/intranode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
+/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/intranode.cu -o build_/intranode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/runtime.hip -o build_/runtime.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
+/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/runtime.cu -o build_/runtime.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/layout.cu -o build_/layout.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
+/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/layout.cu -o build_/layout.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/deep_ep.hip -o build_/deep_ep.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
+/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/deep_ep.cu -o build_/deep_ep.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/internode.hip -o build_/internode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
+/opt/dtk/bin/hipcc  -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/internode.cu -o build_/internode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o -L$(pwd)/rocshmem_dir/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 -shared -Wl,-soname,deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,$(pwd)/rocshmem_dir/lib/ -L"/opt/dtk/llvm/lib/clang/15.0.0/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25211.1469-8d6b0397 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so.1.11.0 -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 -l:librocshmem.a -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/lib/ -libverbs -lmlx5
+hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o -L$(pwd)/rocshmem_dir/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 -shared -Wl,-soname,deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,$(pwd)/rocshmem_dir/lib/ -L"/opt/dtk/llvm/lib/clang/15.0.0/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so.1.11.0 -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 -l:librocshmem.a -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/opt/mpi/lib/ -libverbs -lmlx5
 # build whl
 echo "Using Python: $(which python3)"
@@ -21,10 +21,3 @@ python3 --version
 python setup.py bdist_wheel
 echo "✅ Build complete:"
 ls -lh dist/
-# /opt/dtk/bin/hipcc  -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/intranode.hip -o build_/intranode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-# /opt/dtk/bin/hipcc  -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/runtime.hip -o build_/runtime.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-# /opt/dtk/bin/hipcc  -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/layout.cu -o build_/layout.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-# /opt/dtk/bin/hipcc  -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/deep_ep.hip -o build_/deep_ep.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-# /opt/dtk/bin/hipcc  -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/internode.hip -o build_/internode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
-# hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o -L/work/Tmp/DeepEP/rocshmem_dir/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o aaa.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 -shared -Wl,-soname,aaa.so -Wl,-rpath,/work/Tmp/DeepEP/rocshmem_dir/lib/ -L"/opt/dtk/llvm/lib/clang/15.0.0/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25211.1469-8d6b0397 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so.1.11.0 -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 -l:librocshmem.a -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/lib/ -libverbs -lmlx5
--- a/csrc/config.hpp
+++ b/csrc/config.hpp
 #pragma once
-#include "./kernels/api.cuh"
+#include "kernels/api.cuh"
-#include "./kernels/configs.cuh"
+#include "kernels/configs.cuh"
 #include "kernels/exception.cuh"
 namespace deep_ep {

--- a/csrc/config_hip.hpp
+++ b/csrc/config_hip.hpp
-// !!! This is a file automatically generated by hipify!!!
-#include <ATen/dtk_macros.h>
-#pragma once
-#include "kernels/api.cuh"
-#include "kernels/configs.cuh"
-#include "kernels/exception.cuh"
-namespace deep_ep {
-struct Config {
-    int num_sms;
-    int num_max_nvl_chunked_send_tokens;
-    int num_max_nvl_chunked_recv_tokens;
-    int num_max_rdma_chunked_send_tokens;
-    int num_max_rdma_chunked_recv_tokens;
-    Config(int num_sms, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
-           int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens)
-        : num_sms(num_sms), num_max_nvl_chunked_send_tokens(num_max_nvl_chunked_send_tokens),
-          num_max_nvl_chunked_recv_tokens(num_max_nvl_chunked_recv_tokens),
-          num_max_rdma_chunked_send_tokens(num_max_rdma_chunked_send_tokens),
-          num_max_rdma_chunked_recv_tokens(num_max_rdma_chunked_recv_tokens) {
-        EP_HOST_ASSERT(num_sms >= 0);
-        EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens > 0 and
-                           num_max_nvl_chunked_recv_tokens > 0);
-        EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens < num_max_nvl_chunked_recv_tokens);
-        EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens > 0 and
-                           num_max_rdma_chunked_recv_tokens > 0);
-        // Ceil up RDMA buffer size
-        this->num_max_rdma_chunked_recv_tokens =
-            ALIGN<int>(num_max_rdma_chunked_recv_tokens, num_max_rdma_chunked_send_tokens);
-        EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens < num_max_rdma_chunked_recv_tokens);
-        // NOTES: this assertion is related to RDMA lazy head update, we must ensure senders always
-        // have space to push
-        EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens <=
-                           num_max_rdma_chunked_recv_tokens / 2);
-    }
-    size_t get_nvl_buffer_size_hint(size_t hidden_bytes, int num_ranks) const {
-        // Below are some assumptions
-        // TODO: add assertions
-        constexpr int kNumMaxTopK   = 128;
-        constexpr int kNumMaxScales = 128;
-        EP_HOST_ASSERT(num_ranks < NUM_MAX_NVL_PEERS or num_ranks % NUM_MAX_NVL_PEERS == 0);
-        EP_HOST_ASSERT(num_ranks <= NUM_MAX_NVL_PEERS or num_sms % 2 == 0);
-        const auto num_rdma_ranks = std::max(num_ranks / NUM_MAX_NVL_PEERS, 1);
-        const auto num_nvl_ranks  = std::min(num_ranks, NUM_MAX_NVL_PEERS);
-        const int  num_channels   = num_sms / 2;
-        size_t num_bytes = 0;
-        num_bytes += num_channels * num_nvl_ranks * (2 * num_rdma_ranks + 3) * sizeof(int);
-        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * hidden_bytes;
-#ifndef DISABLE_ROCSHMEM
-        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens *
-                     internode::get_source_meta_bytes();
-#endif
-        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK *
-                     sizeof(int64_t);
-        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK *
-                     sizeof(float);
-        num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens *
-                     kNumMaxScales * sizeof(float);
-        num_bytes = ((num_bytes + 127) / 128) * 128;
-        return num_bytes;
-    }
-    size_t get_rdma_buffer_size_hint(int64_t hidden_bytes, int num_ranks) const {
-#ifndef DISABLE_ROCSHMEM
-        // Legacy mode
-        if (num_ranks <= NUM_MAX_NVL_PEERS)
-            return 0;
-        // Below are some assumptions
-        // TODO: add assertions
-        constexpr int kNumMaxTopK   = 128;
-        constexpr int kNumMaxScales = 128;
-        EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
-        EP_HOST_ASSERT(num_sms % 2 == 0);
-        const int num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
-        const int num_channels   = num_sms / 2;
-        size_t num_bytes = 0;
-        num_bytes += num_channels * num_rdma_ranks * (NUM_MAX_NVL_PEERS * 2 + 2) * 2 * sizeof(int);
-        num_bytes +=
-            num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * hidden_bytes * 2;
-        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
-                     internode::get_source_meta_bytes() * 2;
-        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
-                     kNumMaxTopK * sizeof(int64_t) * 2;
-        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
-                     kNumMaxTopK * sizeof(float) * 2;
-        num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
-                     kNumMaxScales * sizeof(float) * 2;
-        num_bytes +=
-            num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * sizeof(int4) * 2;
-        num_bytes = ((num_bytes + 127) / 128) * 128;
-        return num_bytes;
-#else
-        EP_HOST_ASSERT(false and "rocSHMEM is disabled during compilation, please install "
-                                  "rocSHMEM by following docs/install_dependencies.md");
-#endif
-    }
-};
-struct LowLatencyBuffer {
-    int num_clean_int = 0;
-    void *dispatch_rdma_send_buffer       = nullptr;
-    void *dispatch_rdma_recv_data_buffer  = nullptr;
-    int  *dispatch_rdma_recv_count_buffer = nullptr;
-    void *combine_rdma_send_buffer      = nullptr;
-    void *combine_rdma_recv_data_buffer = nullptr;
-    int  *combine_rdma_recv_flag_buffer = nullptr;
-    void  *combine_rdma_send_buffer_data_start = nullptr;
-    size_t num_bytes_per_combine_msg           = 0;
-    std::pair<int *, int> clean_meta() {
-        EP_HOST_ASSERT(dispatch_rdma_recv_count_buffer == combine_rdma_recv_flag_buffer);
-        return {dispatch_rdma_recv_count_buffer, num_clean_int};
-    }
-};
-struct LowLatencyLayout {
-    size_t           total_bytes = 0;
-    LowLatencyBuffer buffers[2];
-    template <typename out_ptr_t = void *, typename count_ptr_t = uint8_t *,
-              typename in_ptr_t = void *>
-    out_ptr_t advance(const in_ptr_t &ptr, size_t count) {
-        return reinterpret_cast<out_ptr_t>(reinterpret_cast<count_ptr_t>(ptr) + count);
-    }
-    LowLatencyLayout(void *rdma_buffer, int num_max_dispatch_tokens_per_rank, int hidden,
-                     int num_ranks, int num_experts) {
-        const int num_scales = hidden / 128;
-        // Dispatch and combine layout:
-        //  - 2 symmetric odd/even send buffer
-        //  - 2 symmetric odd/even receive buffers
-        //  - 2 symmetric odd/even signaling buffers
-        // Message sizes
-        // NOTES: you should add a control `int4` for combine messages if you want to do data
-        // transformation
-        EP_HOST_ASSERT(num_scales * sizeof(float) <= static_cast<size_t>(hidden));
-        size_t num_bytes_per_dispatch_msg =
-            sizeof(int4) +
-            std::max(hidden * sizeof(hip_bfloat16), hidden + num_scales * sizeof(float));
-        size_t num_bytes_per_combine_msg = hidden * sizeof(hip_bfloat16);
-        // Send buffer
-        size_t dispatch_send_buffer_bytes =
-            num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
-        size_t combine_send_buffer_bytes =
-            num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
-        size_t send_buffer_bytes = std::max(dispatch_send_buffer_bytes, combine_send_buffer_bytes);
-        EP_HOST_ASSERT(send_buffer_bytes % sizeof(int4) == 0);
-        total_bytes += send_buffer_bytes * 2;
-        // Symmetric receive buffers
-        // TODO: optimize memory usages
-        size_t dispatch_recv_data_buffer_bytes =
-            num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
-        size_t combine_recv_buffer_bytes =
-            num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
-        size_t recv_buffer_bytes =
-            std::max(dispatch_recv_data_buffer_bytes, combine_recv_buffer_bytes);
-        EP_HOST_ASSERT(recv_buffer_bytes % sizeof(int4) == 0);
-        total_bytes += recv_buffer_bytes * 2;
-        // Symmetric signaling buffers
-        size_t dispatch_recv_count_buffer_bytes = num_experts * sizeof(int);
-        size_t combine_recv_flag_buffer_bytes   = dispatch_recv_count_buffer_bytes;
-        size_t signaling_buffer_bytes =
-            std::max(dispatch_recv_count_buffer_bytes, combine_recv_flag_buffer_bytes);
-        size_t signaling_buffer_bytes_aligned = ALIGN<size_t>(signaling_buffer_bytes, 128);
-        total_bytes += signaling_buffer_bytes_aligned * 2;
-        // Assign pointers
-        // NOTES: we still leave some space for distinguishing dispatch/combine buffer,
-        // so you may see some parameters are duplicated
-        for (int i = 0; i < 2; ++i) {
-            buffers[i] = {
-                static_cast<int>(signaling_buffer_bytes / sizeof(int)),
-                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),
-                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * 2 +
-                                         recv_buffer_bytes * i),
-                advance<int *>(rdma_buffer, signaling_buffer_bytes_aligned * i),
-                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),
-                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * 2 +
-                                         recv_buffer_bytes * i),
-                advance<int *>(rdma_buffer, signaling_buffer_bytes_aligned * i),
-                advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),
-                num_bytes_per_combine_msg};
-        }
-    }
-};
-inline size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden,
-                                             int num_ranks, int num_experts) {
-    auto num_bytes =
-        LowLatencyLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts)
-            .total_bytes;
-    return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) *
-           NUM_BUFFER_ALIGNMENT_BYTES;
-}
-} // namespace deep_ep
--- a/csrc/deep_ep.cpp.orig
+++ b/csrc/deep_ep.cpp.orig
--- a/csrc/deep_ep.cu
+++ b/csrc/deep_ep.cu
+// #include <ATen/dtk_macros.h>
 #include <ATen/hip/HIPContext.h>
 #include <ATen/hip/HIPDataType.h>
 #include <chrono>
@@ -5,8 +6,8 @@
 #include <pybind11/functional.h>
 #include <torch/python.h>
-#include "./kernels/api.cuh"
+#include "kernels/api.cuh"
-#include "./kernels/configs.cuh"
+#include "kernels/configs.cuh"
 #include "deep_ep.hpp"
 namespace deep_ep {
@@ -40,8 +41,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_
    // Get ranks
    CUDA_CHECK(hipGetDevice(&device_id));
    rdma_rank = rank / NUM_MAX_NVL_PEERS, nvl_rank = rank % NUM_MAX_NVL_PEERS;
-    num_rdma_ranks = std::max(1, num_ranks / NUM_MAX_NVL_PEERS),
+    num_rdma_ranks = ::max(1, num_ranks / NUM_MAX_NVL_PEERS),
-    num_nvl_ranks  = std::min(num_ranks, NUM_MAX_NVL_PEERS);
+    num_nvl_ranks  = ::min(num_ranks, NUM_MAX_NVL_PEERS);
 #ifdef DISABLE_ROCSHMEM
    EP_HOST_ASSERT(num_rdma_ranks == 1 and not low_latency_mode and
@@ -803,8 +804,8 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
    // here.
    pybind11::gil_scoped_release release;
-    const int num_channels = config.num_sms / 2;
+    const int num_channels = config.num_sms / 3;
-    EP_HOST_ASSERT(config.num_sms % 2 == 0);
+    EP_HOST_ASSERT(config.num_sms % 3 == 0);
    EP_HOST_ASSERT(0 < get_num_rdma_ranks() and get_num_rdma_ranks() <= NUM_MAX_RDMA_PEERS);
    bool cached_mode = cached_rdma_channel_prefix_matrix.has_value();
@@ -901,10 +902,10 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
    // Allocate all tensors on comm stream if set
    // NOTES: do not allocate tensors upfront!
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
    if (allocate_on_comm_stream) {
        EP_HOST_ASSERT(previous_event.has_value() and async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        at::hip::setCurrentHIPStreamMasqueradingAsCUDA(comm_stream);
    }
    // Wait previous tasks to be finished
@@ -1088,7 +1089,7 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
    // Switch back compute stream
    if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+        at::hip::setCurrentHIPStreamMasqueradingAsCUDA(compute_stream);
    // Return values
    return {recv_x,
@@ -1124,8 +1125,8 @@ Buffer::internode_combine(
    const torch::Tensor &combined_nvl_head, const Config &config,
    std::optional<EventHandle> &previous_event, bool async, bool allocate_on_comm_stream) {
 #ifndef DISABLE_ROCSHMEM
-    const int num_channels = config.num_sms / 2;
+    const int num_channels = config.num_sms / 3;
-    EP_HOST_ASSERT(config.num_sms % 2 == 0);
+    EP_HOST_ASSERT(config.num_sms % 3 == 0);
    // Shape and contiguous checks
    EP_HOST_ASSERT(x.dim() == 2 and x.is_contiguous());
@@ -1167,10 +1168,10 @@ Buffer::internode_combine(
    // Allocate all tensors on comm stream if set
    // NOTES: do not allocate tensors upfront!
-    auto compute_stream = at::cuda::getCurrentCUDAStream();
+    auto compute_stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
    if (allocate_on_comm_stream) {
        EP_HOST_ASSERT(previous_event.has_value() and async);
-        at::cuda::setCurrentCUDAStream(comm_stream);
+        at::hip::setCurrentHIPStreamMasqueradingAsCUDA(comm_stream);
    }
    // Wait previous tasks to be finished
@@ -1216,7 +1217,7 @@ Buffer::internode_combine(
    void *bias_ptrs[2] = {nullptr, nullptr};
    for (int i = 0; i < 2; ++i)
        if (bias_opts[i].has_value()) {
-            // EP_HOST_ASSERT(false and "bias is not supported in internode combine");
+            EP_HOST_ASSERT(false and "bias is not supported in internode combine");
            auto bias = bias_opts[i].value();
            EP_HOST_ASSERT(bias.dim() == 2 and bias.is_contiguous());
            EP_HOST_ASSERT(bias.scalar_type() == x.scalar_type());
@@ -1260,7 +1261,7 @@ Buffer::internode_combine(
    // Switch back compute stream
    if (allocate_on_comm_stream)
-        at::cuda::setCurrentCUDAStream(compute_stream);
+        at::hip::setCurrentHIPStreamMasqueradingAsCUDA(compute_stream);
    // Return values
    return {combined_x, combined_topk_weights, event};

--- a/csrc/deep_ep.hip
+++ b/csrc/deep_ep.hip
--- a/csrc/deep_ep.hpp
+++ b/csrc/deep_ep.hpp
@@ -6,7 +6,7 @@
 #include <tuple>
 #include <vector>
-#include "./kernels/configs.cuh"
+#include "kernels/configs.cuh"
 #include "kernels/exception.cuh"
 #include "config.hpp"
 #include "event.hpp"

--- a/csrc/deep_ep_hip.hpp
+++ b/csrc/deep_ep_hip.hpp
-// !!! This is a file automatically generated by hipify!!!
-#include <ATen/dtk_macros.h>
-#pragma once
-#include <pybind11/pybind11.h>
-#include <pybind11/pytypes.h>
-#include <torch/types.h>
-#include <tuple>
-#include <vector>
-#include "kernels/configs.cuh"
-#include "kernels/exception.cuh"
-#include "config_hip.hpp"
-#include "event.hpp"
-namespace deep_ep {
-struct Buffer {
-    EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS == 8, "The number of maximum NVLink peers must be 8");
-private:
-    // Low-latency mode buffer
-    int  low_latency_buffer_idx = 0;
-    bool low_latency_mode       = false;
-    // NVLink Buffer
-    int64_t num_nvl_bytes;
-    void   *buffer_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
-    void  **buffer_ptrs_gpu                = nullptr;
-    // NVSHMEM Buffer
-    int64_t num_rdma_bytes;
-    void   *rdma_buffer_ptr = nullptr;
-    // Device info and communication
-    int               device_id;
-    int               num_device_sms;
-    int               rank, rdma_rank, nvl_rank;
-    int               num_ranks, num_rdma_ranks, num_nvl_ranks;
-    hipIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS];
-    // Stream for communication
-    at::hip::HIPStreamMasqueradingAsCUDA comm_stream;
-    // After IPC/NVSHMEM synchronization, this flag will be true
-    bool available = false;
-    // Whether explicit `destroy()` is required.
-    bool explicitly_destroy;
-    // After `destroy()` be called, this flag will be true
-    bool destroyed = false;
-    // Barrier signals
-    int  *barrier_signal_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
-    int **barrier_signal_ptrs_gpu                = nullptr;
-    // Workspace
-    void *workspace = nullptr;
-    // Host-side MoE info
-    volatile int *moe_recv_counter        = nullptr;
-    int          *moe_recv_counter_mapped = nullptr;
-    // Host-side expert-level MoE info
-    volatile int *moe_recv_expert_counter        = nullptr;
-    int          *moe_recv_expert_counter_mapped = nullptr;
-    // Host-side RDMA-level MoE info
-    volatile int *moe_recv_rdma_counter        = nullptr;
-    int          *moe_recv_rdma_counter_mapped = nullptr;
-    bool use_default_stream_as_comm_stream = false;
-public:
-    Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes,
-           bool low_latency_mode, bool explicitly_destroy, bool use_default_stream_as_comm_stream);
-    ~Buffer() noexcept(false);
-    bool is_available() const;
-    bool is_internode_available() const;
-    int get_num_rdma_ranks() const;
-    int get_rdma_rank() const;
-    int get_root_rdma_rank(bool global) const;
-    int get_local_device_id() const;
-    pybind11::bytearray get_local_ipc_handle() const;
-    pybind11::bytearray get_local_nvshmem_unique_id() const;
-    torch::Tensor get_local_buffer_tensor(const pybind11::object &dtype, int64_t offset,
-                                          bool use_rdma_buffer) const;
-    torch::Stream get_comm_stream() const;
-    void sync(const std::vector<int>                                &device_ids,
-              const std::vector<std::optional<pybind11::bytearray>> &all_gathered_handles,
-              const std::optional<pybind11::bytearray>              &root_unique_id_opt);
-    void destroy();
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor,
-               std::optional<EventHandle>>
-    get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts,
-                        std::optional<EventHandle> &previous_event, bool async,
-                        bool allocate_on_comm_stream);
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>,
-               std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor,
-               torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
-               std::optional<EventHandle>>
-    intranode_dispatch(const torch::Tensor &x, const std::optional<torch::Tensor> &x_scales,
-                       const std::optional<torch::Tensor> &topk_idx,
-                       const std::optional<torch::Tensor> &topk_weights,
-                       const std::optional<torch::Tensor> &num_tokens_per_rank,
-                       const torch::Tensor                &is_token_in_rank,
-                       const std::optional<torch::Tensor> &num_tokens_per_expert,
-                       int                                 cached_num_recv_tokens,
-                       const std::optional<torch::Tensor> &cached_rank_prefix_matrix,
-                       const std::optional<torch::Tensor> &cached_channel_prefix_matrix,
-                       int expert_alignment, int num_worst_tokens, const Config &config,
-                       std::optional<EventHandle> &previous_event, bool async,
-                       bool allocate_on_comm_stream);
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
-    intranode_combine(const torch::Tensor &x, const std::optional<torch::Tensor> &topk_weights,
-                      const std::optional<torch::Tensor> &bias_0,
-                      const std::optional<torch::Tensor> &bias_1, const torch::Tensor &src_idx,
-                      const torch::Tensor &rank_prefix_matrix,
-                      const torch::Tensor &channel_prefix_matrix, const torch::Tensor &send_head,
-                      const Config &config, std::optional<EventHandle> &previous_event, bool async,
-                      bool allocate_on_comm_stream);
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>,
-               std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor,
-               std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>,
-               torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>,
-               std::optional<torch::Tensor>, std::optional<EventHandle>>
-    internode_dispatch(const torch::Tensor &x, const std::optional<torch::Tensor> &x_scales,
-                       const std::optional<torch::Tensor> &topk_idx,
-                       const std::optional<torch::Tensor> &topk_weights,
-                       const std::optional<torch::Tensor> &num_tokens_per_rank,
-                       const std::optional<torch::Tensor> &num_tokens_per_rdma_rank,
-                       const torch::Tensor                &is_token_in_rank,
-                       const std::optional<torch::Tensor> &num_tokens_per_expert,
-                       int cached_num_recv_tokens, int cached_num_rdma_recv_tokens,
-                       const std::optional<torch::Tensor> &cached_rdma_channel_prefix_matrix,
-                       const std::optional<torch::Tensor> &cached_recv_rdma_rank_prefix_sum,
-                       const std::optional<torch::Tensor> &cached_gbl_channel_prefix_matrix,
-                       const std::optional<torch::Tensor> &cached_recv_gbl_rank_prefix_sum,
-                       int expert_alignment, const Config &config,
-                       std::optional<EventHandle> &previous_event, bool async,
-                       bool allocate_on_comm_stream);
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
-    internode_combine(
-        const torch::Tensor &x, const std::optional<torch::Tensor> &topk_weights,
-        const std::optional<torch::Tensor> &bias_0, const std::optional<torch::Tensor> &bias_1,
-        const torch::Tensor &src_meta, const torch::Tensor &is_combined_token_in_rank,
-        const torch::Tensor &rdma_channel_prefix_matrix, const torch::Tensor &rdma_rank_prefix_sum,
-        const torch::Tensor &gbl_channel_prefix_matrix, const torch::Tensor &combined_rdma_head,
-        const torch::Tensor &combined_nvl_head, const Config &config,
-        std::optional<EventHandle> &previous_event, bool async, bool allocate_on_comm_stream);
-    void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden,
-                                  int num_experts);
-    std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor,
-               torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
-    low_latency_dispatch(const torch::Tensor &x, const torch::Tensor &topk_idx,
-                         const std::optional<torch::Tensor> &cumulative_local_expert_recv_stats,
-                         const std::optional<torch::Tensor> &dispatch_wait_recv_cost_stats,
-                         int num_max_dispatch_tokens_per_rank, int num_experts, bool use_fp8,
-                         bool round_scale, bool use_ue8m0, bool async, bool return_recv_hook);
-    std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
-    low_latency_combine(const torch::Tensor &x, const torch::Tensor &topk_idx,
-                        const torch::Tensor &topk_weights, const torch::Tensor &src_info,
-                        const torch::Tensor                &layout_range,
-                        const std::optional<torch::Tensor> &combine_wait_recv_cost_stats,
-                        int num_max_dispatch_tokens_per_rank, int num_experts, bool use_logfmt,
-                        bool zero_copy, bool async, bool return_recv_hook,
-                        const std::optional<torch::Tensor> &out = std::nullopt);
-    torch::Tensor get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank,
-                                                      int hidden, int num_experts) const;
-};
-} // namespace deep_ep
--- a/csrc/kernels/internode.cu
+++ b/csrc/kernels/internode.cu
--- a/csrc/kernels/internode.hip
+++ b/csrc/kernels/internode.hip
--- a/csrc/kernels/intranode.cu
+++ b/csrc/kernels/intranode.cu
@@ -393,9 +393,9 @@ __global__ void __launch_bounds__(kNumThreads, 1)
                    total_offset;
            num_tokens_to_recv -= total_offset;
        }
-        total_offset = __shfl_sync(kFullWarpMask, total_offset, 0);
+        total_offset = shfl_sync(total_offset, 0);
        total_offset += rank_offset;
-        num_tokens_to_recv = __shfl_sync(kFullWarpMask, num_tokens_to_recv, 0);
+        num_tokens_to_recv = shfl_sync(num_tokens_to_recv, 0);
        // Shared tail indices for different warps
        __shared__ volatile int shared_channel_tail_idx[kNumRanks];
@@ -583,7 +583,7 @@ __global__ void cached_notify_combine(void **buffer_ptrs, int *send_head, int nu
                                    ? __ldg(send_head + token_idx * kNumRanks + rank_id)
                                    : -1;
            for (int i = 0; i < min(kWarpSize, token_idx_tail - token_start_idx + 1); ++i) {
-                const int head = __shfl_sync(kFullWarpMask, current_head, i);
+                const int head = shfl_sync(current_head, i);
                if (head < 0) {
                    if (lane_id == i)
                        expected_head = -last_head - 1;
@@ -606,7 +606,7 @@ void cached_notify_combine(void **buffer_ptrs, int *send_head, int num_channels,
                                  barrier_signal_ptrs, rank);                                      \
    break
-    const int num_threads = std::max(128, kWarpSize * num_ranks);
+    const int num_threads = ::max(128, kWarpSize * num_ranks);
    EP_HOST_ASSERT(num_ranks <= num_threads);
    EP_HOST_ASSERT(num_threads <= 1024);
    EP_HOST_ASSERT(1 + num_channels <= num_channels * 2);

--- a/csrc/kernels/intranode.hip
+++ b/csrc/kernels/intranode.hip
--- a/csrc/kernels/launch.cuh
+++ b/csrc/kernels/launch.cuh
+#include "hip/hip_runtime.h"
 #pragma once
 #include "configs.cuh"

--- a/csrc/kernels/launch_hip.cuh
+++ b/csrc/kernels/launch_hip.cuh
-// !!! This is a file automatically generated by hipify!!!
-#include <ATen/dtk_macros.h>
-#include "hip/hip_runtime.h"
-#pragma once
-#include "configs.cuh"
-#include "exception.cuh"
-// ROCm helper functions and structures
-namespace rocm::experimental {
-typedef struct {
-    dim3         num_sms;
-    dim3         num_threads;
-    unsigned int shared_mem_bytes;
-    hipStream_t  stream;
-} hipLaunchConfig_t;
-// Compile time void** kernelArgs array fill with variadic arguments
-template <typename T> void fill_kernel_args(void **f, size_t idx, T &&arg) {
-    f[idx] = static_cast<void *>(std::addressof(arg));
-}
-template <typename Head, typename... Tail>
-void fill_kernel_args(void **f, size_t idx, Head &&head, Tail &&...tail) {
-    f[idx] = static_cast<void *>(std::addressof(head));
-    fill_kernel_args(f, idx + 1, std::forward<Tail>(tail)...);
-}
-} // namespace rocm::experimental
-#ifndef SETUP_LAUNCH_CONFIG
-// The code below is a workaround for ROCm. All the proposed overhead
-// is to match current macro signatures and should be reworked once
-// cudaLaunchKernelExt() hip alternative is live.
-#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream)                                          \
-    rocm::experimental::hipLaunchConfig_t cfg = {(num_sms), (num_threads), 0, stream};
-#endif // #ifndef SETUP_LAUNCH_CONFIG
-#ifndef LAUNCH_KERNEL
-template <typename T, typename Kern, typename... Args>
-inline void LAUNCH_KERNEL(T &&config, Kern &&kernel, Args &&...args) {
-    constexpr size_t k_num_kernel_args = sizeof...(args);
-    void            *kernel_args[k_num_kernel_args];
-    rocm::experimental::fill_kernel_args(kernel_args, 0, std::forward<Args>(args)...);
-    CUDA_CHECK(hipLaunchCooperativeKernel(std::forward<Kern>(kernel), config->num_sms,
-                                                      config->num_threads, kernel_args,
-                                                      config->shared_mem_bytes, config->stream));
-}
-template <typename T, typename Kern, typename... Args>
-inline void LAUNCH_KERNEL_NON_COOPERATIVE(T &&config, Kern &&kernel, Args &&...args) {
-    hipLaunchKernelGGL((*kernel), dim3(config->num_sms), dim3(config->num_threads), config->shared_mem_bytes, config->stream, 
-        std::forward<Args>(args)...);
-}
-#endif // #ifndef LAUNCH_KERNEL
-#define SWITCH_RANKS(case_macro)                                                                   \
-    switch (num_ranks) {                                                                           \
-    case 2:                                                                                        \
-        case_macro(2);                                                                             \
-    case 4:                                                                                        \
-        case_macro(4);                                                                             \
-    case 8:                                                                                        \
-        case_macro(8);                                                                             \
-    default:                                                                                       \
-        EP_HOST_ASSERT(false and "Unsupported ranks");                                         \
-    }                                                                                              \
-    while (false)
-#define SWITCH_RDMA_RANKS(case_macro)                                                              \
-    switch (num_ranks / NUM_MAX_NVL_PEERS) {                                                       \
-    case 2:                                                                                        \
-        case_macro(2);                                                                             \
-    case 3:                                                                                        \
-        case_macro(3);                                                                             \
-    case 4:                                                                                        \
-        case_macro(4);                                                                             \
-    case 8:                                                                                        \
-        case_macro(8);                                                                             \
-    case 16:                                                                                       \
-        case_macro(16);                                                                            \
-    case 18:                                                                                       \
-        case_macro(18);                                                                            \
-    case 20:                                                                                       \
-        case_macro(20);                                                                            \
-    default:                                                                                       \
-        EP_HOST_ASSERT(false and "Unsupported RDMA ranks");                                       \
-    }                                                                                              \
-    while (false)
-#define SWITCH_RANKS_WITH_DTYPE(dtype, case_macro)                                                 \
-    switch (num_ranks) {                                                                           \
-    case 2:                                                                                        \
-        case_macro(dtype, 2);                                                                      \
-    case 4:                                                                                        \
-        case_macro(dtype, 4);                                                                      \
-    case 8:                                                                                        \
-        case_macro(dtype, 8);                                                                      \
-    default:                                                                                       \
-        EP_HOST_ASSERT(false and "Unsupported ranks");                                            \
-    }                                                                                              \
-    while (false)
-#define SWITCH_TYPES(case_macro)                                                                   \
-    switch (type) {                                                                                \
-    case HIP_R_16BF:                                                                               \
-        case_macro(hip_bfloat16);                                                                  \
-    case HIP_R_32F:                                                                                \
-        case_macro(float);                                                                         \
-    default:                                                                                       \
-        EP_HOST_ASSERT(false and "Unsupported type");                                             \
-    }                                                                                              \
-    while (false)
-#define SWITCH_HIDDEN(case_macro)                                                                  \
-    switch (hidden) {                                                                              \
-    case 2560:                                                                                     \
-        case_macro(2560);                                                                          \
-    case 5120:                                                                                     \
-        case_macro(5120);                                                                          \
-    case 4096:                                                                                     \
-        case_macro(4096);                                                                          \
-    case 7168:                                                                                     \
-        case_macro(7168);                                                                          \
-    default:                                                                                       \
-        EP_HOST_ASSERT(false and "Unsupported hidden");                                           \
-    }                                                                                              \
-    while (false)
--- a/csrc/kernels/runtime.cu
+++ b/csrc/kernels/runtime.cu
+#include "hip/hip_runtime.h"
 #include <cstring>
 #include "configs.cuh"

--- a/csrc/kernels/runtime.hip
+++ b/csrc/kernels/runtime.hip
-// !!! This is a file automatically generated by hipify!!!
-#include <ATen/dtk_macros.h>
-#include "hip/hip_runtime.h"
-#include <cstring>
-#include "configs.cuh"
-#include "exception.cuh"
-#include "launch_hip.cuh"
-#include "utils_hip.cuh"
-#ifndef DISABLE_ROCSHMEM
-#include <rocshmem/rocshmem.hpp>
-#endif
-namespace deep_ep {
-namespace intranode {
-template <int kNumRanks> 
-__global__ void barrier(int **barrier_signal_ptrs, int rank) {
-    barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
-}
-void barrier(int **barrier_signal_ptrs, int rank, int num_ranks, hipStream_t stream) {
-#define BARRIER_LAUNCH_CASE(ranks)                                                                 \
-    LAUNCH_KERNEL(&cfg, barrier<ranks>, barrier_signal_ptrs, rank);                                \
-    break
-    SETUP_LAUNCH_CONFIG(1, kWarpSize, stream);
-    SWITCH_RANKS(BARRIER_LAUNCH_CASE);
-#undef BARRIER_LAUNCH_CASE
-}
-} // namespace intranode
-namespace internode {
-#ifndef DISABLE_ROCSHMEM
-rocshmem::rocshmem_team_t        cpu_rdma_team = rocshmem::ROCSHMEM_TEAM_INVALID;
-rocshmem::rocshmem_team_config_t cpu_rdma_team_config;
-std::vector<uint8_t> get_unique_id() {
-    rocshmem::rocshmem_uniqueid_t unique_id;
-    rocshmem::rocshmem_get_uniqueid(&unique_id);
-    std::vector<uint8_t> result(sizeof(rocshmem::rocshmem_uniqueid_t));
-    std::memcpy(result.data(), &unique_id, sizeof(rocshmem::rocshmem_uniqueid_t));
-    return result;
-}
-int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks,
-         bool low_latency_mode) {
-    rocshmem::rocshmem_uniqueid_t  root_unique_id;
-    rocshmem::rocshmem_init_attr_t attr;
-    std::memcpy(&root_unique_id, root_unique_id_val.data(), sizeof(rocshmem::rocshmem_uniqueid_t));
-    rocshmem::rocshmem_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
-    rocshmem::rocshmem_init_attr(rocshmem::ROCSHMEM_INIT_WITH_UNIQUEID, &attr);
-    // Create sub-RDMA teams
-    // NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
-    if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
-        EP_HOST_ASSERT(cpu_rdma_team == rocshmem::ROCSHMEM_TEAM_INVALID);
-        EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
-        EP_HOST_ASSERT(rocshmem::rocshmem_team_split_strided(
-                               rocshmem::ROCSHMEM_TEAM_WORLD, rank % NUM_MAX_NVL_PEERS,
-                               NUM_MAX_NVL_PEERS, num_ranks / NUM_MAX_NVL_PEERS,
-                               &cpu_rdma_team_config, 0, &cpu_rdma_team) == 0);
-        EP_HOST_ASSERT(cpu_rdma_team != rocshmem::ROCSHMEM_TEAM_INVALID);
-    }
-    rocshmem::rocshmem_barrier_all();
-    return rocshmem::rocshmem_my_pe();
-}
-void *alloc(size_t size, size_t alignment) {
-    auto alloc_size = ALIGN(size, alignment);
-    return rocshmem::rocshmem_malloc(alloc_size);
-}
-void free(void *ptr) {
-    rocshmem::rocshmem_free(ptr);
-}
-void barrier() {
-    rocshmem::rocshmem_barrier_all();
-}
-void finalize() {
-    if (cpu_rdma_team != rocshmem::ROCSHMEM_TEAM_INVALID) {
-        rocshmem::rocshmem_team_destroy(cpu_rdma_team);
-        cpu_rdma_team = rocshmem::ROCSHMEM_TEAM_INVALID;
-    }
-    rocshmem::rocshmem_finalize();
-}
-#endif
-} // namespace internode
-} // namespace deep_ep
--- a/csrc/kernels/utils.cuh
+++ b/csrc/kernels/utils.cuh
+#include "hip/hip_runtime.h"
 #pragma once
 #include "configs.cuh"
 #include "exception.cuh"
@@ -194,8 +195,7 @@ __device__ __forceinline__ int64_t ld_volatile_global(const volatile uint64_t *p
    return ret;
 }
-template <typename dtype_t> 
+template <typename dtype_t> __device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
-__device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
    using T  = typename VecInt<sizeof(dtype_t)>::vec_t;
    auto ret = __builtin_nontemporal_load(reinterpret_cast<const T *>(ptr));
    return *reinterpret_cast<dtype_t *>(&ret);

--- a/csrc/kernels/utils_hip.cuh
+++ b/csrc/kernels/utils_hip.cuh
-// !!! This is a file automatically generated by hipify!!!
-#include <ATen/dtk_macros.h>
-#include "hip/hip_runtime.h"
-#pragma once
-#include "configs.cuh"
-#include "exception.cuh"
-#define UNROLLED_WARP_COPY(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC)                  \
-    {                                                                                              \
-        constexpr int kLoopStride = kWarpSize * (UNROLL_FACTOR);                                   \
-        typename std::remove_reference<decltype(LD_FUNC((SRC) + 0))>::type                         \
-             unrolled_values[(UNROLL_FACTOR)];                                                     \
-        auto __src = (SRC);                                                                        \
-        auto __dst = (DST);                                                                        \
-        for (int __i = (LANE_ID); __i < ((N) / kLoopStride) * kLoopStride; __i += kLoopStride) {   \
-            _Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j)                      \
-                unrolled_values[__j] = LD_FUNC(__src + __i + __j * kWarpSize);                     \
-            _Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j)                      \
-                ST_FUNC(__dst + __i + __j * kWarpSize, unrolled_values[__j]);                      \
-        }                                                                                          \
-        {                                                                                          \
-            int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID);                               \
-            _Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) {                    \
-                if (__i + __j * kWarpSize < (N)) {                                                 \
-                    unrolled_values[__j] = LD_FUNC(__src + __i + __j * kWarpSize);                 \
-                }                                                                                  \
-            }                                                                                      \
-            _Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) {                    \
-                if (__i + __j * kWarpSize < (N)) {                                                 \
-                    ST_FUNC(__dst + __i + __j * kWarpSize, unrolled_values[__j]);                  \
-                }                                                                                  \
-            }                                                                                      \
-        }                                                                                          \
-    }
-#define UNROLLED_WARP_COPY_EMULATED(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC)         \
-    {                                                                                              \
-        constexpr int kLoopStride = kEmulatedWarpSize * (UNROLL_FACTOR);                           \
-        typename std::remove_reference<decltype(LD_FUNC((SRC) + 0))>::type                         \
-             unrolled_values[(UNROLL_FACTOR)];                                                     \
-        auto __src = (SRC);                                                                        \
-        auto __dst = (DST);                                                                        \
-        for (int __i = (LANE_ID); __i < ((N) / kLoopStride) * kLoopStride; __i += kLoopStride) {   \
-            _Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j)                      \
-                unrolled_values[__j] = LD_FUNC(__src + __i + __j * kEmulatedWarpSize);             \
-            _Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j)                      \
-                ST_FUNC(__dst + __i + __j * kEmulatedWarpSize, unrolled_values[__j]);              \
-        }                                                                                          \
-        for (int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); __i < (N);                   \
-             __i += kEmulatedWarpSize)                                                             \
-            ST_FUNC(__dst + __i, LD_FUNC(__src + __i));                                            \
-    }
-// HELPER FUNCTIONS
-// #####################################################################################
-template <typename T>
-__device__ __forceinline__ T shfl_xor(const T val, int laneMask, int width = kWarpSize,
-                                      uint64_t shfl_sync_mask = kFullWarpMask) {
-    return __shfl_xor(val, laneMask, width);
-}
-__device__ __forceinline__ int
-shfl_sync(const int val, int srcLane = 0, int width = kWarpSize,
-          uint64_t shfl_sync_mask = kFullWarpMask) { // Let compiler deduce type
-    return __shfl(val, srcLane, width);
-}
-__device__ __forceinline__ int __any_sync(uint64_t mask, int predicate) {
-    uint64_t predicate_bit_pattern = __ballot(predicate);
-    return (predicate_bit_pattern & mask) > 0;
-}
-__device__ __forceinline__ int __all_sync(uint64_t mask, int predicate) {
-    uint64_t predicate_bit_pattern = __ballot(predicate);
-    return (~predicate_bit_pattern & mask) == 0;
-}
-__device__ __forceinline__ void syncwarp() {
-    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
-    __builtin_amdgcn_wave_barrier();
-    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
-}
-// ######################################################################################################
-namespace deep_ep {
-template <int kBytes> struct VecInt {};
-template <> struct VecInt<1> {
-    using vec_t = int8_t;
-};
-template <> struct VecInt<2> {
-    using vec_t = int16_t;
-};
-template <> struct VecInt<4> {
-    using vec_t = int;
-};
-template <> struct VecInt<8> {
-    using vec_t = int64_t;
-};
-template <> struct VecInt<16> {
-    using native_int4 = int __attribute__((ext_vector_type(4)));
-    using vec_t       = native_int4;
-};
-__device__ __forceinline__ void trap() {
-    abort();
-}
-__device__ __forceinline__ void memory_fence() {
-    __threadfence_system();
-}
-__device__ __forceinline__ void memory_fence_gpu() {
-    __threadfence();
-}
-__device__ __forceinline__ void memory_fence_cta() {
-    __threadfence_block();
-}
-__device__ __forceinline__ void st_relaxed_sys_global(int *ptr, int val) {
-    __builtin_nontemporal_store(val, ptr);
-}
-__device__ __forceinline__ void st_release_sys_global(const int *ptr, int val) {
-    __hip_atomic_store(const_cast<int *>(ptr), val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_SYSTEM);
-}
-__device__ __forceinline__ void st_release_cta(const int *ptr, int val) {
-    __hip_atomic_store(const_cast<int *>(ptr), val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP);
-}
-__device__ __forceinline__ int ld_relaxed_sys_global(const int *ptr) {
-    int res = __builtin_nontemporal_load(ptr);
-    return res;
-}
-__device__ __forceinline__ int ld_relaxed_sys_global(const uint64_t *ptr) {
-    uint64_t ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-    return ret;
-}
-__device__ __forceinline__ int ld_acquire_sys_global(const int *ptr) {
-    int ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SYSTEM);
-    return ret;
-}
-__device__ __forceinline__ uint64_t ld_acquire_sys_global(const uint64_t *ptr) {
-    uint64_t ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SYSTEM);
-    return ret;
-}
-__device__ __forceinline__ int ld_acquire_global(const int *ptr) {
-    int ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT);
-    return ret;
-}
-__device__ __forceinline__ int atomic_add_release_global(const int *ptr, int value) {
-    int ret;
-    // ret = __hip_atomic_fetch_add(const_cast<int *>(ptr), value, __ATOMIC_RELEASE,
-    //                              __HIP_MEMORY_SCOPE_AGENT);
-    ret = atomicAdd((int*)ptr, value);
-    return ret;
-}
-__device__ __forceinline__ int ld_acquire_cta(const int *ptr) {
-    int ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP);
-    return ret;
-}
-__device__ __forceinline__ int ld_volatile_global(const volatile int *ptr) {
-    int ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-    return ret;
-}
-__device__ __forceinline__ float ld_volatile_global(const volatile float *ptr) {
-    float ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-    return ret;
-}
-__device__ __forceinline__ int64_t ld_volatile_global(const volatile int64_t *ptr) {
-    int64_t ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-    return ret;
-}
-__device__ __forceinline__ int64_t ld_volatile_global(const volatile uint64_t *ptr) {
-    int64_t ret;
-    ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
-    return ret;
-}
-template <typename dtype_t> __device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
-    using T  = typename VecInt<sizeof(dtype_t)>::vec_t;
-    auto ret = __builtin_nontemporal_load(reinterpret_cast<const T *>(ptr));
-    return *reinterpret_cast<dtype_t *>(&ret);
-}
-////////////////// used in ibgda
-__device__ __forceinline__ void st_na_relaxed(const uint8_t *ptr, uint8_t val) {
-    uint8_t *non_const_ptr = const_cast<uint8_t *>(ptr);
-    __hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-}
-__device__ __forceinline__ void st_na_relaxed(const uint16_t *ptr, uint16_t val) {
-    uint16_t *non_const_ptr = const_cast<uint16_t *>(ptr);
-    __hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-}
-__device__ __forceinline__ void st_na_relaxed(const uint32_t *ptr, uint32_t val) {
-    uint32_t *non_const_ptr = const_cast<uint32_t *>(ptr);
-    __hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-}
-__device__ __forceinline__ void st_na_relaxed(const int *ptr, int val) {
-    int *non_const_ptr = const_cast<int *>(ptr);
-    __hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-}
-__device__ __forceinline__ void st_na_relaxed(const int4 *ptr, int4 val) {
-    int4 *non_const_ptr = const_cast<int4 *>(ptr);
-    non_const_ptr->x    = val.x;
-    non_const_ptr->y    = val.y;
-    non_const_ptr->z    = val.z;
-    non_const_ptr->w    = val.w;
-}
-__device__ __forceinline__ void st_na_release(const int *ptr, int val) {
-    int *non_const_ptr = const_cast<int *>(ptr);
-    __hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
-}
-__device__ __forceinline__ void st_na_release(const uint32_t *ptr, uint32_t val) {
-    uint32_t *non_const_ptr = const_cast<uint32_t *>(ptr);
-    __hip_atomic_store(non_const_ptr, val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
-}
-__device__ __forceinline__ void st_na_release(const uint64_t *ptr, uint64_t val) {
-    uint64_t *non_const_ptr = const_cast<uint64_t *>(ptr);
-    __hip_atomic_store(non_const_ptr, val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
-}
-// TODO:: apply "st.global.L1::no_allocate" in ROCM
-template <typename dtype_t>
-__device__ __forceinline__ void st_na_global(const dtype_t *ptr, const dtype_t &value) {
-    st_na_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t *>(ptr),
-                 *reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t *>(&value));
-}
-template <> __device__ __forceinline__ void st_na_global(const int *ptr, const int &value) {
-    int *non_const_ptr = const_cast<int *>(ptr);
-    *non_const_ptr     = value;
-}
-template <> __device__ __forceinline__ void st_na_global(const int64_t *ptr, const int64_t &value) {
-    int64_t *non_const_ptr = const_cast<int64_t *>(ptr);
-    *non_const_ptr         = value;
-}
-template <> __device__ __forceinline__ void st_na_global(const float *ptr, const float &value) {
-    float *non_const_ptr = const_cast<float *>(ptr);
-    *non_const_ptr       = value;
-}
-template <> __device__ __forceinline__ void st_na_global(const int4 *ptr, const int4 &value) {
-    int4 *non_const_ptr = const_cast<int4 *>(ptr);
-    *non_const_ptr      = value;
-}
-__forceinline__ __device__ void get_channel_task_range(int num_tokens, int num_sms, int sm_id,
-                                                       int &token_start_idx, int &token_end_idx) {
-    int num_tokens_per_sm = DIVUP(num_tokens, num_sms);
-    token_start_idx       = min(num_tokens_per_sm * sm_id, num_tokens);
-    token_end_idx         = min(token_start_idx + num_tokens_per_sm, num_tokens);
-}
-template <typename dtype_t>
-__device__ __forceinline__ dtype_t broadcast(dtype_t &ptr, int src_lane_idx) {
-    EP_STATIC_ASSERT(sizeof(dtype_t) % sizeof(int) == 0, "");
-    auto send_int_values = reinterpret_cast<int *>(&ptr);
-    int  recv_int_values[sizeof(dtype_t) / sizeof(int)];
-#pragma unroll
-    for (int i = 0; i < sizeof(dtype_t) / sizeof(int); ++i)
-        recv_int_values[i] = shfl_sync(send_int_values[i], src_lane_idx);
-    return *reinterpret_cast<dtype_t *>(recv_int_values);
-}
-__forceinline__ __device__ int warp_reduce_sum(int value) {
-    if constexpr (kWarpSize == 64)
-        value += shfl_xor<int>(value, 32);
-    value += shfl_xor<int>(value, 16);
-    value += shfl_xor<int>(value, 8);
-    value += shfl_xor<int>(value, 4);
-    value += shfl_xor<int>(value, 2);
-    value += shfl_xor<int>(value, 1);
-    return value;
-}
-__forceinline__ __device__ int get_lane_id() {
-    int lane_id = threadIdx.x % kWarpSize;
-    return lane_id;
-}
-template <int kNumRanks, bool kSyncOnly = false>
-__forceinline__ __device__ void barrier_block(int **barrier_signal_ptrs, int rank) {
-    auto thread_id = static_cast<int>(threadIdx.x);
-    // For non-sync-only cases, the memory operations by other threads in the block must be visible
-    // to the `sys` scope
-    if constexpr (not kSyncOnly) {
-        memory_fence();
-        __syncthreads();
-    }
-    // Add self-ranks, sub other ranks
-    if (thread_id < kNumRanks) {
-        atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
-        atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
-    }
-    EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);
-    // Check timeout
-    auto start_time = clock64();
-    while (true) {
-        auto value =
-            thread_id < kNumRanks ? ld_volatile_global(barrier_signal_ptrs[rank] + thread_id) : 0;
-        if (__all_sync(kFullWarpMask, value <= 0))
-            break;
-        if (clock64() - start_time > NUM_TIMEOUT_CYCLES and thread_id < kNumRanks) {
-            printf("DeepEP timeout check failed: rank = %d, thread = %d, value = %d)\n", rank,
-                   thread_id, value);
-            trap();
-        }
-    }
-    __syncthreads();
-}
-} // namespace deep_ep