"openmmapi/src/PythonForce.cpp" did not exist on "aa96846eb06fc74182eca10d94313d4ca6adfbae"
Commit 447238de authored by lijian6's avatar lijian6
Browse files

Adjust the file structure.


Signed-off-by: lijian6's avatarlijian <lijian6@sugon.com>
parent c1d9c169
...@@ -8,12 +8,12 @@ fi ...@@ -8,12 +8,12 @@ fi
PYTHON_INCLUDE=$(python3 -c "from sysconfig import get_paths; print(get_paths()['include'])") PYTHON_INCLUDE=$(python3 -c "from sysconfig import get_paths; print(get_paths()['include'])")
PYTHON_PLATLIB=$(python3 -c "from sysconfig import get_paths; print(get_paths()['platlib'])") PYTHON_PLATLIB=$(python3 -c "from sysconfig import get_paths; print(get_paths()['platlib'])")
/opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/intranode.hip -o build_/intranode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17 /opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/intranode.cu -o build_/intranode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
/opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/runtime.hip -o build_/runtime.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17 /opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/runtime.cu -o build_/runtime.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
/opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/layout.cu -o build_/layout.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17 /opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/layout.cu -o build_/layout.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
/opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/deep_ep.hip -o build_/deep_ep.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17 /opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/deep_ep.cu -o build_/deep_ep.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
/opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/internode.hip -o build_/internode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17 /opt/dtk/bin/hipcc -Icsrc/ -I$(pwd)/rocshmem_dir/include/ -I/opt/mpi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/internode.cu -o build_/internode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o -L$(pwd)/rocshmem_dir/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 -shared -Wl,-soname,deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,$(pwd)/rocshmem_dir/lib/ -L"/opt/dtk/llvm/lib/clang/15.0.0/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25211.1469-8d6b0397 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so.1.11.0 -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 -l:librocshmem.a -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/lib/ -libverbs -lmlx5 hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o -L$(pwd)/rocshmem_dir/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 -shared -Wl,-soname,deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,$(pwd)/rocshmem_dir/lib/ -L"/opt/dtk/llvm/lib/clang/15.0.0/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so.1.11.0 -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 -l:librocshmem.a -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/opt/mpi/lib/ -libverbs -lmlx5
# build whl # build whl
echo "Using Python: $(which python3)" echo "Using Python: $(which python3)"
...@@ -21,10 +21,3 @@ python3 --version ...@@ -21,10 +21,3 @@ python3 --version
python setup.py bdist_wheel python setup.py bdist_wheel
echo "✅ Build complete:" echo "✅ Build complete:"
ls -lh dist/ ls -lh dist/
# /opt/dtk/bin/hipcc -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/intranode.hip -o build_/intranode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
# /opt/dtk/bin/hipcc -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/runtime.hip -o build_/runtime.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
# /opt/dtk/bin/hipcc -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/layout.cu -o build_/layout.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
# /opt/dtk/bin/hipcc -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/deep_ep.hip -o build_/deep_ep.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
# /opt/dtk/bin/hipcc -Icsrc/ -I./rocshmem_dir/include/ -I/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/include -I${PYTHON_PLATLIB}/torch/include -I${PYTHON_PLATLIB}/torch/include/torch/csrc/api/include -I${PYTHON_PLATLIB}/torch/include/TH -I${PYTHON_PLATLIB}/torch/include/THC -I${PYTHON_PLATLIB}/torch/include/THH -I/opt/dtk/include -I${PYTHON_INCLUDE} -c -c ./csrc/kernels/internode.hip -o build_/internode.o -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -D__HIP_NO_HALF_OPERATORS__=1 -D__HIP_NO_HALF_CONVERSIONS__=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17
# hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o -L/work/Tmp/DeepEP/rocshmem_dir/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o aaa.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 -shared -Wl,-soname,aaa.so -Wl,-rpath,/work/Tmp/DeepEP/rocshmem_dir/lib/ -L"/opt/dtk/llvm/lib/clang/15.0.0/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so.5.2.25211.1469-8d6b0397 /opt/dtk/llvm/lib/clang/15.0.0/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so.1.11.0 -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 -l:librocshmem.a -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/public/home/lishen/Code/rocSHMEM/3rd_party/install_dtk25.04.1/ompi/lib/ -libverbs -lmlx5
#pragma once #pragma once
#include "./kernels/api.cuh" #include "kernels/api.cuh"
#include "./kernels/configs.cuh" #include "kernels/configs.cuh"
#include "kernels/exception.cuh" #include "kernels/exception.cuh"
namespace deep_ep { namespace deep_ep {
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include "kernels/api.cuh"
#include "kernels/configs.cuh"
#include "kernels/exception.cuh"
namespace deep_ep {
struct Config {
int num_sms;
int num_max_nvl_chunked_send_tokens;
int num_max_nvl_chunked_recv_tokens;
int num_max_rdma_chunked_send_tokens;
int num_max_rdma_chunked_recv_tokens;
Config(int num_sms, int num_max_nvl_chunked_send_tokens, int num_max_nvl_chunked_recv_tokens,
int num_max_rdma_chunked_send_tokens, int num_max_rdma_chunked_recv_tokens)
: num_sms(num_sms), num_max_nvl_chunked_send_tokens(num_max_nvl_chunked_send_tokens),
num_max_nvl_chunked_recv_tokens(num_max_nvl_chunked_recv_tokens),
num_max_rdma_chunked_send_tokens(num_max_rdma_chunked_send_tokens),
num_max_rdma_chunked_recv_tokens(num_max_rdma_chunked_recv_tokens) {
EP_HOST_ASSERT(num_sms >= 0);
EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens > 0 and
num_max_nvl_chunked_recv_tokens > 0);
EP_HOST_ASSERT(num_max_nvl_chunked_send_tokens < num_max_nvl_chunked_recv_tokens);
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens > 0 and
num_max_rdma_chunked_recv_tokens > 0);
// Ceil up RDMA buffer size
this->num_max_rdma_chunked_recv_tokens =
ALIGN<int>(num_max_rdma_chunked_recv_tokens, num_max_rdma_chunked_send_tokens);
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens < num_max_rdma_chunked_recv_tokens);
// NOTES: this assertion is related to RDMA lazy head update, we must ensure senders always
// have space to push
EP_HOST_ASSERT(num_max_rdma_chunked_send_tokens <=
num_max_rdma_chunked_recv_tokens / 2);
}
size_t get_nvl_buffer_size_hint(size_t hidden_bytes, int num_ranks) const {
// Below are some assumptions
// TODO: add assertions
constexpr int kNumMaxTopK = 128;
constexpr int kNumMaxScales = 128;
EP_HOST_ASSERT(num_ranks < NUM_MAX_NVL_PEERS or num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(num_ranks <= NUM_MAX_NVL_PEERS or num_sms % 2 == 0);
const auto num_rdma_ranks = std::max(num_ranks / NUM_MAX_NVL_PEERS, 1);
const auto num_nvl_ranks = std::min(num_ranks, NUM_MAX_NVL_PEERS);
const int num_channels = num_sms / 2;
size_t num_bytes = 0;
num_bytes += num_channels * num_nvl_ranks * (2 * num_rdma_ranks + 3) * sizeof(int);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * hidden_bytes;
#ifndef DISABLE_ROCSHMEM
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens *
internode::get_source_meta_bytes();
#endif
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK *
sizeof(int64_t);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens * kNumMaxTopK *
sizeof(float);
num_bytes += num_channels * num_nvl_ranks * num_max_nvl_chunked_recv_tokens *
kNumMaxScales * sizeof(float);
num_bytes = ((num_bytes + 127) / 128) * 128;
return num_bytes;
}
size_t get_rdma_buffer_size_hint(int64_t hidden_bytes, int num_ranks) const {
#ifndef DISABLE_ROCSHMEM
// Legacy mode
if (num_ranks <= NUM_MAX_NVL_PEERS)
return 0;
// Below are some assumptions
// TODO: add assertions
constexpr int kNumMaxTopK = 128;
constexpr int kNumMaxScales = 128;
EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(num_sms % 2 == 0);
const int num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
const int num_channels = num_sms / 2;
size_t num_bytes = 0;
num_bytes += num_channels * num_rdma_ranks * (NUM_MAX_NVL_PEERS * 2 + 2) * 2 * sizeof(int);
num_bytes +=
num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * hidden_bytes * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
internode::get_source_meta_bytes() * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
kNumMaxTopK * sizeof(int64_t) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
kNumMaxTopK * sizeof(float) * 2;
num_bytes += num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens *
kNumMaxScales * sizeof(float) * 2;
num_bytes +=
num_channels * num_rdma_ranks * num_max_rdma_chunked_recv_tokens * sizeof(int4) * 2;
num_bytes = ((num_bytes + 127) / 128) * 128;
return num_bytes;
#else
EP_HOST_ASSERT(false and "rocSHMEM is disabled during compilation, please install "
"rocSHMEM by following docs/install_dependencies.md");
#endif
}
};
struct LowLatencyBuffer {
int num_clean_int = 0;
void *dispatch_rdma_send_buffer = nullptr;
void *dispatch_rdma_recv_data_buffer = nullptr;
int *dispatch_rdma_recv_count_buffer = nullptr;
void *combine_rdma_send_buffer = nullptr;
void *combine_rdma_recv_data_buffer = nullptr;
int *combine_rdma_recv_flag_buffer = nullptr;
void *combine_rdma_send_buffer_data_start = nullptr;
size_t num_bytes_per_combine_msg = 0;
std::pair<int *, int> clean_meta() {
EP_HOST_ASSERT(dispatch_rdma_recv_count_buffer == combine_rdma_recv_flag_buffer);
return {dispatch_rdma_recv_count_buffer, num_clean_int};
}
};
struct LowLatencyLayout {
size_t total_bytes = 0;
LowLatencyBuffer buffers[2];
template <typename out_ptr_t = void *, typename count_ptr_t = uint8_t *,
typename in_ptr_t = void *>
out_ptr_t advance(const in_ptr_t &ptr, size_t count) {
return reinterpret_cast<out_ptr_t>(reinterpret_cast<count_ptr_t>(ptr) + count);
}
LowLatencyLayout(void *rdma_buffer, int num_max_dispatch_tokens_per_rank, int hidden,
int num_ranks, int num_experts) {
const int num_scales = hidden / 128;
// Dispatch and combine layout:
// - 2 symmetric odd/even send buffer
// - 2 symmetric odd/even receive buffers
// - 2 symmetric odd/even signaling buffers
// Message sizes
// NOTES: you should add a control `int4` for combine messages if you want to do data
// transformation
EP_HOST_ASSERT(num_scales * sizeof(float) <= static_cast<size_t>(hidden));
size_t num_bytes_per_dispatch_msg =
sizeof(int4) +
std::max(hidden * sizeof(hip_bfloat16), hidden + num_scales * sizeof(float));
size_t num_bytes_per_combine_msg = hidden * sizeof(hip_bfloat16);
// Send buffer
size_t dispatch_send_buffer_bytes =
num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
size_t combine_send_buffer_bytes =
num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
size_t send_buffer_bytes = std::max(dispatch_send_buffer_bytes, combine_send_buffer_bytes);
EP_HOST_ASSERT(send_buffer_bytes % sizeof(int4) == 0);
total_bytes += send_buffer_bytes * 2;
// Symmetric receive buffers
// TODO: optimize memory usages
size_t dispatch_recv_data_buffer_bytes =
num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg;
size_t combine_recv_buffer_bytes =
num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg;
size_t recv_buffer_bytes =
std::max(dispatch_recv_data_buffer_bytes, combine_recv_buffer_bytes);
EP_HOST_ASSERT(recv_buffer_bytes % sizeof(int4) == 0);
total_bytes += recv_buffer_bytes * 2;
// Symmetric signaling buffers
size_t dispatch_recv_count_buffer_bytes = num_experts * sizeof(int);
size_t combine_recv_flag_buffer_bytes = dispatch_recv_count_buffer_bytes;
size_t signaling_buffer_bytes =
std::max(dispatch_recv_count_buffer_bytes, combine_recv_flag_buffer_bytes);
size_t signaling_buffer_bytes_aligned = ALIGN<size_t>(signaling_buffer_bytes, 128);
total_bytes += signaling_buffer_bytes_aligned * 2;
// Assign pointers
// NOTES: we still leave some space for distinguishing dispatch/combine buffer,
// so you may see some parameters are duplicated
for (int i = 0; i < 2; ++i) {
buffers[i] = {
static_cast<int>(signaling_buffer_bytes / sizeof(int)),
advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),
advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * 2 +
recv_buffer_bytes * i),
advance<int *>(rdma_buffer, signaling_buffer_bytes_aligned * i),
advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),
advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * 2 +
recv_buffer_bytes * i),
advance<int *>(rdma_buffer, signaling_buffer_bytes_aligned * i),
advance(rdma_buffer, signaling_buffer_bytes_aligned * 2 + send_buffer_bytes * i),
num_bytes_per_combine_msg};
}
}
};
inline size_t get_low_latency_rdma_size_hint(int num_max_dispatch_tokens_per_rank, int hidden,
int num_ranks, int num_experts) {
auto num_bytes =
LowLatencyLayout(nullptr, num_max_dispatch_tokens_per_rank, hidden, num_ranks, num_experts)
.total_bytes;
return ((num_bytes + NUM_BUFFER_ALIGNMENT_BYTES) / NUM_BUFFER_ALIGNMENT_BYTES) *
NUM_BUFFER_ALIGNMENT_BYTES;
}
} // namespace deep_ep
This diff is collapsed.
// #include <ATen/dtk_macros.h>
#include <ATen/hip/HIPContext.h> #include <ATen/hip/HIPContext.h>
#include <ATen/hip/HIPDataType.h> #include <ATen/hip/HIPDataType.h>
#include <chrono> #include <chrono>
...@@ -5,8 +6,8 @@ ...@@ -5,8 +6,8 @@
#include <pybind11/functional.h> #include <pybind11/functional.h>
#include <torch/python.h> #include <torch/python.h>
#include "./kernels/api.cuh" #include "kernels/api.cuh"
#include "./kernels/configs.cuh" #include "kernels/configs.cuh"
#include "deep_ep.hpp" #include "deep_ep.hpp"
namespace deep_ep { namespace deep_ep {
...@@ -40,8 +41,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_ ...@@ -40,8 +41,8 @@ Buffer::Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_
// Get ranks // Get ranks
CUDA_CHECK(hipGetDevice(&device_id)); CUDA_CHECK(hipGetDevice(&device_id));
rdma_rank = rank / NUM_MAX_NVL_PEERS, nvl_rank = rank % NUM_MAX_NVL_PEERS; rdma_rank = rank / NUM_MAX_NVL_PEERS, nvl_rank = rank % NUM_MAX_NVL_PEERS;
num_rdma_ranks = std::max(1, num_ranks / NUM_MAX_NVL_PEERS), num_rdma_ranks = ::max(1, num_ranks / NUM_MAX_NVL_PEERS),
num_nvl_ranks = std::min(num_ranks, NUM_MAX_NVL_PEERS); num_nvl_ranks = ::min(num_ranks, NUM_MAX_NVL_PEERS);
#ifdef DISABLE_ROCSHMEM #ifdef DISABLE_ROCSHMEM
EP_HOST_ASSERT(num_rdma_ranks == 1 and not low_latency_mode and EP_HOST_ASSERT(num_rdma_ranks == 1 and not low_latency_mode and
...@@ -803,8 +804,8 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te ...@@ -803,8 +804,8 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
// here. // here.
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
const int num_channels = config.num_sms / 2; const int num_channels = config.num_sms / 3;
EP_HOST_ASSERT(config.num_sms % 2 == 0); EP_HOST_ASSERT(config.num_sms % 3 == 0);
EP_HOST_ASSERT(0 < get_num_rdma_ranks() and get_num_rdma_ranks() <= NUM_MAX_RDMA_PEERS); EP_HOST_ASSERT(0 < get_num_rdma_ranks() and get_num_rdma_ranks() <= NUM_MAX_RDMA_PEERS);
bool cached_mode = cached_rdma_channel_prefix_matrix.has_value(); bool cached_mode = cached_rdma_channel_prefix_matrix.has_value();
...@@ -901,10 +902,10 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te ...@@ -901,10 +902,10 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
// Allocate all tensors on comm stream if set // Allocate all tensors on comm stream if set
// NOTES: do not allocate tensors upfront! // NOTES: do not allocate tensors upfront!
auto compute_stream = at::cuda::getCurrentCUDAStream(); auto compute_stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (allocate_on_comm_stream) { if (allocate_on_comm_stream) {
EP_HOST_ASSERT(previous_event.has_value() and async); EP_HOST_ASSERT(previous_event.has_value() and async);
at::cuda::setCurrentCUDAStream(comm_stream); at::hip::setCurrentHIPStreamMasqueradingAsCUDA(comm_stream);
} }
// Wait previous tasks to be finished // Wait previous tasks to be finished
...@@ -1088,7 +1089,7 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te ...@@ -1088,7 +1089,7 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
// Switch back compute stream // Switch back compute stream
if (allocate_on_comm_stream) if (allocate_on_comm_stream)
at::cuda::setCurrentCUDAStream(compute_stream); at::hip::setCurrentHIPStreamMasqueradingAsCUDA(compute_stream);
// Return values // Return values
return {recv_x, return {recv_x,
...@@ -1124,8 +1125,8 @@ Buffer::internode_combine( ...@@ -1124,8 +1125,8 @@ Buffer::internode_combine(
const torch::Tensor &combined_nvl_head, const Config &config, const torch::Tensor &combined_nvl_head, const Config &config,
std::optional<EventHandle> &previous_event, bool async, bool allocate_on_comm_stream) { std::optional<EventHandle> &previous_event, bool async, bool allocate_on_comm_stream) {
#ifndef DISABLE_ROCSHMEM #ifndef DISABLE_ROCSHMEM
const int num_channels = config.num_sms / 2; const int num_channels = config.num_sms / 3;
EP_HOST_ASSERT(config.num_sms % 2 == 0); EP_HOST_ASSERT(config.num_sms % 3 == 0);
// Shape and contiguous checks // Shape and contiguous checks
EP_HOST_ASSERT(x.dim() == 2 and x.is_contiguous()); EP_HOST_ASSERT(x.dim() == 2 and x.is_contiguous());
...@@ -1167,10 +1168,10 @@ Buffer::internode_combine( ...@@ -1167,10 +1168,10 @@ Buffer::internode_combine(
// Allocate all tensors on comm stream if set // Allocate all tensors on comm stream if set
// NOTES: do not allocate tensors upfront! // NOTES: do not allocate tensors upfront!
auto compute_stream = at::cuda::getCurrentCUDAStream(); auto compute_stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
if (allocate_on_comm_stream) { if (allocate_on_comm_stream) {
EP_HOST_ASSERT(previous_event.has_value() and async); EP_HOST_ASSERT(previous_event.has_value() and async);
at::cuda::setCurrentCUDAStream(comm_stream); at::hip::setCurrentHIPStreamMasqueradingAsCUDA(comm_stream);
} }
// Wait previous tasks to be finished // Wait previous tasks to be finished
...@@ -1216,7 +1217,7 @@ Buffer::internode_combine( ...@@ -1216,7 +1217,7 @@ Buffer::internode_combine(
void *bias_ptrs[2] = {nullptr, nullptr}; void *bias_ptrs[2] = {nullptr, nullptr};
for (int i = 0; i < 2; ++i) for (int i = 0; i < 2; ++i)
if (bias_opts[i].has_value()) { if (bias_opts[i].has_value()) {
// EP_HOST_ASSERT(false and "bias is not supported in internode combine"); EP_HOST_ASSERT(false and "bias is not supported in internode combine");
auto bias = bias_opts[i].value(); auto bias = bias_opts[i].value();
EP_HOST_ASSERT(bias.dim() == 2 and bias.is_contiguous()); EP_HOST_ASSERT(bias.dim() == 2 and bias.is_contiguous());
EP_HOST_ASSERT(bias.scalar_type() == x.scalar_type()); EP_HOST_ASSERT(bias.scalar_type() == x.scalar_type());
...@@ -1260,7 +1261,7 @@ Buffer::internode_combine( ...@@ -1260,7 +1261,7 @@ Buffer::internode_combine(
// Switch back compute stream // Switch back compute stream
if (allocate_on_comm_stream) if (allocate_on_comm_stream)
at::cuda::setCurrentCUDAStream(compute_stream); at::hip::setCurrentHIPStreamMasqueradingAsCUDA(compute_stream);
// Return values // Return values
return {combined_x, combined_topk_weights, event}; return {combined_x, combined_topk_weights, event};
......
This diff is collapsed.
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
#include <tuple> #include <tuple>
#include <vector> #include <vector>
#include "./kernels/configs.cuh" #include "kernels/configs.cuh"
#include "kernels/exception.cuh" #include "kernels/exception.cuh"
#include "config.hpp" #include "config.hpp"
#include "event.hpp" #include "event.hpp"
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <torch/types.h>
#include <tuple>
#include <vector>
#include "kernels/configs.cuh"
#include "kernels/exception.cuh"
#include "config_hip.hpp"
#include "event.hpp"
namespace deep_ep {
struct Buffer {
EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS == 8, "The number of maximum NVLink peers must be 8");
private:
// Low-latency mode buffer
int low_latency_buffer_idx = 0;
bool low_latency_mode = false;
// NVLink Buffer
int64_t num_nvl_bytes;
void *buffer_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
void **buffer_ptrs_gpu = nullptr;
// NVSHMEM Buffer
int64_t num_rdma_bytes;
void *rdma_buffer_ptr = nullptr;
// Device info and communication
int device_id;
int num_device_sms;
int rank, rdma_rank, nvl_rank;
int num_ranks, num_rdma_ranks, num_nvl_ranks;
hipIpcMemHandle_t ipc_handles[NUM_MAX_NVL_PEERS];
// Stream for communication
at::hip::HIPStreamMasqueradingAsCUDA comm_stream;
// After IPC/NVSHMEM synchronization, this flag will be true
bool available = false;
// Whether explicit `destroy()` is required.
bool explicitly_destroy;
// After `destroy()` be called, this flag will be true
bool destroyed = false;
// Barrier signals
int *barrier_signal_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
int **barrier_signal_ptrs_gpu = nullptr;
// Workspace
void *workspace = nullptr;
// Host-side MoE info
volatile int *moe_recv_counter = nullptr;
int *moe_recv_counter_mapped = nullptr;
// Host-side expert-level MoE info
volatile int *moe_recv_expert_counter = nullptr;
int *moe_recv_expert_counter_mapped = nullptr;
// Host-side RDMA-level MoE info
volatile int *moe_recv_rdma_counter = nullptr;
int *moe_recv_rdma_counter_mapped = nullptr;
bool use_default_stream_as_comm_stream = false;
public:
Buffer(int rank, int num_ranks, int64_t num_nvl_bytes, int64_t num_rdma_bytes,
bool low_latency_mode, bool explicitly_destroy, bool use_default_stream_as_comm_stream);
~Buffer() noexcept(false);
bool is_available() const;
bool is_internode_available() const;
int get_num_rdma_ranks() const;
int get_rdma_rank() const;
int get_root_rdma_rank(bool global) const;
int get_local_device_id() const;
pybind11::bytearray get_local_ipc_handle() const;
pybind11::bytearray get_local_nvshmem_unique_id() const;
torch::Tensor get_local_buffer_tensor(const pybind11::object &dtype, int64_t offset,
bool use_rdma_buffer) const;
torch::Stream get_comm_stream() const;
void sync(const std::vector<int> &device_ids,
const std::vector<std::optional<pybind11::bytearray>> &all_gathered_handles,
const std::optional<pybind11::bytearray> &root_unique_id_opt);
void destroy();
std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor,
std::optional<EventHandle>>
get_dispatch_layout(const torch::Tensor &topk_idx, int num_experts,
std::optional<EventHandle> &previous_event, bool async,
bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>,
std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor,
torch::Tensor, torch::Tensor, torch::Tensor, torch::Tensor,
std::optional<EventHandle>>
intranode_dispatch(const torch::Tensor &x, const std::optional<torch::Tensor> &x_scales,
const std::optional<torch::Tensor> &topk_idx,
const std::optional<torch::Tensor> &topk_weights,
const std::optional<torch::Tensor> &num_tokens_per_rank,
const torch::Tensor &is_token_in_rank,
const std::optional<torch::Tensor> &num_tokens_per_expert,
int cached_num_recv_tokens,
const std::optional<torch::Tensor> &cached_rank_prefix_matrix,
const std::optional<torch::Tensor> &cached_channel_prefix_matrix,
int expert_alignment, int num_worst_tokens, const Config &config,
std::optional<EventHandle> &previous_event, bool async,
bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
intranode_combine(const torch::Tensor &x, const std::optional<torch::Tensor> &topk_weights,
const std::optional<torch::Tensor> &bias_0,
const std::optional<torch::Tensor> &bias_1, const torch::Tensor &src_idx,
const torch::Tensor &rank_prefix_matrix,
const torch::Tensor &channel_prefix_matrix, const torch::Tensor &send_head,
const Config &config, std::optional<EventHandle> &previous_event, bool async,
bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>,
std::optional<torch::Tensor>, std::vector<int>, torch::Tensor, torch::Tensor,
std::optional<torch::Tensor>, torch::Tensor, std::optional<torch::Tensor>,
torch::Tensor, std::optional<torch::Tensor>, std::optional<torch::Tensor>,
std::optional<torch::Tensor>, std::optional<EventHandle>>
internode_dispatch(const torch::Tensor &x, const std::optional<torch::Tensor> &x_scales,
const std::optional<torch::Tensor> &topk_idx,
const std::optional<torch::Tensor> &topk_weights,
const std::optional<torch::Tensor> &num_tokens_per_rank,
const std::optional<torch::Tensor> &num_tokens_per_rdma_rank,
const torch::Tensor &is_token_in_rank,
const std::optional<torch::Tensor> &num_tokens_per_expert,
int cached_num_recv_tokens, int cached_num_rdma_recv_tokens,
const std::optional<torch::Tensor> &cached_rdma_channel_prefix_matrix,
const std::optional<torch::Tensor> &cached_recv_rdma_rank_prefix_sum,
const std::optional<torch::Tensor> &cached_gbl_channel_prefix_matrix,
const std::optional<torch::Tensor> &cached_recv_gbl_rank_prefix_sum,
int expert_alignment, const Config &config,
std::optional<EventHandle> &previous_event, bool async,
bool allocate_on_comm_stream);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, std::optional<EventHandle>>
internode_combine(
const torch::Tensor &x, const std::optional<torch::Tensor> &topk_weights,
const std::optional<torch::Tensor> &bias_0, const std::optional<torch::Tensor> &bias_1,
const torch::Tensor &src_meta, const torch::Tensor &is_combined_token_in_rank,
const torch::Tensor &rdma_channel_prefix_matrix, const torch::Tensor &rdma_rank_prefix_sum,
const torch::Tensor &gbl_channel_prefix_matrix, const torch::Tensor &combined_rdma_head,
const torch::Tensor &combined_nvl_head, const Config &config,
std::optional<EventHandle> &previous_event, bool async, bool allocate_on_comm_stream);
void clean_low_latency_buffer(int num_max_dispatch_tokens_per_rank, int hidden,
int num_experts);
std::tuple<torch::Tensor, std::optional<torch::Tensor>, torch::Tensor, torch::Tensor,
torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_dispatch(const torch::Tensor &x, const torch::Tensor &topk_idx,
const std::optional<torch::Tensor> &cumulative_local_expert_recv_stats,
const std::optional<torch::Tensor> &dispatch_wait_recv_cost_stats,
int num_max_dispatch_tokens_per_rank, int num_experts, bool use_fp8,
bool round_scale, bool use_ue8m0, bool async, bool return_recv_hook);
std::tuple<torch::Tensor, std::optional<EventHandle>, std::optional<std::function<void()>>>
low_latency_combine(const torch::Tensor &x, const torch::Tensor &topk_idx,
const torch::Tensor &topk_weights, const torch::Tensor &src_info,
const torch::Tensor &layout_range,
const std::optional<torch::Tensor> &combine_wait_recv_cost_stats,
int num_max_dispatch_tokens_per_rank, int num_experts, bool use_logfmt,
bool zero_copy, bool async, bool return_recv_hook,
const std::optional<torch::Tensor> &out = std::nullopt);
torch::Tensor get_next_low_latency_combine_buffer(int num_max_dispatch_tokens_per_rank,
int hidden, int num_experts) const;
};
} // namespace deep_ep
This diff is collapsed.
This diff is collapsed.
...@@ -393,9 +393,9 @@ __global__ void __launch_bounds__(kNumThreads, 1) ...@@ -393,9 +393,9 @@ __global__ void __launch_bounds__(kNumThreads, 1)
total_offset; total_offset;
num_tokens_to_recv -= total_offset; num_tokens_to_recv -= total_offset;
} }
total_offset = __shfl_sync(kFullWarpMask, total_offset, 0); total_offset = shfl_sync(total_offset, 0);
total_offset += rank_offset; total_offset += rank_offset;
num_tokens_to_recv = __shfl_sync(kFullWarpMask, num_tokens_to_recv, 0); num_tokens_to_recv = shfl_sync(num_tokens_to_recv, 0);
// Shared tail indices for different warps // Shared tail indices for different warps
__shared__ volatile int shared_channel_tail_idx[kNumRanks]; __shared__ volatile int shared_channel_tail_idx[kNumRanks];
...@@ -583,7 +583,7 @@ __global__ void cached_notify_combine(void **buffer_ptrs, int *send_head, int nu ...@@ -583,7 +583,7 @@ __global__ void cached_notify_combine(void **buffer_ptrs, int *send_head, int nu
? __ldg(send_head + token_idx * kNumRanks + rank_id) ? __ldg(send_head + token_idx * kNumRanks + rank_id)
: -1; : -1;
for (int i = 0; i < min(kWarpSize, token_idx_tail - token_start_idx + 1); ++i) { for (int i = 0; i < min(kWarpSize, token_idx_tail - token_start_idx + 1); ++i) {
const int head = __shfl_sync(kFullWarpMask, current_head, i); const int head = shfl_sync(current_head, i);
if (head < 0) { if (head < 0) {
if (lane_id == i) if (lane_id == i)
expected_head = -last_head - 1; expected_head = -last_head - 1;
...@@ -606,7 +606,7 @@ void cached_notify_combine(void **buffer_ptrs, int *send_head, int num_channels, ...@@ -606,7 +606,7 @@ void cached_notify_combine(void **buffer_ptrs, int *send_head, int num_channels,
barrier_signal_ptrs, rank); \ barrier_signal_ptrs, rank); \
break break
const int num_threads = std::max(128, kWarpSize * num_ranks); const int num_threads = ::max(128, kWarpSize * num_ranks);
EP_HOST_ASSERT(num_ranks <= num_threads); EP_HOST_ASSERT(num_ranks <= num_threads);
EP_HOST_ASSERT(num_threads <= 1024); EP_HOST_ASSERT(num_threads <= 1024);
EP_HOST_ASSERT(1 + num_channels <= num_channels * 2); EP_HOST_ASSERT(1 + num_channels <= num_channels * 2);
......
This diff is collapsed.
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "configs.cuh" #include "configs.cuh"
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#pragma once
#include "configs.cuh"
#include "exception.cuh"
// ROCm helper functions and structures
namespace rocm::experimental {
typedef struct {
dim3 num_sms;
dim3 num_threads;
unsigned int shared_mem_bytes;
hipStream_t stream;
} hipLaunchConfig_t;
// Compile time void** kernelArgs array fill with variadic arguments
template <typename T> void fill_kernel_args(void **f, size_t idx, T &&arg) {
f[idx] = static_cast<void *>(std::addressof(arg));
}
template <typename Head, typename... Tail>
void fill_kernel_args(void **f, size_t idx, Head &&head, Tail &&...tail) {
f[idx] = static_cast<void *>(std::addressof(head));
fill_kernel_args(f, idx + 1, std::forward<Tail>(tail)...);
}
} // namespace rocm::experimental
#ifndef SETUP_LAUNCH_CONFIG
// The code below is a workaround for ROCm. All the proposed overhead
// is to match current macro signatures and should be reworked once
// cudaLaunchKernelExt() hip alternative is live.
#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream) \
rocm::experimental::hipLaunchConfig_t cfg = {(num_sms), (num_threads), 0, stream};
#endif // #ifndef SETUP_LAUNCH_CONFIG
#ifndef LAUNCH_KERNEL
template <typename T, typename Kern, typename... Args>
inline void LAUNCH_KERNEL(T &&config, Kern &&kernel, Args &&...args) {
constexpr size_t k_num_kernel_args = sizeof...(args);
void *kernel_args[k_num_kernel_args];
rocm::experimental::fill_kernel_args(kernel_args, 0, std::forward<Args>(args)...);
CUDA_CHECK(hipLaunchCooperativeKernel(std::forward<Kern>(kernel), config->num_sms,
config->num_threads, kernel_args,
config->shared_mem_bytes, config->stream));
}
template <typename T, typename Kern, typename... Args>
inline void LAUNCH_KERNEL_NON_COOPERATIVE(T &&config, Kern &&kernel, Args &&...args) {
hipLaunchKernelGGL((*kernel), dim3(config->num_sms), dim3(config->num_threads), config->shared_mem_bytes, config->stream,
std::forward<Args>(args)...);
}
#endif // #ifndef LAUNCH_KERNEL
#define SWITCH_RANKS(case_macro) \
switch (num_ranks) { \
case 2: \
case_macro(2); \
case 4: \
case_macro(4); \
case 8: \
case_macro(8); \
default: \
EP_HOST_ASSERT(false and "Unsupported ranks"); \
} \
while (false)
#define SWITCH_RDMA_RANKS(case_macro) \
switch (num_ranks / NUM_MAX_NVL_PEERS) { \
case 2: \
case_macro(2); \
case 3: \
case_macro(3); \
case 4: \
case_macro(4); \
case 8: \
case_macro(8); \
case 16: \
case_macro(16); \
case 18: \
case_macro(18); \
case 20: \
case_macro(20); \
default: \
EP_HOST_ASSERT(false and "Unsupported RDMA ranks"); \
} \
while (false)
#define SWITCH_RANKS_WITH_DTYPE(dtype, case_macro) \
switch (num_ranks) { \
case 2: \
case_macro(dtype, 2); \
case 4: \
case_macro(dtype, 4); \
case 8: \
case_macro(dtype, 8); \
default: \
EP_HOST_ASSERT(false and "Unsupported ranks"); \
} \
while (false)
#define SWITCH_TYPES(case_macro) \
switch (type) { \
case HIP_R_16BF: \
case_macro(hip_bfloat16); \
case HIP_R_32F: \
case_macro(float); \
default: \
EP_HOST_ASSERT(false and "Unsupported type"); \
} \
while (false)
#define SWITCH_HIDDEN(case_macro) \
switch (hidden) { \
case 2560: \
case_macro(2560); \
case 5120: \
case_macro(5120); \
case 4096: \
case_macro(4096); \
case 7168: \
case_macro(7168); \
default: \
EP_HOST_ASSERT(false and "Unsupported hidden"); \
} \
while (false)
#include "hip/hip_runtime.h"
#include <cstring> #include <cstring>
#include "configs.cuh" #include "configs.cuh"
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#include <cstring>
#include "configs.cuh"
#include "exception.cuh"
#include "launch_hip.cuh"
#include "utils_hip.cuh"
#ifndef DISABLE_ROCSHMEM
#include <rocshmem/rocshmem.hpp>
#endif
namespace deep_ep {
namespace intranode {
template <int kNumRanks>
__global__ void barrier(int **barrier_signal_ptrs, int rank) {
barrier_block<kNumRanks>(barrier_signal_ptrs, rank);
}
void barrier(int **barrier_signal_ptrs, int rank, int num_ranks, hipStream_t stream) {
#define BARRIER_LAUNCH_CASE(ranks) \
LAUNCH_KERNEL(&cfg, barrier<ranks>, barrier_signal_ptrs, rank); \
break
SETUP_LAUNCH_CONFIG(1, kWarpSize, stream);
SWITCH_RANKS(BARRIER_LAUNCH_CASE);
#undef BARRIER_LAUNCH_CASE
}
} // namespace intranode
namespace internode {
#ifndef DISABLE_ROCSHMEM
rocshmem::rocshmem_team_t cpu_rdma_team = rocshmem::ROCSHMEM_TEAM_INVALID;
rocshmem::rocshmem_team_config_t cpu_rdma_team_config;
std::vector<uint8_t> get_unique_id() {
rocshmem::rocshmem_uniqueid_t unique_id;
rocshmem::rocshmem_get_uniqueid(&unique_id);
std::vector<uint8_t> result(sizeof(rocshmem::rocshmem_uniqueid_t));
std::memcpy(result.data(), &unique_id, sizeof(rocshmem::rocshmem_uniqueid_t));
return result;
}
int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks,
bool low_latency_mode) {
rocshmem::rocshmem_uniqueid_t root_unique_id;
rocshmem::rocshmem_init_attr_t attr;
std::memcpy(&root_unique_id, root_unique_id_val.data(), sizeof(rocshmem::rocshmem_uniqueid_t));
rocshmem::rocshmem_set_attr_uniqueid_args(rank, num_ranks, &root_unique_id, &attr);
rocshmem::rocshmem_init_attr(rocshmem::ROCSHMEM_INIT_WITH_UNIQUEID, &attr);
// Create sub-RDMA teams
// NOTES: if `num_ranks <= NUM_MAX_NVL_PEERS` then only low-latency kernels are used
if (low_latency_mode and num_ranks > NUM_MAX_NVL_PEERS) {
EP_HOST_ASSERT(cpu_rdma_team == rocshmem::ROCSHMEM_TEAM_INVALID);
EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(rocshmem::rocshmem_team_split_strided(
rocshmem::ROCSHMEM_TEAM_WORLD, rank % NUM_MAX_NVL_PEERS,
NUM_MAX_NVL_PEERS, num_ranks / NUM_MAX_NVL_PEERS,
&cpu_rdma_team_config, 0, &cpu_rdma_team) == 0);
EP_HOST_ASSERT(cpu_rdma_team != rocshmem::ROCSHMEM_TEAM_INVALID);
}
rocshmem::rocshmem_barrier_all();
return rocshmem::rocshmem_my_pe();
}
void *alloc(size_t size, size_t alignment) {
auto alloc_size = ALIGN(size, alignment);
return rocshmem::rocshmem_malloc(alloc_size);
}
void free(void *ptr) {
rocshmem::rocshmem_free(ptr);
}
void barrier() {
rocshmem::rocshmem_barrier_all();
}
void finalize() {
if (cpu_rdma_team != rocshmem::ROCSHMEM_TEAM_INVALID) {
rocshmem::rocshmem_team_destroy(cpu_rdma_team);
cpu_rdma_team = rocshmem::ROCSHMEM_TEAM_INVALID;
}
rocshmem::rocshmem_finalize();
}
#endif
} // namespace internode
} // namespace deep_ep
#include "hip/hip_runtime.h"
#pragma once #pragma once
#include "configs.cuh" #include "configs.cuh"
#include "exception.cuh" #include "exception.cuh"
...@@ -194,8 +195,7 @@ __device__ __forceinline__ int64_t ld_volatile_global(const volatile uint64_t *p ...@@ -194,8 +195,7 @@ __device__ __forceinline__ int64_t ld_volatile_global(const volatile uint64_t *p
return ret; return ret;
} }
template <typename dtype_t> template <typename dtype_t> __device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
__device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
using T = typename VecInt<sizeof(dtype_t)>::vec_t; using T = typename VecInt<sizeof(dtype_t)>::vec_t;
auto ret = __builtin_nontemporal_load(reinterpret_cast<const T *>(ptr)); auto ret = __builtin_nontemporal_load(reinterpret_cast<const T *>(ptr));
return *reinterpret_cast<dtype_t *>(&ret); return *reinterpret_cast<dtype_t *>(&ret);
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
#pragma once
#include "configs.cuh"
#include "exception.cuh"
#define UNROLLED_WARP_COPY(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC) \
{ \
constexpr int kLoopStride = kWarpSize * (UNROLL_FACTOR); \
typename std::remove_reference<decltype(LD_FUNC((SRC) + 0))>::type \
unrolled_values[(UNROLL_FACTOR)]; \
auto __src = (SRC); \
auto __dst = (DST); \
for (int __i = (LANE_ID); __i < ((N) / kLoopStride) * kLoopStride; __i += kLoopStride) { \
_Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) \
unrolled_values[__j] = LD_FUNC(__src + __i + __j * kWarpSize); \
_Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) \
ST_FUNC(__dst + __i + __j * kWarpSize, unrolled_values[__j]); \
} \
{ \
int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); \
_Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) { \
if (__i + __j * kWarpSize < (N)) { \
unrolled_values[__j] = LD_FUNC(__src + __i + __j * kWarpSize); \
} \
} \
_Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) { \
if (__i + __j * kWarpSize < (N)) { \
ST_FUNC(__dst + __i + __j * kWarpSize, unrolled_values[__j]); \
} \
} \
} \
}
#define UNROLLED_WARP_COPY_EMULATED(UNROLL_FACTOR, LANE_ID, N, DST, SRC, LD_FUNC, ST_FUNC) \
{ \
constexpr int kLoopStride = kEmulatedWarpSize * (UNROLL_FACTOR); \
typename std::remove_reference<decltype(LD_FUNC((SRC) + 0))>::type \
unrolled_values[(UNROLL_FACTOR)]; \
auto __src = (SRC); \
auto __dst = (DST); \
for (int __i = (LANE_ID); __i < ((N) / kLoopStride) * kLoopStride; __i += kLoopStride) { \
_Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) \
unrolled_values[__j] = LD_FUNC(__src + __i + __j * kEmulatedWarpSize); \
_Pragma("unroll") for (int __j = 0; __j < (UNROLL_FACTOR); ++__j) \
ST_FUNC(__dst + __i + __j * kEmulatedWarpSize, unrolled_values[__j]); \
} \
for (int __i = ((N) / kLoopStride) * kLoopStride + (LANE_ID); __i < (N); \
__i += kEmulatedWarpSize) \
ST_FUNC(__dst + __i, LD_FUNC(__src + __i)); \
}
// HELPER FUNCTIONS
// #####################################################################################
template <typename T>
__device__ __forceinline__ T shfl_xor(const T val, int laneMask, int width = kWarpSize,
uint64_t shfl_sync_mask = kFullWarpMask) {
return __shfl_xor(val, laneMask, width);
}
__device__ __forceinline__ int
shfl_sync(const int val, int srcLane = 0, int width = kWarpSize,
uint64_t shfl_sync_mask = kFullWarpMask) { // Let compiler deduce type
return __shfl(val, srcLane, width);
}
__device__ __forceinline__ int __any_sync(uint64_t mask, int predicate) {
uint64_t predicate_bit_pattern = __ballot(predicate);
return (predicate_bit_pattern & mask) > 0;
}
__device__ __forceinline__ int __all_sync(uint64_t mask, int predicate) {
uint64_t predicate_bit_pattern = __ballot(predicate);
return (~predicate_bit_pattern & mask) == 0;
}
__device__ __forceinline__ void syncwarp() {
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "wavefront");
__builtin_amdgcn_wave_barrier();
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "wavefront");
}
// ######################################################################################################
namespace deep_ep {
template <int kBytes> struct VecInt {};
template <> struct VecInt<1> {
using vec_t = int8_t;
};
template <> struct VecInt<2> {
using vec_t = int16_t;
};
template <> struct VecInt<4> {
using vec_t = int;
};
template <> struct VecInt<8> {
using vec_t = int64_t;
};
template <> struct VecInt<16> {
using native_int4 = int __attribute__((ext_vector_type(4)));
using vec_t = native_int4;
};
__device__ __forceinline__ void trap() {
abort();
}
__device__ __forceinline__ void memory_fence() {
__threadfence_system();
}
__device__ __forceinline__ void memory_fence_gpu() {
__threadfence();
}
__device__ __forceinline__ void memory_fence_cta() {
__threadfence_block();
}
__device__ __forceinline__ void st_relaxed_sys_global(int *ptr, int val) {
__builtin_nontemporal_store(val, ptr);
}
__device__ __forceinline__ void st_release_sys_global(const int *ptr, int val) {
__hip_atomic_store(const_cast<int *>(ptr), val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_SYSTEM);
}
__device__ __forceinline__ void st_release_cta(const int *ptr, int val) {
__hip_atomic_store(const_cast<int *>(ptr), val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_WORKGROUP);
}
__device__ __forceinline__ int ld_relaxed_sys_global(const int *ptr) {
int res = __builtin_nontemporal_load(ptr);
return res;
}
__device__ __forceinline__ int ld_relaxed_sys_global(const uint64_t *ptr) {
uint64_t ret;
ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
return ret;
}
__device__ __forceinline__ int ld_acquire_sys_global(const int *ptr) {
int ret;
ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SYSTEM);
return ret;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(const uint64_t *ptr) {
uint64_t ret;
ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_SYSTEM);
return ret;
}
__device__ __forceinline__ int ld_acquire_global(const int *ptr) {
int ret;
ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_AGENT);
return ret;
}
__device__ __forceinline__ int atomic_add_release_global(const int *ptr, int value) {
int ret;
// ret = __hip_atomic_fetch_add(const_cast<int *>(ptr), value, __ATOMIC_RELEASE,
// __HIP_MEMORY_SCOPE_AGENT);
ret = atomicAdd((int*)ptr, value);
return ret;
}
__device__ __forceinline__ int ld_acquire_cta(const int *ptr) {
int ret;
ret = __hip_atomic_load(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP);
return ret;
}
__device__ __forceinline__ int ld_volatile_global(const volatile int *ptr) {
int ret;
ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
return ret;
}
__device__ __forceinline__ float ld_volatile_global(const volatile float *ptr) {
float ret;
ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
return ret;
}
__device__ __forceinline__ int64_t ld_volatile_global(const volatile int64_t *ptr) {
int64_t ret;
ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
return ret;
}
__device__ __forceinline__ int64_t ld_volatile_global(const volatile uint64_t *ptr) {
int64_t ret;
ret = __hip_atomic_load(ptr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM);
return ret;
}
template <typename dtype_t> __device__ __forceinline__ dtype_t ld_nc_global(const dtype_t *ptr) {
using T = typename VecInt<sizeof(dtype_t)>::vec_t;
auto ret = __builtin_nontemporal_load(reinterpret_cast<const T *>(ptr));
return *reinterpret_cast<dtype_t *>(&ret);
}
////////////////// used in ibgda
__device__ __forceinline__ void st_na_relaxed(const uint8_t *ptr, uint8_t val) {
uint8_t *non_const_ptr = const_cast<uint8_t *>(ptr);
__hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ __forceinline__ void st_na_relaxed(const uint16_t *ptr, uint16_t val) {
uint16_t *non_const_ptr = const_cast<uint16_t *>(ptr);
__hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ __forceinline__ void st_na_relaxed(const uint32_t *ptr, uint32_t val) {
uint32_t *non_const_ptr = const_cast<uint32_t *>(ptr);
__hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ __forceinline__ void st_na_relaxed(const int *ptr, int val) {
int *non_const_ptr = const_cast<int *>(ptr);
__hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ __forceinline__ void st_na_relaxed(const int4 *ptr, int4 val) {
int4 *non_const_ptr = const_cast<int4 *>(ptr);
non_const_ptr->x = val.x;
non_const_ptr->y = val.y;
non_const_ptr->z = val.z;
non_const_ptr->w = val.w;
}
__device__ __forceinline__ void st_na_release(const int *ptr, int val) {
int *non_const_ptr = const_cast<int *>(ptr);
__hip_atomic_store(non_const_ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ __forceinline__ void st_na_release(const uint32_t *ptr, uint32_t val) {
uint32_t *non_const_ptr = const_cast<uint32_t *>(ptr);
__hip_atomic_store(non_const_ptr, val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
}
__device__ __forceinline__ void st_na_release(const uint64_t *ptr, uint64_t val) {
uint64_t *non_const_ptr = const_cast<uint64_t *>(ptr);
__hip_atomic_store(non_const_ptr, val, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_AGENT);
}
// TODO:: apply "st.global.L1::no_allocate" in ROCM
template <typename dtype_t>
__device__ __forceinline__ void st_na_global(const dtype_t *ptr, const dtype_t &value) {
st_na_global(reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t *>(ptr),
*reinterpret_cast<const typename VecInt<sizeof(dtype_t)>::vec_t *>(&value));
}
template <> __device__ __forceinline__ void st_na_global(const int *ptr, const int &value) {
int *non_const_ptr = const_cast<int *>(ptr);
*non_const_ptr = value;
}
template <> __device__ __forceinline__ void st_na_global(const int64_t *ptr, const int64_t &value) {
int64_t *non_const_ptr = const_cast<int64_t *>(ptr);
*non_const_ptr = value;
}
template <> __device__ __forceinline__ void st_na_global(const float *ptr, const float &value) {
float *non_const_ptr = const_cast<float *>(ptr);
*non_const_ptr = value;
}
template <> __device__ __forceinline__ void st_na_global(const int4 *ptr, const int4 &value) {
int4 *non_const_ptr = const_cast<int4 *>(ptr);
*non_const_ptr = value;
}
__forceinline__ __device__ void get_channel_task_range(int num_tokens, int num_sms, int sm_id,
int &token_start_idx, int &token_end_idx) {
int num_tokens_per_sm = DIVUP(num_tokens, num_sms);
token_start_idx = min(num_tokens_per_sm * sm_id, num_tokens);
token_end_idx = min(token_start_idx + num_tokens_per_sm, num_tokens);
}
template <typename dtype_t>
__device__ __forceinline__ dtype_t broadcast(dtype_t &ptr, int src_lane_idx) {
EP_STATIC_ASSERT(sizeof(dtype_t) % sizeof(int) == 0, "");
auto send_int_values = reinterpret_cast<int *>(&ptr);
int recv_int_values[sizeof(dtype_t) / sizeof(int)];
#pragma unroll
for (int i = 0; i < sizeof(dtype_t) / sizeof(int); ++i)
recv_int_values[i] = shfl_sync(send_int_values[i], src_lane_idx);
return *reinterpret_cast<dtype_t *>(recv_int_values);
}
__forceinline__ __device__ int warp_reduce_sum(int value) {
if constexpr (kWarpSize == 64)
value += shfl_xor<int>(value, 32);
value += shfl_xor<int>(value, 16);
value += shfl_xor<int>(value, 8);
value += shfl_xor<int>(value, 4);
value += shfl_xor<int>(value, 2);
value += shfl_xor<int>(value, 1);
return value;
}
__forceinline__ __device__ int get_lane_id() {
int lane_id = threadIdx.x % kWarpSize;
return lane_id;
}
template <int kNumRanks, bool kSyncOnly = false>
__forceinline__ __device__ void barrier_block(int **barrier_signal_ptrs, int rank) {
auto thread_id = static_cast<int>(threadIdx.x);
// For non-sync-only cases, the memory operations by other threads in the block must be visible
// to the `sys` scope
if constexpr (not kSyncOnly) {
memory_fence();
__syncthreads();
}
// Add self-ranks, sub other ranks
if (thread_id < kNumRanks) {
atomicAdd_system(barrier_signal_ptrs[rank] + thread_id, FINISHED_SUM_TAG);
atomicSub_system(barrier_signal_ptrs[thread_id] + rank, FINISHED_SUM_TAG);
}
EP_DEVICE_ASSERT(kNumRanks <= blockDim.x);
// Check timeout
auto start_time = clock64();
while (true) {
auto value =
thread_id < kNumRanks ? ld_volatile_global(barrier_signal_ptrs[rank] + thread_id) : 0;
if (__all_sync(kFullWarpMask, value <= 0))
break;
if (clock64() - start_time > NUM_TIMEOUT_CYCLES and thread_id < kNumRanks) {
printf("DeepEP timeout check failed: rank = %d, thread = %d, value = %d)\n", rank,
thread_id, value);
trap();
}
}
__syncthreads();
}
} // namespace deep_ep
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment