Commit 314d9021 authored by lijian6's avatar lijian6
Browse files

Merge branch 'package_name' into 'main'

modify package name and update rocshmem

See merge request dcutoolkit/deeplearing/DeepEP!15
parents 00f02bdd d7f41337
......@@ -48,7 +48,7 @@ for arg in "$@"; do
ROCM_USE_MULTIQP=ON
;;
*)
echo "Usage: ./build.sh rocshmem [ROCM_DISABLE_CTX=ON] [ROCM_USE_MULTIQP=ON] / ./build.sh nvshmem"
echo "Usage: ./build.sh rocshmem [ROCM_DISABLE_CTX=ON] [ROCM_USE_MULTIQP=ON] / ./build.sh dushmem"
exit 1
;;
esac
......@@ -89,7 +89,7 @@ if [ "$USE_ROCSHMEM" == "ON" ]; then
build_rocshmem
SHMEM_INSTALL_PREFIX=$(pwd)/third-party/rocshmem_install
COMPILE_OPTIONS=${COMPILE_OPTIONS:= -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17 -Wno-return-type}
COMPILE_OPTIONS=${COMPILE_OPTIONS:= -fPIC -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 --offload-arch=gfx938 -std=c++17 -Wno-return-type}
if [ "$ROCM_DISABLE_CTX" == "ON" ]; then
COMPILE_OPTIONS="-DROCM_DISABLE_CTX $COMPILE_OPTIONS"
fi
......@@ -122,18 +122,19 @@ build_dushmem()
cd "$src_path"
}
if [ "$USE_NVSHMEM" == "ON" ]; then
if [ ! -d "third-party/dushmem-hip/src/" ]; then
echo "download submodule..."
git submodule update --init third-party/dushmem-hip
fi
if [ ! -d "third-party/dushmem_install" ]; then
mkdir -p third-party/dushmem_install
fi
build_dushmem
SHMEM_INSTALL_PREFIX=$(pwd)/third-party/dushmem_install
COMPILE_OPTIONS=${COMPILE_OPTIONS:= -fPIC -DFORCE_NVSHMEM_API -DHIP_ENABLE_WARP_SYNC_BUILTINS -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 -std=c++17 -Wno-return-type}
SHMEM_LINK_OPTIONS="-Wl,-rpath,${SHMEM_INSTALL_PREFIX}/lib/ -l:libnvshmem_device.a -lnvshmem_host"
# if [ ! -d "third-party/dushmem-hip/src/" ]; then
# echo "download submodule..."
# git submodule update --init third-party/dushmem-hip
# fi
# if [ ! -d "third-party/dushmem_install" ]; then
# mkdir -p third-party/dushmem_install
# fi
# build_dushmem
# SHMEM_INSTALL_PREFIX=$(pwd)/third-party/dushmem_install
SHMEM_INSTALL_PREFIX=${ROCM_PATH}/dushmem
COMPILE_OPTIONS=${COMPILE_OPTIONS:= -fPIC -DFORCE_DUSHMEM_API -DHIP_ENABLE_WARP_SYNC_BUILTINS -D__HIP_PLATFORM_AMD__=1 -DUSE_ROCM=1 -DHIPBLAS_V2 -DCUDA_HAS_FP16=1 -O3 -fgpu-rdc -DTORCH_API_INCLUDE_EXTENSION_H '-DPYBIND11_COMPILER_TYPE="_gcc"' '-DPYBIND11_STDLIB="_libstdcpp"' '-DPYBIND11_BUILD_ABI="_cxxabi1014"' -DTORCH_EXTENSION_NAME=deep_ep_cpp -D_GLIBCXX_USE_CXX11_ABI=1 --offload-arch=gfx936 --offload-arch=gfx938 -std=c++17 -Wno-return-type}
SHMEM_LINK_OPTIONS="-Wl,-rpath,${SHMEM_INSTALL_PREFIX}/lib/ -l:libdushmem_device.a -ldushmem_host"
fi
# -------------------------- duSHMEM END -------------------------- #
......@@ -146,7 +147,7 @@ hipcc ${INCLUDE_PATHS} -c $(pwd)/csrc/kernels/internode.cu -o build_/internode.o
hipcc ${INCLUDE_PATHS} -c $(pwd)/csrc/kernels/internode_ll.cu -o build_/internode_ll.o ${COMPILE_OPTIONS}
hipcc ${INCLUDE_PATHS} -c $(pwd)/csrc/deep_ep.cu -o build_/deep_ep.o ${COMPILE_OPTIONS}
hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o build_/internode_ll.o -L${SHMEM_INSTALL_PREFIX}/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 -shared -Wl,-soname,deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -L"${llvm_path}/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so ${llvm_path}/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 ${SHMEM_LINK_OPTIONS} -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/opt/mpi/lib/ -libverbs -lmlx5
hipcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -shared -Wl,-O1 -Wl,-Bsymbolic-functions build_/internode.o build_/intranode.o build_/runtime.o build_/deep_ep.o build_/layout.o build_/internode_ll.o -L${SHMEM_INSTALL_PREFIX}/lib/ -L/opt/mpi/lib -L/opt/dtk/hip/lib -L/usr/lib/x86_64-linux-gnu -lhipblaslt -lamdhip64 -o deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -Wl,-rpath,/opt/dtk/lib -fgpu-rdc --hip-link --offload-arch=gfx936 --offload-arch=gfx938 -shared -Wl,-soname,deep_ep/deep_ep_cpp.cpython-310-x86_64-linux-gnu.so -L"${llvm_path}/include/../lib/linux" -lclang_rt.builtins-x86_64 /opt/dtk/hip/lib/libgalaxyhip.so ${llvm_path}/lib/linux/libclang_rt.builtins-x86_64.a /opt/hyhal/lib/libhsa-runtime64.so -L${PYTHON_PLATLIB}/torch/lib -L/opt/dtk/lib -L/opt/dtk/hip/lib -L/usr/local/lib -lc10 -ltorch -ltorch_cpu -ltorch_python -lamdhip64 -lc10_hip -ltorch_hip -lrocm-core -lrocm_smi64 ${SHMEM_LINK_OPTIONS} -fgpu-rdc --hip-link -lamdhip64 -lhsa-runtime64 -l:libmpi.so -Wl,-rpath,/opt/mpi/lib/ -libverbs -lmlx5
# build whl
echo "Using Python: $(which python3)"
......
......@@ -143,7 +143,7 @@ pybind11::bytearray Buffer::get_local_ipc_handle() const {
return {ipc_handles[nvl_rank].reserved, HIP_IPC_HANDLE_SIZE};
}
pybind11::bytearray Buffer::get_local_nvshmem_unique_id() const {
pybind11::bytearray Buffer::get_local_dushmem_unique_id() const {
#ifndef DISABLE_ROCSHMEM
EP_HOST_ASSERT(rdma_rank == 0 and "Only RDMA rank 0 can get ROCSHMEM unique ID");
auto unique_id = internode::get_unique_id();
......@@ -260,9 +260,9 @@ void Buffer::sync(const std::vector<int> &device_
std::vector<uint8_t> root_unique_id(root_unique_id_opt->size());
auto root_unique_id_str = root_unique_id_opt->cast<std::string>();
std::memcpy(root_unique_id.data(), root_unique_id_str.c_str(), root_unique_id_opt->size());
auto nvshmem_rank = low_latency_mode ? rank : rdma_rank;
auto num_nvshmem_ranks = low_latency_mode ? num_ranks : num_rdma_ranks;
EP_HOST_ASSERT(nvshmem_rank == internode::init(root_unique_id, nvshmem_rank, num_nvshmem_ranks, low_latency_mode));
auto dushmem_rank = low_latency_mode ? rank : rdma_rank;
auto num_dushmem_ranks = low_latency_mode ? num_ranks : num_rdma_ranks;
EP_HOST_ASSERT(dushmem_rank == internode::init(root_unique_id, dushmem_rank, num_dushmem_ranks, low_latency_mode));
internode::barrier();
// Allocate
......@@ -1531,7 +1531,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
.def("get_root_rdma_rank", &deep_ep::Buffer::get_root_rdma_rank)
.def("get_local_device_id", &deep_ep::Buffer::get_local_device_id)
.def("get_local_ipc_handle", &deep_ep::Buffer::get_local_ipc_handle)
.def("get_local_nvshmem_unique_id", &deep_ep::Buffer::get_local_nvshmem_unique_id)
.def("get_local_dushmem_unique_id", &deep_ep::Buffer::get_local_dushmem_unique_id)
.def("get_local_buffer_tensor", &deep_ep::Buffer::get_local_buffer_tensor)
.def("get_comm_stream", &deep_ep::Buffer::get_comm_stream)
.def("sync", &deep_ep::Buffer::sync)
......
......@@ -29,7 +29,7 @@ private:
void* nvl_buffer_ptrs[NUM_MAX_NVL_PEERS] = {nullptr};
void** nvl_buffer_ptrs_gpu = nullptr;
// NVSHMEM Buffer
// DUSHMEM Buffer
int64_t num_rdma_bytes;
void *rdma_buffer_ptr = nullptr;
......@@ -48,7 +48,7 @@ private:
// Stream for communication
at::hip::HIPStreamMasqueradingAsCUDA comm_stream;
// After IPC/NVSHMEM synchronization, this flag will be true
// After IPC/DUSHMEM synchronization, this flag will be true
bool available = false;
// Whether explicit `destroy()` is required.
......@@ -95,7 +95,7 @@ public:
pybind11::bytearray get_local_ipc_handle() const;
pybind11::bytearray get_local_nvshmem_unique_id() const;
pybind11::bytearray get_local_dushmem_unique_id() const;
torch::Tensor get_local_buffer_tensor(const pybind11::object &dtype, int64_t offset,
bool use_rdma_buffer) const;
......
......@@ -86,7 +86,7 @@ __forceinline__ __device__ int translate_dst_rdma_rank(const int dst_rdma_rank,
template <bool kLowLatencyMode>
__forceinline__ __device__ void
nvshmem_barrier_with_same_gpu_idx(const shmem_team_t &rdma_team) {
dushmem_barrier_with_same_gpu_idx(const shmem_team_t &rdma_team) {
// NOTE: shmem_device_barrier_all() might be an issue as
// it doesn't follow OpenSHMEM specification on ROCm
kLowLatencyMode ? shmem_barrier(rdma_team) : shmem_device_barrier_all();
......@@ -119,7 +119,7 @@ notify_dispatch(const int *num_tokens_per_rank, int *moe_recv_counter_mapped, in
EP_DEVICE_ASSERT(num_warps > 1);
EP_DEVICE_ASSERT(kNumRDMARanks <= num_threads);
if (thread_id == kWarpSize)
nvshmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
dushmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
barrier_block<NUM_MAX_NVL_PEERS>(barrier_signal_ptrs, nvl_rank);
__syncthreads();
......@@ -161,7 +161,7 @@ notify_dispatch(const int *num_tokens_per_rank, int *moe_recv_counter_mapped, in
}
__syncthreads();
if (thread_id == 0)
nvshmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
dushmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
__syncthreads();
......@@ -189,7 +189,7 @@ notify_dispatch(const int *num_tokens_per_rank, int *moe_recv_counter_mapped, in
nvl_buffer_ptr_int[nvl_clean_offset + i] = 0;
// Reduce number of tokens per expert into the NVL send buffer
// TODO: may use NVSHMEM reduction
// TODO: may use DUSHMEM reduction
EP_DEVICE_ASSERT(num_rdma_experts <= num_threads);
if (thread_id < num_rdma_experts) {
int sum = 0;
......@@ -257,7 +257,7 @@ notify_dispatch(const int *num_tokens_per_rank, int *moe_recv_counter_mapped, in
// Finally barrier
__syncthreads();
if (thread_id == kWarpSize)
nvshmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
dushmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
barrier_block<NUM_MAX_NVL_PEERS>(barrier_signal_ptrs, nvl_rank);
} else {
......@@ -399,7 +399,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
kForwarderCoordinator, // 向远端RDMA确认接收
kNVLReceivers // 从nvl缓存写入到recv_x
};
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
__shared__ shmem_ctx_t ctx;
shmem_wg_ctx_create(&ctx);
#endif
......@@ -516,7 +516,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
syncwarp();
if (dst_rdma_rank != rdma_rank) {
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_int_put_nbi_warp(ctx,
#else
shmemx_int_put_nbi_warp(
......@@ -527,7 +527,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
}
}
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_quiet(ctx);
#else
shmem_fence();
......@@ -690,7 +690,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
最后,它更新相关的尾部位置,以便下次循环时可以正确地计算需要发送的令牌数。
kRDMASenderCoordinator使用了同sm内存一致性(ld.acquire.cta.s32),
nvshmem内存一致性(nvshmem_fence)和原子操作(nvshmemx_signal_op),减少硬同步,提升整体效率。
dushmem内存一致性(dushmem_fence)和原子操作(dushmemx_signal_op),减少硬同步,提升整体效率。
*/
if(warp_id > kNumDispatchRDMASenderWarps) {
return;
......@@ -741,7 +741,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
if(dst_rdma_rank != rdma_rank) {
auto dst_slot_idx = synced_last_issued_tail % num_max_rdma_chunked_recv_tokens;
EP_DEVICE_ASSERT(dst_slot_idx + num_tokens_to_issue <= num_max_rdma_chunked_recv_tokens);
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_schar_put_nbi_warp(ctx,
#else
shmemx_int8_put_nbi_warp(
......@@ -752,7 +752,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
dst_slot_idx * num_bytes_per_rdma_token,
num_bytes_per_rdma_token * num_tokens_to_issue,
translate_dst_rdma_rank<kLowLatencyMode>(dst_rdma_rank, nvl_rank));
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_quiet(ctx);
#else
shmem_fence();
......@@ -768,7 +768,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
last_issued_tail += num_tokens_to_issue;
num_tokens_to_send -= num_tokens_to_issue;
// 更新远端rdma 己方已发送的token数,用于做发送信息同步。用于与kRDMAAndNVLForwarder互相通信
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_ulong_atomic_add(ctx,
#else
shmem_signal_op_add(
......@@ -1008,7 +1008,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
// 更新远程头部
if(min_head != std::numeric_limits<int>::max() && min_head >= last_head + num_max_rdma_chunked_send_tokens && lane_id < kNumRDMARanks){
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_ulong_atomic_add(ctx,
#else
shmem_signal_op_add(
......@@ -1127,7 +1127,7 @@ dispatch(int4 *recv_x, float *recv_x_scales, int64_t *recv_topk_idx, float *recv
}
} // while(num_tokens_to_recv > 0)
}
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_wg_ctx_destroy(&ctx);
#endif
}
......@@ -1203,7 +1203,7 @@ cached_notify(const int rdma_clean_offset, const int rdma_num_int_clean, const i
if (sm_id == 0) {
// Barrier for RDMA
if (thread_id == 0)
nvshmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
dushmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
__syncthreads();
......@@ -1216,7 +1216,7 @@ cached_notify(const int rdma_clean_offset, const int rdma_num_int_clean, const i
// Barrier again
if (thread_id == 0)
nvshmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
dushmem_barrier_with_same_gpu_idx<kLowLatencyMode>(rdma_team);
} else if (sm_id == 1) {
// Barrier for NVL
......@@ -1417,7 +1417,7 @@ combine(int4 *combined_x, float *combined_topk_weights, const bool *is_combined_
kRDMACoordinator,
kNVLCoordinator
};
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
__shared__ shmem_ctx_t ctx;
shmem_wg_ctx_create(&ctx);
#endif
......@@ -1744,7 +1744,7 @@ combine(int4 *combined_x, float *combined_topk_weights, const bool *is_combined_
if(sub_warp_id == kNumWarpsPerForwarder - 1) {
if(dst_rdma_rank != rdma_rank) {
auto rdma_slot_idx = token_start_idx % num_max_rdma_chunked_recv_tokens;
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_schar_put_nbi_warp(ctx,
#else
shmemx_int8_put_nbi_warp(
......@@ -1755,7 +1755,7 @@ combine(int4 *combined_x, float *combined_topk_weights, const bool *is_combined_
rdma_slot_idx * num_bytes_per_rdma_token,
num_chunked_tokens * num_bytes_per_rdma_token,
translate_dst_rdma_rank<kLowLatencyMode>(dst_rdma_rank, nvl_rank));
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_quiet(ctx);
#else
shmem_fence();
......@@ -1767,7 +1767,7 @@ combine(int4 *combined_x, float *combined_topk_weights, const bool *is_combined_
// Write new RDMA tail
syncwarp();
if(lane_id == 0) {
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_ulong_atomic_add(ctx,
#else
shmem_signal_op_add(
......@@ -1900,7 +1900,7 @@ combine(int4 *combined_x, float *combined_topk_weights, const bool *is_combined_
min_head = min(min_head, rdma_receiver_rdma_head[i][dst_rdma_rank]);
if (min_head != std::numeric_limits<int>::max() and min_head >= last_rdma_head + num_max_rdma_chunked_send_tokens and lane_id < kNumRDMARanks) {
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_ctx_ulong_atomic_add(ctx,
#else
shmem_signal_op_add(
......@@ -1917,7 +1917,7 @@ combine(int4 *combined_x, float *combined_topk_weights, const bool *is_combined_
}
}
}
#if !defined(FORCE_NVSHMEM_API) && !defined(ROCM_DISABLE_CTX)
#if !defined(FORCE_DUSHMEM_API) && !defined(ROCM_DISABLE_CTX)
shmem_wg_ctx_destroy(&ctx);
#endif
}
......
......@@ -257,10 +257,10 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
slot_idx * num_bytes_per_msg;
if (dst_rank != rank) {
#if defined(FORCE_NVSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)nvshmemi_device_state_d.peer_heap_base_p2p + dst_rank);
#if defined(FORCE_DUSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)dushmemi_device_state_d.peer_heap_base_p2p + dst_rank);
if (peer_base_addr) {
char *req_rptr_actual = (char *)(peer_base_addr) + ((char *)dst_ptr - (char *)(nvshmemi_device_state_d.heap_base));
char *req_rptr_actual = (char *)(peer_base_addr) + ((char *)dst_ptr - (char *)(dushmemi_device_state_d.heap_base));
const auto* src_int4_ptr = reinterpret_cast<const int4*>(src_ptr);
const auto* dst_int4_ptr = reinterpret_cast<int4*>(req_rptr_actual);
UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, dst_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global);
......@@ -279,7 +279,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
reinterpret_cast<signed char*>(dst_ptr), reinterpret_cast<signed char*>(src_ptr),
num_bytes_per_msg, (dst_expert_local_idx + 1) * num_ranks + dst_rank, dst_rank);
#endif
#endif // defined(FORCE_NVSHMEM_API)
#endif // defined(FORCE_DUSHMEM_API)
} else {
// NOTES: only 2 load iterations for 7K hidden with 8 unrolls
const auto* src_int4_ptr = reinterpret_cast<const int4*>(src_ptr);
......@@ -342,11 +342,11 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
// Wait local sends issued and send expert counts
while (ld_acquire_global(atomic_finish_counter_per_expert + responsible_expert_idx) != FINISHED_SUM_TAG * 2);
if (dst_rank != rank) {
#if defined(FORCE_NVSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)nvshmemi_device_state_d.peer_heap_base_p2p + dst_rank);
#if defined(FORCE_DUSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)dushmemi_device_state_d.peer_heap_base_p2p + dst_rank);
if (peer_base_addr) { // P2P enabled
int *rptr_actual = (int *)((char *)(peer_base_addr) +
((char *)(rdma_recv_count + dst_expert_local_idx * num_ranks + rank) - (char *)(nvshmemi_device_state_d.heap_base)));
((char *)(rdma_recv_count + dst_expert_local_idx * num_ranks + rank) - (char *)(dushmemi_device_state_d.heap_base)));
st_na_release(rptr_actual, -num_tokens_sent - 1);
} else {
internode::shmem_long_atomic_add(
......@@ -361,7 +361,7 @@ dispatch(void* packed_recv_x, void* packed_recv_x_scales,
rdma_recv_count + dst_expert_local_idx * num_ranks + rank, -num_tokens_sent - 1,
(dst_expert_local_idx + 1) * num_ranks + dst_rank, dst_rank);
#endif
#endif // defined(FORCE_NVSHMEM_API)
#endif // defined(FORCE_DUSHMEM_API)
} else {
st_na_release(reinterpret_cast<int *>(rdma_recv_count + dst_expert_local_idx * num_ranks + rank), -num_tokens_sent - 1);
}
......@@ -640,10 +640,10 @@ combine(void* combined_x,
if (not zero_copy)
UNROLLED_WARP_COPY_LL(7, lane_id, hidden_bf16_int4, buf_int4_ptr, x_int4, ld_nc_global, st_na_global);
#if defined(FORCE_NVSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)nvshmemi_device_state_d.peer_heap_base_p2p + dst_rank);
#if defined(FORCE_DUSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)dushmemi_device_state_d.peer_heap_base_p2p + dst_rank);
if (peer_base_addr) {
char *req_rptr_actual = (char *)(peer_base_addr) + ((char *)dst_ptr - (char *)(nvshmemi_device_state_d.heap_base));
char *req_rptr_actual = (char *)(peer_base_addr) + ((char *)dst_ptr - (char *)(dushmemi_device_state_d.heap_base));
const auto dst_int4_ptr = reinterpret_cast<int4*>(req_rptr_actual);
UNROLLED_WARP_COPY(7, lane_id, hidden_bf16_int4, dst_int4_ptr, x_int4, ld_nc_global, st_na_global);
} else {
......@@ -661,7 +661,7 @@ combine(void* combined_x,
reinterpret_cast<signed char*>(dst_ptr), reinterpret_cast<signed char*>(buf_ptr),
hidden * sizeof(hip_bfloat16), (local_expert_idx + 1) * num_ranks + dst_rank, dst_rank);
#endif
#endif // defined(FORCE_NVSHMEM_API)
#endif // defined(FORCE_DUSHMEM_API)
}
}
......@@ -676,11 +676,11 @@ combine(void* combined_x,
if (sub_warp_id == 1 and lane_id == 0) {
while (ld_acquire_global(atomic_clean_flag) == 0);
if (dst_rank != rank) {
#if defined(FORCE_NVSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)nvshmemi_device_state_d.peer_heap_base_p2p + dst_rank);
#if defined(FORCE_DUSHMEM_API)
void *peer_base_addr = (void *)__ldg((const long long unsigned *)dushmemi_device_state_d.peer_heap_base_p2p + dst_rank);
if (peer_base_addr) {
int *req_rptr_actual = (int *)((char *)(peer_base_addr) +
((char *)(rdma_recv_flag + global_expert_idx) - (char *)(nvshmemi_device_state_d.heap_base)));
((char *)(rdma_recv_flag + global_expert_idx) - (char *)(dushmemi_device_state_d.heap_base)));
st_na_release(req_rptr_actual, 1);
} else {
internode::shmem_long_atomic_add(
......@@ -695,7 +695,7 @@ combine(void* combined_x,
rdma_recv_flag + global_expert_idx, 1,
(local_expert_idx + 1) * num_ranks + dst_rank, dst_rank);
#endif
#endif // defined(FORCE_NVSHMEM_API)
#endif // defined(FORCE_DUSHMEM_API)
} else {
st_na_release(reinterpret_cast<int*>(rdma_recv_flag + global_expert_idx), 1);
}
......
......@@ -61,9 +61,9 @@ int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks
&cpu_rdma_team_config, 0, &cpu_rdma_team) == 0);
EP_HOST_ASSERT(cpu_rdma_team != EP_SHMEM_TEAM_INVALID);
#ifdef FORCE_NVSHMEM_API
nvshmemi_device_host_state_t* dev_state_ptr = nullptr;
CUDA_CHECK(hipGetSymbolAddress(reinterpret_cast<void**>(&dev_state_ptr), nvshmemi_device_state_d));
#ifdef FORCE_DUSHMEM_API
dushmemi_device_host_state_t* dev_state_ptr = nullptr;
CUDA_CHECK(hipGetSymbolAddress(reinterpret_cast<void**>(&dev_state_ptr), dushmemi_device_state_d));
bool ibgda_is_initialized = false;
CUDA_CHECK(hipMemcpy(&dev_state_ptr->ibgda_is_initialized, &ibgda_is_initialized, sizeof(bool), hipMemcpyHostToDevice));
#endif
......
#pragma once
/*
* Temporary wrapper for for platform specific NVSHMEM and rocSHMEM functions.
* Temporary wrapper for for platform specific DUSHMEM and rocSHMEM functions.
* Once hipify or hipify-torch fully supports this mapping, this file has to be
* removed and according nvshmem* functions restored.
* removed and according dushmem* functions restored.
*/
#ifndef DISABLE_ROCSHMEM
#include "configs.cuh"
#ifndef FORCE_NVSHMEM_API
#ifndef FORCE_DUSHMEM_API
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp8.h>
#include <hip/hip_runtime.h>
#include <rocshmem/rocshmem.hpp>
#else
#include <device_host_transport/nvshmem_common_ibgda.h>
#include <device_host_transport/dushmem_common_ibgda.h>
#include <infiniband/mlx5dv.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <non_abi/device/threadgroup/nvshmemi_common_device_defines.cuh>
#include <dushmem.h>
#include <dushmemx.h>
#include <non_abi/device/threadgroup/dushmemi_common_device_defines.cuh>
#endif
namespace deep_ep::internode {
// rocSHMEM wrapper
#ifndef FORCE_NVSHMEM_API
#ifndef FORCE_DUSHMEM_API
using shmem_team_t = rocshmem::rocshmem_team_t;
using shmem_team_config_t = rocshmem::rocshmem_team_config_t;
const shmem_team_t EP_SHMEM_TEAM_INVALID = rocshmem::ROCSHMEM_TEAM_INVALID;
......@@ -171,106 +171,106 @@ __device__ inline void shmem_ctx_int_put_nbi_warp(
#else
// NVSHMEM wrapper
// DUSHMEM wrapper
#ifndef ROCM_DISABLE_CTX
#define ROCM_DISABLE_CTX
#endif
using shmem_team_t = nvshmem_team_t;
using shmem_team_config_t = nvshmem_team_config_t;
using shmemx_uniqueid_t = nvshmemx_uniqueid_t;
using shmemx_init_attr_t = nvshmemx_init_attr_t;
const shmem_team_t EP_SHMEM_TEAM_INVALID = NVSHMEM_TEAM_INVALID;
const shmem_team_t EP_SHMEM_TEAM_WORLD = NVSHMEM_TEAM_WORLD;
constexpr auto EP_SHMEMX_INIT_WITH_UNIQUEID = NVSHMEMX_INIT_WITH_UNIQUEID;
using shmem_team_t = dushmem_team_t;
using shmem_team_config_t = dushmem_team_config_t;
using shmemx_uniqueid_t = dushmemx_uniqueid_t;
using shmemx_init_attr_t = dushmemx_init_attr_t;
const shmem_team_t EP_SHMEM_TEAM_INVALID = DUSHMEM_TEAM_INVALID;
const shmem_team_t EP_SHMEM_TEAM_WORLD = DUSHMEM_TEAM_WORLD;
constexpr auto EP_SHMEMX_INIT_WITH_UNIQUEID = DUSHMEMX_INIT_WITH_UNIQUEID;
__host__ inline int shmemx_get_uniqueid(shmemx_uniqueid_t *uid) {
return nvshmemx_get_uniqueid(uid);
return dushmemx_get_uniqueid(uid);
}
__host__ inline int shmemx_set_attr_uniqueid_args(int rank, int nranks,
shmemx_uniqueid_t *uid, shmemx_init_attr_t *attr) {
return nvshmemx_set_attr_uniqueid_args(rank, nranks, uid, attr);
return dushmemx_set_attr_uniqueid_args(rank, nranks, uid, attr);
}
__host__ inline int shmemx_init_attr(unsigned int flags, shmemx_init_attr_t *attr) {
return nvshmemx_init_attr(flags, attr);
return dushmemx_init_attr(flags, attr);
}
__host__ inline int shmem_team_split_strided(shmem_team_t parent_team,
int start, int stride, int size,
const shmem_team_config_t *config,
long config_mask, shmem_team_t *new_team) {
return nvshmem_team_split_strided(parent_team, start, stride, size, config, config_mask, new_team);
return dushmem_team_split_strided(parent_team, start, stride, size, config, config_mask, new_team);
}
__host__ inline void shmem_barrier_all() {
nvshmem_barrier_all();
dushmem_barrier_all();
}
__device__ inline void shmem_device_barrier_all() {
nvshmem_barrier_all();
dushmem_barrier_all();
}
__device__ inline void shmem_barrier(shmem_team_t team) {
void(nvshmem_barrier(team));
void(dushmem_barrier(team));
}
__host__ inline int shmem_my_pe(){
return nvshmem_my_pe();
return dushmem_my_pe();
}
__host__ inline void shmem_free(void *ptr){
nvshmem_free(ptr);
dushmem_free(ptr);
}
__host__ inline void* shmem_align(const size_t alignment, const size_t size) {
return nvshmem_align(size, alignment);
return dushmem_align(size, alignment);
}
__host__ inline void shmem_finalize() {
nvshmem_finalize();
dushmem_finalize();
}
__host__ inline void shmem_team_destroy(shmem_team_t team) {
nvshmem_team_destroy(team);
dushmem_team_destroy(team);
}
__device__ inline void shmem_fence() {
nvshmem_fence();
dushmem_fence();
}
__device__ inline void shmem_int_put_nbi(
int *dest, const int *source, size_t nelems, int pe) {
nvshmem_int_put_nbi(dest, source, nelems, pe);
dushmem_int_put_nbi(dest, source, nelems, pe);
}
__device__ inline void shmemx_int_put_nbi_warp(
int *dest, const int *source, size_t nelems, int pe) {
nvshmemx_int_put_nbi_warp(dest, source, nelems, pe);
dushmemx_int_put_nbi_warp(dest, source, nelems, pe);
}
__device__ inline void shmemx_int8_put_nbi_warp(
signed char *dest, const signed char *source, size_t nelems, int pe) {
nvshmemx_int8_put_nbi_warp(dest, source, nelems, pe);
dushmemx_int8_put_nbi_warp(dest, source, nelems, pe);
}
__device__ inline void shmem_signal_op_add(
uint64_t *dest, uint64_t value, int pe) {
nvshmemx_signal_op(dest, value, NVSHMEM_SIGNAL_ADD, pe);
dushmemx_signal_op(dest, value, DUSHMEM_SIGNAL_ADD, pe);
}
__device__ inline void shmem_ulong_atomic_add(
uint64_t *dest, uint64_t value, int pe) {
nvshmem_ulong_atomic_add(dest, value, pe);
dushmem_ulong_atomic_add(dest, value, pe);
}
__device__ inline void shmem_long_atomic_add(
long *dest, long value, int pe) {
// nvshmem_##Name##_atomic_add(dest, value, pe);
nvshmem_long_atomic_add(dest, value, pe);
// dushmem_##Name##_atomic_add(dest, value, pe);
dushmem_long_atomic_add(dest, value, pe);
}
#endif
......
......@@ -96,46 +96,46 @@ class Buffer:
local_ipc_handle = self.runtime.get_local_ipc_handle()
dist.all_gather_object(ipc_handles, local_ipc_handle, group)
# Synchronize NVSHMEM unique IDs
# Synchronize DUSHMEM unique IDs
root_unique_id = None
if self.runtime.get_num_rdma_ranks() > 1 or low_latency_mode:
# Enable IBGDA
self._setup_device_hca_mapping()
assert num_qps_per_rank > 0
os.environ["NVSHMEM_DISABLE_P2P"] = "0" if allow_nvlink_for_low_latency_mode else "1"
# os.environ["NVSHMEM_IB_ENABLE_IBGDA"] = "1"
os.environ["NVSHMEM_IB_ENABLE_IBGDA"] = "0" # force_use_ibrc
os.environ["DUSHMEM_DISABLE_P2P"] = "0" if allow_nvlink_for_low_latency_mode else "1"
# os.environ["DUSHMEM_IB_ENABLE_IBGDA"] = "1"
os.environ["DUSHMEM_IB_ENABLE_IBGDA"] = "0" # force_use_ibrc
os.environ["NVSHMEM_IBGDA_NIC_HANDLER"] = "gpu"
os.environ["NVSHMEM_IB_DISABLE_DMABUF"] = "1"
os.environ["NVSHMEM_ENABLE_NIC_PE_MAPPING"] = "1"
os.environ["DUSHMEM_IBGDA_NIC_HANDLER"] = "gpu"
os.environ["DUSHMEM_IB_DISABLE_DMABUF"] = "1"
os.environ["DUSHMEM_ENABLE_NIC_PE_MAPPING"] = "1"
os.environ["NVSHMEM_IBGDA_NUM_RC_PER_PE"] = f"{num_qps_per_rank}"
os.environ["DUSHMEM_IBGDA_NUM_RC_PER_PE"] = f"{num_qps_per_rank}"
# Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check
os.environ["NVSHMEM_QP_DEPTH"] = os.environ.get("NVSHMEM_QP_DEPTH", "1024")
os.environ["DUSHMEM_QP_DEPTH"] = os.environ.get("DUSHMEM_QP_DEPTH", "1024")
# Reduce gpu memory usage
# 6 default teams + 1 extra team
os.environ["NVSHMEM_MAX_TEAMS"] = "7"
os.environ["DUSHMEM_MAX_TEAMS"] = "7"
# Disable NVLink SHArP
os.environ["NVSHMEM_DISABLE_NVLS"] = "1"
# NOTES: NVSHMEM initialization requires at least 256 MiB
os.environ["NVSHMEM_CUMEM_GRANULARITY"] = f"{2 ** 29}"
os.environ["DUSHMEM_DISABLE_NVLS"] = "1"
# NOTES: DUSHMEM initialization requires at least 256 MiB
os.environ["DUSHMEM_CUMEM_GRANULARITY"] = f"{2 ** 29}"
if not allow_mnnvl:
# Disable multi-node NVLink detection
os.environ["NVSHMEM_DISABLE_MNNVL"] = "1"
os.environ["DUSHMEM_DISABLE_MNNVL"] = "1"
# Synchronize using the root ID
nvshmem_unique_ids = [
dushmem_unique_ids = [
None,
] * self.group_size
if (low_latency_mode and self.rank == 0) or (
not low_latency_mode and self.runtime.get_rdma_rank() == 0
):
root_unique_id = self.runtime.get_local_nvshmem_unique_id()
dist.all_gather_object(nvshmem_unique_ids, root_unique_id, group)
root_unique_id = nvshmem_unique_ids[
root_unique_id = self.runtime.get_local_dushmem_unique_id()
dist.all_gather_object(dushmem_unique_ids, root_unique_id, group)
root_unique_id = dushmem_unique_ids[
0 if low_latency_mode else self.runtime.get_root_rdma_rank(True)
]
......@@ -169,9 +169,9 @@ class Buffer:
# assert visible_devices[current_device].isdigit(), f"DEEP_EP_DEVICE_TO_HCA_MAPPING requires CUDA_VISIBLE_DEVICES to contain integer indices"
# current_device = int(visible_devices[current_device])
assert current_device in device_mapping, f"Current CUDA device {current_device} not found in DEEP_EP_DEVICE_TO_HCA_MAPPING"
os.environ['NVSHMEM_ENABLE_PE_MAPPING'] = '1'
os.environ['NVSHMEM_HCA_LIST'] = device_mapping[current_device]
assert current_device in device_mapping, f"Current HIP device {current_device} not found in DEEP_EP_DEVICE_TO_HCA_MAPPING"
os.environ['DUSHMEM_ENABLE_PE_MAPPING'] = '1'
os.environ['DUSHMEM_HCA_LIST'] = device_mapping[current_device]
def destroy(self):
"""
......
try:
__version__ = "1.0.0"
__version_tuple__ = (1, 0, 0)
__hcu_version__ = f'1.0.0+das.opt1.dtk2504'
from .version import __version__, __version_tuple__, __hcu_version__
except Exception as e:
import warnings
warnings.warn(f"Failed to read commit hash:\n + str(e)",
RuntimeWarning, stacklevel=2)
__version__ = "dev"
__version_tuple__ = (0, 0, __version__)
def _prev_minor_version_was(version_str):
'''Check whether a given version matches the previous minor version.
Return True if version_str matches the previous minor version.
For example - return True if the current version is 0.7.4 and the
supplied version_str is '0.6'.
Used for --show-hidden-metrics-for-version.
'''
# Match anything if this is a dev tree
if __version_tuple__[0:2] == (0, 0):
return True
# Note - this won't do the right thing when we release 1.0!
# assert __version_tuple__[0] == 0
assert isinstance(__version_tuple__[1], int)
return version_str == f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
def _prev_minor_version():
'''For the purpose of testing, return a previous minor version number.'''
# In dev tree, this will return "0.-1", but that will work fine"
assert isinstance(__version_tuple__[1], int)
return f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
import os, sys
import subprocess
import setuptools
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from datetime import datetime
import subprocess
from typing import Optional
import subprocess
import torch
date_tag = datetime.now().strftime("%Y%m%d")
pwd = os.path.dirname(os.path.abspath(__file__))
add_git_version = False
if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1:
add_git_version = True
# -----------------------
# 解析自定义参数
# -----------------------
shmem = None
other = []
for arg in sys.argv:
......@@ -20,19 +23,91 @@ for arg in sys.argv:
shmem = "b"
else:
other.append(arg)
sys.argv = other
if __name__ == '__main__':
try:
cmd = ['git', 'rev-parse', '--short', 'HEAD']
revision = '+' + subprocess.check_output(cmd).decode('ascii').rstrip()
except Exception as _:
revision = ''
def get_version_add(sha: Optional[str] = None) -> str:
command = "git config --global --add safe.directory " + pwd
result = subprocess.run(command, shell=True, capture_output=False, text=True)
deepep_root = os.path.dirname(os.path.abspath(__file__))
add_version_path = os.path.join(os.path.join(deepep_root, "deep_ep"), "version.py")
major, minor, _ = torch.__version__.split('.')
if add_git_version:
if sha != 'Unknown':
if sha is None:
sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=deepep_root).decode('ascii').strip()
if (major, minor) >= ('2', '5'):
version = 'das.opt1.' + sha[:7] + shmem
else:
if (major, minor) >= ('2', '5'):
version = 'das.opt1'
if os.getenv("ROCM_PATH"):
rocm_path = os.getenv('ROCM_PATH', "")
rocm_version_path = os.path.join(rocm_path, '.info', "rocm_version")
with open(rocm_version_path, 'r',encoding='utf-8') as file:
lines = file.readlines()
rocm_version=lines[0].replace(".", "")
version += ".dtk" + rocm_version
new_version_content = f"""
try:
__version__ = "1.0.0"
__version_tuple__ = (1, 0, 0)
__hcu_version__ = f'1.0.0+{version}'
from deep_ep.version import __version__, __version_tuple__, __hcu_version__
except Exception as e:
import warnings
warnings.warn(f"Failed to read commit hash:\\n + str(e)",
RuntimeWarning, stacklevel=2)
__version__ = "dev"
__version_tuple__ = (0, 0, __version__)
def _prev_minor_version_was(version_str):
'''Check whether a given version matches the previous minor version.
Return True if version_str matches the previous minor version.
For example - return True if the current version is 0.7.4 and the
supplied version_str is '0.6'.
Used for --show-hidden-metrics-for-version.
'''
# Match anything if this is a dev tree
if __version_tuple__[0:2] == (0, 0):
return True
# Note - this won't do the right thing when we release 1.0!
# assert __version_tuple__[0] == 0
assert isinstance(__version_tuple__[1], int)
return version_str == f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"
def _prev_minor_version():
'''For the purpose of testing, return a previous minor version number.'''
# In dev tree, this will return "0.-1", but that will work fine"
assert isinstance(__version_tuple__[1], int)
return f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"
"""
with open(add_version_path, encoding="utf-8",mode="w") as file:
file.write(new_version_content)
file.close()
def get_version():
get_version_add()
version_file = 'deep_ep/version.py'
with open(version_file, encoding='utf-8') as f:
exec(compile(f.read(), version_file, 'exec'))
return locals()['__hcu_version__']
def get_deepep_version() -> str:
version = get_version()
return version
if __name__ == '__main__':
setuptools.setup(
name='deep_ep',
version='1.0.0' + revision + shmem + '.' + date_tag,
version=get_deepep_version(),
packages=setuptools.find_packages(include=['deep_ep']),
include_package_data=True,
package_data={"deep_ep": ["deep_ep_cpp.cpython-310-x86_64-linux-gnu.so"]},
......
rocshmem @ cb2af8b8
Subproject commit 2c2c458c84962a344760988d4bc079e914c11de3
Subproject commit cb2af8b8be114cd5e640e46446325301df93eedb
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment