Commit 1a35d640 authored by root
Browse files

Fix core dump on 4 nodes with DTK 26.04.


Signed-off-by: root <root@host-10-212-17-3.cluster.local>
parent 95e46992
......@@ -47,7 +47,7 @@ struct Config {
EP_HOST_ASSERT(num_ranks <= NUM_MAX_NVL_PEERS or num_sms % (2 * NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL) == 0);
const auto num_rdma_ranks = std::max(num_ranks / NUM_MAX_NVL_PEERS, 1);
const auto num_nvl_ranks = std::min(num_ranks, NUM_MAX_NVL_PEERS);
const int num_channels = num_ranks <=8 ? num_sms / 2 : num_sms / NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL;
const int num_channels = num_sms / 2;
// 计算每个nvl通信数据包的数据量
size_t num_single_nvl_bag_bytes =
......@@ -83,7 +83,7 @@ struct Config {
EP_HOST_ASSERT(num_ranks % NUM_MAX_NVL_PEERS == 0);
EP_HOST_ASSERT(num_sms % NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL == 0);
const int num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
const int num_channels = num_sms / NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL;
const int num_channels = num_sms / 2;
// 计算每个rdma通信数据包的数据量
size_t num_single_rdma_bag_bytes =
......
......@@ -809,8 +809,8 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
// here.
pybind11::gil_scoped_release release;
const int num_channels = config.num_sms / NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL;
EP_HOST_ASSERT(config.num_sms % NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL == 0);
const int num_channels = config.num_sms / 2;
// EP_HOST_ASSERT(config.num_sms % NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL == 0);
EP_HOST_ASSERT(0 < get_num_rdma_ranks() and get_num_rdma_ranks() <= NUM_MAX_RDMA_PEERS);
bool cached_mode = cached_rdma_channel_prefix_matrix.has_value();
......@@ -1130,8 +1130,8 @@ Buffer::internode_combine(
const torch::Tensor &combined_nvl_head, const Config &config,
std::optional<EventHandle> &previous_event, bool async, bool allocate_on_comm_stream) {
#ifndef DISABLE_ROCSHMEM
const int num_channels = config.num_sms / NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL;
EP_HOST_ASSERT(config.num_sms % NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL == 0);
const int num_channels = config.num_sms / 2;
// EP_HOST_ASSERT(config.num_sms % NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL == 0);
// Shape and contiguous checks
EP_HOST_ASSERT(x.dim() == 2 and x.is_contiguous());
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment