Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
3e54b78f
Commit
3e54b78f
authored
Apr 22, 2025
by
Shangyan Zhou
Browse files
Normal kernels always use IBGDA mode.
parent
20b2aaaf
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
16 additions
and
21 deletions
+16
-21
csrc/kernels/runtime.cu
csrc/kernels/runtime.cu
+6
-9
deep_ep/buffer.py
deep_ep/buffer.py
+10
-12
No files found.
csrc/kernels/runtime.cu
View file @
3e54b78f
...
...
@@ -58,15 +58,12 @@ int init(const std::vector<uint8_t> &root_unique_id_val, int rank, int num_ranks
EP_HOST_ASSERT
(
cpu_rdma_team
!=
NVSHMEM_TEAM_INVALID
);
}
// Normal operations use IBRC, while low-latency operations use IBGDA
bool
internode_use_ibgda
=
true
;
if
(
low_latency_mode
or
internode_use_ibgda
)
{
nvshmemi_device_host_state_t
*
dev_state_ptr
=
nullptr
;
CUDA_CHECK
(
cudaGetSymbolAddress
(
reinterpret_cast
<
void
**>
(
&
dev_state_ptr
),
nvshmemi_device_state_d
));
bool
ibgda_is_initialized
=
false
;
CUDA_CHECK
(
cudaMemcpy
(
&
dev_state_ptr
->
ibgda_is_initialized
,
&
ibgda_is_initialized
,
sizeof
(
bool
),
cudaMemcpyHostToDevice
));
}
// TODO: we still use `nvshmem_barrier` under IBRC mode, which should be switched to IBGDA mode later
nvshmemi_device_host_state_t
*
dev_state_ptr
=
nullptr
;
CUDA_CHECK
(
cudaGetSymbolAddress
(
reinterpret_cast
<
void
**>
(
&
dev_state_ptr
),
nvshmemi_device_state_d
));
bool
ibgda_is_initialized
=
false
;
CUDA_CHECK
(
cudaMemcpy
(
&
dev_state_ptr
->
ibgda_is_initialized
,
&
ibgda_is_initialized
,
sizeof
(
bool
),
cudaMemcpyHostToDevice
));
nvshmem_barrier_all
();
return
nvshmem_my_pe
();
}
...
...
deep_ep/buffer.py
View file @
3e54b78f
...
...
@@ -65,19 +65,17 @@ class Buffer:
# Synchronize NVSHMEM unique IDs
root_unique_id
=
None
internode_use_ibgda
=
True
if
self
.
runtime
.
get_num_rdma_ranks
()
>
1
or
low_latency_mode
:
# Enable IBGDA for the low-latency mode, which refers to "no packet forwarding between NVLink and RDMA"
if
low_latency_mode
or
internode_use_ibgda
:
assert
num_qps_per_rank
>
0
os
.
environ
[
'NVSHMEM_DISABLE_P2P'
]
=
'1'
os
.
environ
[
'NVSHMEM_IB_ENABLE_IBGDA'
]
=
'1'
os
.
environ
[
'NVSHMEM_IBGDA_NIC_HANDLER'
]
=
'gpu'
os
.
environ
[
'NVSHMEM_IBGDA_NUM_RC_PER_PE'
]
=
f
'
{
num_qps_per_rank
}
'
# Make sure QP depth is always larger than the number of in-flight WRs, so that we can skip WQ slot check
os
.
environ
[
'NVSHMEM_QP_DEPTH'
]
=
'1024'
# NOTES: NVSHMEM initialization requires at least 256 MiB
os
.
environ
[
'NVSHMEM_CUMEM_GRANULARITY'
]
=
f
'
{
2
**
29
}
'
# Enable IBGDA
assert
num_qps_per_rank
>
0
os
.
environ
[
'NVSHMEM_DISABLE_P2P'
]
=
'1'
os
.
environ
[
'NVSHMEM_IB_ENABLE_IBGDA'
]
=
'1'
os
.
environ
[
'NVSHMEM_IBGDA_NIC_HANDLER'
]
=
'gpu'
os
.
environ
[
'NVSHMEM_IBGDA_NUM_RC_PER_PE'
]
=
f
'
{
num_qps_per_rank
}
'
# Make sure QP depth is always larger than the number of in-flight WRs, so that we can skip WQ slot check
os
.
environ
[
'NVSHMEM_QP_DEPTH'
]
=
'1024'
# NOTES: NVSHMEM initialization requires at least 256 MiB
os
.
environ
[
'NVSHMEM_CUMEM_GRANULARITY'
]
=
f
'
{
2
**
29
}
'
# Synchronize using the root ID
nvshmem_unique_ids
=
[
None
,
]
*
self
.
group_size
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment