Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
3872dd54
Commit
3872dd54
authored
Dec 30, 2025
by
lishen
Browse files
支持internode将sm数提高,加快combine的带宽
parent
bc11ea32
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
7 additions
and
9 deletions
+7
-9
1.sh
1.sh
+2
-2
2.sh
2.sh
+1
-1
csrc/config.hpp
csrc/config.hpp
+4
-4
csrc/kernels/configs.cuh
csrc/kernels/configs.cuh
+0
-2
No files found.
1.sh
View file @
3872dd54
pgrep
-f
/usr/bin/python | xargs
kill
-9
pgrep
-f
/usr/bin/python | xargs
kill
-9
export
OMPI_MCA_pml
=
ucx
export
OMPI_MCA_osc
=
ucx
...
...
@@ -6,7 +6,7 @@ export OMPI_MCA_coll_hcoll_enable=0
export
UCX_TLS
=
rc,rocm
# export ROCSHMEM_UNIQUEID_WITH_MPI=1
export
OMPI_MCA_rmaps_base_mapping_policy
=
"slot:numa"
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
32
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
48
export
UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS
=
16384
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
ROCSHMEM_ALLOWED_IBV_DEVICES
=
mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
...
...
2.sh
View file @
3872dd54
...
...
@@ -6,7 +6,7 @@ export OMPI_MCA_coll_hcoll_enable=0
export
UCX_TLS
=
rc,rocm
# export ROCSHMEM_UNIQUEID_WITH_MPI=1
export
OMPI_MCA_rmaps_base_mapping_policy
=
"slot:numa"
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
32
export
ROCSHMEM_MAX_NUM_CONTEXTS
=
48
export
UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS
=
16384
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export
ROCSHMEM_ALLOWED_IBV_DEVICES
=
mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9
...
...
csrc/config.hpp
View file @
3872dd54
...
...
@@ -44,10 +44,10 @@ struct Config {
constexpr
int
kNumMaxTopK
=
128
;
constexpr
int
kNumMaxScales
=
128
;
EP_HOST_ASSERT
(
num_ranks
<
NUM_MAX_NVL_PEERS
or
num_ranks
%
NUM_MAX_NVL_PEERS
==
0
);
EP_HOST_ASSERT
(
num_ranks
<=
NUM_MAX_NVL_PEERS
or
num_sms
%
2
==
0
);
EP_HOST_ASSERT
(
num_ranks
<=
NUM_MAX_NVL_PEERS
or
num_sms
%
(
2
*
NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL
)
==
0
);
const
auto
num_rdma_ranks
=
std
::
max
(
num_ranks
/
NUM_MAX_NVL_PEERS
,
1
);
const
auto
num_nvl_ranks
=
std
::
min
(
num_ranks
,
NUM_MAX_NVL_PEERS
);
const
int
num_channels
=
num_sms
/
2
;
const
int
num_channels
=
num_sms
;
size_t
num_bytes
=
0
;
num_bytes
+=
num_channels
*
num_nvl_ranks
*
(
2
*
num_rdma_ranks
+
3
)
*
sizeof
(
int
);
...
...
@@ -77,9 +77,9 @@ struct Config {
constexpr
int
kNumMaxTopK
=
128
;
constexpr
int
kNumMaxScales
=
128
;
EP_HOST_ASSERT
(
num_ranks
%
NUM_MAX_NVL_PEERS
==
0
);
EP_HOST_ASSERT
(
num_sms
%
2
==
0
);
EP_HOST_ASSERT
(
num_sms
%
NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL
==
0
);
const
int
num_rdma_ranks
=
num_ranks
/
NUM_MAX_NVL_PEERS
;
const
int
num_channels
=
num_sms
/
2
;
const
int
num_channels
=
num_sms
;
size_t
num_bytes
=
0
;
num_bytes
+=
num_channels
*
num_rdma_ranks
*
(
NUM_MAX_NVL_PEERS
*
2
+
2
)
*
2
*
sizeof
(
int
);
...
...
csrc/kernels/configs.cuh
View file @
3872dd54
...
...
@@ -25,8 +25,6 @@
#define NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL 3
#define FP8_QUANTIZATION_NUM_PER_CHANNEL 128
#define NUM_INTERNODE_DISPATCH_BLOCKS_PER_CHANNEL 3
#define DEFAULT_NUM_CU 20
#define DEFAULT_NUM_MAX_XGMI_CHUNKED_SEND_TOKENS 6
#define DEFAULT_NUM_MAX_XGMI_CHUNKED_RECV_TOKENS 256
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment