Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
e1283972
Commit
e1283972
authored
Oct 20, 2025
by
lijian6
Browse files
Fix sync mode error.
Signed-off-by:
lijian
<
lijian6@sugon.com
>
parent
5563b6d0
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
3 additions
and
809 deletions
+3
-809
1.sh
1.sh
+1
-2
2.sh
2.sh
+1
-2
csrc/kernels/internode.hip
csrc/kernels/internode.hip
+0
-804
tests/test_internode.py
tests/test_internode.py
+1
-1
No files found.
1.sh
View file @
e1283972
...
@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
...
@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
PYTHONPATH
=
/
work
/Tmp/DeepEP:
$PYTHONPATH
export
PYTHONPATH
=
/
public/home/lishen
/Tmp/DeepEP:
$PYTHONPATH
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
0
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_internode.py
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
0
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/internode_lj.py
2.sh
View file @
e1283972
...
@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
...
@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1
export
UCX_NET_DEVICES
=
mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
HIP_VISIBLE_DEVICES
=
0,1,2,3,4,5,6,7
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
ROCSHMEM_HEAP_SIZE
=
10737418240
export
PYTHONPATH
=
/
work
/Tmp/DeepEP:
$PYTHONPATH
export
PYTHONPATH
=
/
public/home/lishen
/Tmp/DeepEP:
$PYTHONPATH
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
1
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_internode.py
torchrun
--nproc-per-node
=
1
--nnodes
=
2
--node-rank
=
1
--master-addr
=
"10.16.1.37"
--master-port
=
1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/internode_lj.py
csrc/kernels/internode.hip
View file @
e1283972
This diff is collapsed.
Click to expand it.
tests/test_internode.py
View file @
e1283972
...
@@ -162,7 +162,7 @@ def test_main(args: argparse.Namespace, num_sms: int,
...
@@ -162,7 +162,7 @@ def test_main(args: argparse.Namespace, num_sms: int,
# print("lijian test dipatch end and combine start.")
# print("lijian test dipatch end and combine start.")
bias_0
=
torch
.
ones
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
bias_0
=
torch
.
ones
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
bias_1
=
torch
.
randn
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
bias_1
=
torch
.
randn
((
num_tokens
,
hidden
),
dtype
=
torch
.
bfloat16
,
device
=
'cuda'
)
combine_args
=
{
'x'
:
recv_x
,
'handle'
:
handle
,
'config'
:
config
}
combine_args
=
{
'x'
:
recv_x
,
'handle'
:
handle
,
'config'
:
config
,
'async_finish'
:
async_mode
}
if
with_topk
:
if
with_topk
:
combine_args
.
update
({
'topk_weights'
:
recv_topk_weights
})
combine_args
.
update
({
'topk_weights'
:
recv_topk_weights
})
if
previous_mode
:
if
previous_mode
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment