Commit e1283972 authored by lijian6
Browse files

Fix sync mode error.


Signed-off-by: lijian <lijian6@sugon.com>
parent 5563b6d0
...@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384 ...@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1 export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCSHMEM_HEAP_SIZE=10737418240 export ROCSHMEM_HEAP_SIZE=10737418240
export PYTHONPATH=/work/Tmp/DeepEP:$PYTHONPATH export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/internode_lj.py
...@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384 ...@@ -9,6 +9,5 @@ export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1 export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCSHMEM_HEAP_SIZE=10737418240 export ROCSHMEM_HEAP_SIZE=10737418240
export PYTHONPATH=/work/Tmp/DeepEP:$PYTHONPATH export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
# torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/internode_lj.py
This diff is collapsed.
...@@ -162,7 +162,7 @@ def test_main(args: argparse.Namespace, num_sms: int, ...@@ -162,7 +162,7 @@ def test_main(args: argparse.Namespace, num_sms: int,
# print("lijian test dipatch end and combine start.") # print("lijian test dipatch end and combine start.")
bias_0 = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') bias_0 = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
bias_1 = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') bias_1 = torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
combine_args = {'x': recv_x, 'handle': handle, 'config': config} combine_args = {'x': recv_x, 'handle': handle, 'config': config, 'async_finish': async_mode}
if with_topk: if with_topk:
combine_args.update({'topk_weights': recv_topk_weights}) combine_args.update({'topk_weights': recv_topk_weights})
if previous_mode: if previous_mode:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment