Commit 0b14d3b2 authored by lijian6
Browse files

1. 修复使用函数获取num_nvl_bytes, num_rdma_bytes变量的bug.


2. 修改测试脚本,降低显存占用。使用量从17G -> 8G.
Signed-off-by: lijian <lijian6@sugon.com>
parent 447238de
...@@ -6,8 +6,9 @@ export UCX_TLS=rc,rocm ...@@ -6,8 +6,9 @@ export UCX_TLS=rc,rocm
export OMPI_MCA_rmaps_base_mapping_policy="slot:numa" export OMPI_MCA_rmaps_base_mapping_policy="slot:numa"
export ROCSHMEM_MAX_NUM_CONTEXTS=32 export ROCSHMEM_MAX_NUM_CONTEXTS=32
export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384 export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1 export UCX_NET_DEVICES=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCSHMEM_HEAP_SIZE=10737418240 # export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
export ROCSHMEM_HEAP_SIZE=536870912
export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py torchrun --nproc-per-node=1 --nnodes=2 --node-rank=0 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
...@@ -6,8 +6,9 @@ export UCX_TLS=rc,rocm ...@@ -6,8 +6,9 @@ export UCX_TLS=rc,rocm
export OMPI_MCA_rmaps_base_mapping_policy="slot:numa" export OMPI_MCA_rmaps_base_mapping_policy="slot:numa"
export ROCSHMEM_MAX_NUM_CONTEXTS=32 export ROCSHMEM_MAX_NUM_CONTEXTS=32
export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384 export UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384
export UCX_NET_DEVICES=mlx5_2:1,mlx5_4:1,mlx5_6:1,mlx5_8:1 export UCX_NET_DEVICES=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCSHMEM_HEAP_SIZE=10737418240 # export ROCSHMEM_HEAP_SIZE=536870912 805306368 10737418240
export ROCSHMEM_HEAP_SIZE=536870912
export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH export PYTHONPATH=/public/home/lishen/Tmp/DeepEP:$PYTHONPATH
torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py torchrun --nproc-per-node=1 --nnodes=2 --node-rank=1 --master-addr="10.16.1.37" --master-port=1234 tests/test_internode.py
...@@ -27,7 +27,7 @@ class Buffer: ...@@ -27,7 +27,7 @@ class Buffer:
runtime: the C++ runtime. runtime: the C++ runtime.
""" """
num_sms: int = 20 num_sms: int = 24
def __init__( def __init__(
self, self,
...@@ -253,7 +253,8 @@ class Buffer: ...@@ -253,7 +253,8 @@ class Buffer:
2: Config(Buffer.num_sms, 24, 256, 6, 128), 2: Config(Buffer.num_sms, 24, 256, 6, 128),
4: Config(Buffer.num_sms, 6, 256, 6, 128), 4: Config(Buffer.num_sms, 6, 256, 6, 128),
8: Config(Buffer.num_sms, 6, 256, 6, 128), 8: Config(Buffer.num_sms, 6, 256, 6, 128),
16: Config(Buffer.num_sms, 36, 288, 20, 128), # 16: Config(Buffer.num_sms, 36, 288, 20, 128),
16: Config(Buffer.num_sms, 8, 512, 16, 128),
24: Config(Buffer.num_sms, 8, 288, 32, 128), 24: Config(Buffer.num_sms, 8, 288, 32, 128),
32: Config(Buffer.num_sms, 32, 288, 32, 128), 32: Config(Buffer.num_sms, 32, 288, 32, 128),
64: Config(Buffer.num_sms, 20, 288, 28, 128), 64: Config(Buffer.num_sms, 20, 288, 28, 128),
...@@ -281,7 +282,8 @@ class Buffer: ...@@ -281,7 +282,8 @@ class Buffer:
2: Config(Buffer.num_sms, 10, 256, 6, 128), 2: Config(Buffer.num_sms, 10, 256, 6, 128),
4: Config(Buffer.num_sms, 9, 256, 6, 128), 4: Config(Buffer.num_sms, 9, 256, 6, 128),
8: Config(Buffer.num_sms, 4, 256, 6, 128), 8: Config(Buffer.num_sms, 4, 256, 6, 128),
16: Config(Buffer.num_sms, 4, 288, 12, 128), # 16: Config(Buffer.num_sms, 4, 288, 12, 128),
16: Config(Buffer.num_sms, 8, 512, 16, 128),
24: Config(Buffer.num_sms, 1, 288, 8, 128), 24: Config(Buffer.num_sms, 1, 288, 8, 128),
32: Config(Buffer.num_sms, 1, 288, 8, 128), 32: Config(Buffer.num_sms, 1, 288, 8, 128),
64: Config(Buffer.num_sms, 1, 288, 20, 128), 64: Config(Buffer.num_sms, 1, 288, 20, 128),
......
...@@ -2,6 +2,9 @@ import os ...@@ -2,6 +2,9 @@ import os
import subprocess import subprocess
import setuptools import setuptools
from torch.utils.cpp_extension import BuildExtension, CUDAExtension from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from datetime import datetime
date_tag = datetime.now().strftime("%Y%m%d")
if __name__ == '__main__': if __name__ == '__main__':
try: try:
...@@ -12,7 +15,7 @@ if __name__ == '__main__': ...@@ -12,7 +15,7 @@ if __name__ == '__main__':
setuptools.setup( setuptools.setup(
name='deep_ep', name='deep_ep',
version='1.0.0' + revision, version='1.0.0' + revision + '.' + date_tag,
packages=setuptools.find_packages(include=['deep_ep']), packages=setuptools.find_packages(include=['deep_ep']),
include_package_data=True, include_package_data=True,
package_data={"deep_ep": ["deep_ep_cpp.cpython-310-x86_64-linux-gnu.so"]}, package_data={"deep_ep": ["deep_ep_cpp.cpython-310-x86_64-linux-gnu.so"]},
......
...@@ -193,7 +193,6 @@ def test_main(args: argparse.Namespace, num_sms: int, ...@@ -193,7 +193,6 @@ def test_main(args: argparse.Namespace, num_sms: int,
if skip_benchmark: if skip_benchmark:
return hash_value return hash_value
# print("benchmark start:")
# Tune dispatch performance # Tune dispatch performance
best_dispatch_results = None best_dispatch_results = None
fp8_factor = (1 + 4 / 128) / 2 fp8_factor = (1 + 4 / 128) / 2
...@@ -253,6 +252,10 @@ def test_main(args: argparse.Namespace, num_sms: int, ...@@ -253,6 +252,10 @@ def test_main(args: argparse.Namespace, num_sms: int,
print('', flush=True) print('', flush=True)
return hash_value return hash_value
def get_hidden_bytes(args: argparse.Namespace) -> int:
    """Return the number of bytes one token's hidden vector occupies in bfloat16.

    The result is passed to the buffer-size hint functions to size the
    NVL/RDMA buffers.

    Args:
        args: Parsed test arguments. Only ``args.hidden`` is read.

    Returns:
        ``args.hidden`` multiplied by the bfloat16 element size, with the
        element size clamped to a minimum of 2 bytes.
    """
    # Previously a full (num_tokens, hidden) tensor was materialized just to
    # read its second dimension and element size. A zero-element tensor gives
    # the same element size without allocating token data, and the width is
    # simply args.hidden.
    element_size = torch.empty(0, dtype=torch.bfloat16).element_size()
    return args.hidden * max(element_size, 2)
# noinspection PyUnboundLocalVariable,PyShadowingNames # noinspection PyUnboundLocalVariable,PyShadowingNames
def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
...@@ -261,10 +264,16 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): ...@@ -261,10 +264,16 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
if args.test_ll_compatibility: if args.test_ll_compatibility:
ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9 ll_num_tokens, ll_hidden, ll_num_experts, ll_num_topk = 16, 5120, 256, 9
num_sms = 30 num_sms = 24
num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if args.test_ll_compatibility else 0) num_qps_per_rank = max(num_sms, ll_num_experts // num_ranks if args.test_ll_compatibility else 0)
buffer = deep_ep.Buffer(group, int(2e9), int(1e9), low_latency_mode=args.test_ll_compatibility, hidden_bytes = get_hidden_bytes(args)
num_nvl_bytes, num_rdma_bytes = 0, 0
for config in (deep_ep.Buffer.get_dispatch_config(group.size()), deep_ep.Buffer.get_combine_config(group.size())):
num_nvl_bytes = max(config.get_nvl_buffer_size_hint(hidden_bytes, group.size()), num_nvl_bytes)
num_rdma_bytes = max(config.get_rdma_buffer_size_hint(hidden_bytes, group.size()), num_rdma_bytes)
buffer = deep_ep.Buffer(group, num_nvl_bytes, num_rdma_bytes, low_latency_mode=args.test_ll_compatibility,
num_qps_per_rank=num_qps_per_rank, explicitly_destroy=True, use_default_stream_as_comm_stream=False) num_qps_per_rank=num_qps_per_rank, explicitly_destroy=True, use_default_stream_as_comm_stream=False)
assert num_local_ranks == 8 and num_ranks > 8 assert num_local_ranks == 8 and num_ranks > 8
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment