Unverified commit 5b549c85, authored by Guangguan Wang and committed by GitHub
Browse files

Minor patches for deepep (#318)



* Add arg --pressure-test for test_low_latency.py

Add arg --pressure-test for test_low_latency.py
Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>

* Export NVSHMEM_QP_DEPTH

Export NVSHMEM_QP_DEPTH
Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>

---------
Signed-off-by: Guangguan Wang <guangguan.wang@linux.alibaba.com>
parent f9c06bb0
...@@ -85,7 +85,7 @@ class Buffer: ...@@ -85,7 +85,7 @@ class Buffer:
os.environ['NVSHMEM_IB_ENABLE_IBGDA'] = '1' os.environ['NVSHMEM_IB_ENABLE_IBGDA'] = '1'
os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}' os.environ['NVSHMEM_IBGDA_NUM_RC_PER_PE'] = f'{num_qps_per_rank}'
# Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check # Make sure QP depth is always larger than the number of on-flight WRs, so that we can skip WQ slot check
os.environ['NVSHMEM_QP_DEPTH'] = '1024' os.environ['NVSHMEM_QP_DEPTH'] = os.environ.get('NVSHMEM_QP_DEPTH', '1024')
# Reduce gpu memory usage # Reduce gpu memory usage
# 6 default teams + 1 extra team # 6 default teams + 1 extra team
......
...@@ -164,7 +164,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace): ...@@ -164,7 +164,7 @@ def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer, test_main(num_tokens, hidden, num_experts, num_topk, rank, num_ranks, group, buffer,
use_logfmt=args.use_logfmt, seed=1) use_logfmt=args.use_logfmt, seed=1)
do_pressure_test = False do_pressure_test = args.pressure_test
for seed in range(int(1e9) if do_pressure_test else 0): for seed in range(int(1e9) if do_pressure_test else 0):
if local_rank == 0: if local_rank == 0:
print(f'Testing with seed {seed} ...', flush=True) print(f'Testing with seed {seed} ...', flush=True)
...@@ -198,6 +198,8 @@ if __name__ == '__main__': ...@@ -198,6 +198,8 @@ if __name__ == '__main__':
help='Whether to disable NVLink for testing') help='Whether to disable NVLink for testing')
parser.add_argument('--use-logfmt', action='store_true', parser.add_argument('--use-logfmt', action='store_true',
help='Whether to test LogFMT combine') help='Whether to test LogFMT combine')
parser.add_argument("--pressure-test", action='store_true',
help='Whether to do pressure test')
args = parser.parse_args() args = parser.parse_args()
num_processes = args.num_processes num_processes = args.num_processes
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment