Commit b9ec4909 authored by yuguo

[DCU] Temporary fix for the overlap tests

parent 7e1270f7
......@@ -3,8 +3,6 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=4096 --batch-size=2 --num-heads=96 --head-dim=128 --layer-type LayerNormLinear --linear-parallel-mode column --num-layers 1 --overlap-rs-dgrad
# NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=1 UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=4096 --batch-size=2 --num-heads=96 --head-dim=128 --layer-type MultiheadAttention --num-layers 1 --overlap-rs-dgrad
import os
import sys
......@@ -376,8 +374,6 @@ def _train(opts):
ub_cfgs = {
"qkv_dgrad": {"method": "ring_exchange"},
"fc1_dgrad": {"method": "ring_exchange"},
"proj_fprop": {"method": "ring_exchange"},
"fc2_fprop": {"method": "ring_exchange"},
}
te.module.base.initialize_ub(
[opts.seq_length * opts.batch_size, opts.num_heads * opts.head_dim],
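Note: after this change only the dgrad overlaps keep the ring_exchange method; the proj_fprop and fc2_fprop entries removed above fall back to initialize_ub's defaults. A minimal sketch of how such a config reaches initialize_ub, assuming a tensor-parallel size of 8 and keyword names mirroring the call in this test (the exact signature is not shown in this diff):

```python
import torch
import transformer_engine.pytorch as te

# Only the dgrad overlaps are pinned to ring_exchange; entries omitted from
# ub_cfgs use whatever method initialize_ub picks by default.
ub_cfgs = {
    "qkv_dgrad": {"method": "ring_exchange"},
    "fc1_dgrad": {"method": "ring_exchange"},
}

# Buffer shape is [seq_length * batch_size, num_heads * head_dim]; the keyword
# arguments below are assumptions for illustration, not a definitive API.
te.module.base.initialize_ub(
    [4096 * 2, 96 * 128],
    8,                      # tensor-parallel world size (assumed)
    use_fp8=False,
    dtype=torch.bfloat16,
    ub_cfgs=ub_cfgs,
)
```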
......@@ -505,13 +501,13 @@ def _train(opts):
test_graph.replay()
else:
test_out = run_fwd_bwd(ref_model, ref_x)
torch.cuda.cudart().cudaProfilerStart()
torch.cuda.synchronize()
for _ in range(opts.benchmark_iter):
if opts.use_cuda_graphs:
test_graph.replay()
else:
test_out = run_fwd_bwd(test_model, test_x)
torch.cuda.cudart().cudaProfilerStop()
torch.cuda.synchronize()
if opts.use_cuda_graphs:
del test_graph
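Note: the cudaProfilerStart/Stop pair is replaced with plain torch.cuda.synchronize() calls, so the benchmark loop no longer depends on an attached CUDA profiler. A hedged sketch of the resulting timing pattern; the function and argument names are assumptions mirroring the test, and the event-based timing is illustrative rather than part of the diff:

```python
import torch

def benchmark(run_fwd_bwd, model, x, iters, use_cuda_graphs=False, graph=None):
    torch.cuda.synchronize()                   # replaces cudaProfilerStart()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        if use_cuda_graphs:
            graph.replay()                     # replay the captured fwd/bwd graph
        else:
            run_fwd_bwd(model, x)              # eager forward + backward pass
    end.record()
    torch.cuda.synchronize()                   # replaces cudaProfilerStop()
    return start.elapsed_time(end) / iters     # milliseconds per iteration
```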
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=2048 --batch-size=2 --num-heads=96 --head-dim=128 --comm-type=AG --p2p
# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=2048 --batch-size=2 --num-heads=96 --head-dim=128 --comm-type=RS --p2p
# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --comm-type=AG --p2p
# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --comm-type=RS --p2p
# UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --layer-type LayerNormLinear --linear-parallel-mode column --num-layers 1 --overlap-rs-dgrad
# NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=1 UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --layer-type MultiheadAttention --num-layers 1 --overlap-rs-dgrad
import os
import subprocess
from pathlib import Path
......@@ -20,10 +23,10 @@ if torch.cuda.device_count() < 2:
fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
RNG_SEED: int = 42
SEQ_LENGTH: int = 2048
SEQ_LENGTH: int = 1024
BATCH_SIZE: int = 2
NUM_HEADS: int = 96
HEAD_DIM: int = 128
NUM_HEADS: int = 48
HEAD_DIM: int = 64
TE_LAYERS = [
te.Linear,
te.LayerNormLinear,
......@@ -40,7 +43,7 @@ TEST_ROOT = Path(__file__).parent.resolve()
NUM_PROCS: int = min(torch.cuda.device_count(), MAX_GPUS_TO_USE)
LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"]
if tex.ubuf_built_with_mpi():
LAUNCH_CMD = ["mpirun", "-np", str(NUM_PROCS), "--allow-run-as-root", "--oversubscribe", "--quiet", "python3"]
LAUNCH_CMD = ["mpirun", "-np", str(NUM_PROCS), "-x", "NVTE_FUSED_ATTN=0", "-x", "NVTE_FLASH_ATTN=1", "-x", "UB_SKIPMC=1", "--allow-run-as-root", "--oversubscribe", "--quiet", "python3"]
# Fall back on CUDA IPC if the platform does not support CUDA multicast
if not tex.device_supports_multicast():
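Note: the MPI launch path now exports NVTE_FUSED_ATTN=0, NVTE_FLASH_ATTN=1, and UB_SKIPMC=1 into the worker ranks with -x, whereas torchrun workers inherit the parent environment directly. A sketch of the two launch paths under those assumptions (the helper name and the MAX_GPUS_TO_USE value of 8 are illustrative):

```python
import os
import torch

NUM_PROCS = min(torch.cuda.device_count(), 8)  # MAX_GPUS_TO_USE assumed to be 8

def build_launch_cmd(built_with_mpi: bool) -> list:
    if built_with_mpi:
        # mpirun is not guaranteed to forward arbitrary parent environment
        # variables to every rank, so each override is exported with -x.
        return [
            "mpirun", "-np", str(NUM_PROCS),
            "-x", "NVTE_FUSED_ATTN=0", "-x", "NVTE_FLASH_ATTN=1", "-x", "UB_SKIPMC=1",
            "--allow-run-as-root", "--oversubscribe", "--quiet", "python3",
        ]
    # torchrun children inherit os.environ, so setting the variables once in
    # the launching process is sufficient.
    os.environ.update({"NVTE_FUSED_ATTN": "0", "NVTE_FLASH_ATTN": "1", "UB_SKIPMC": "1"})
    return ["torchrun", f"--nproc_per_node={NUM_PROCS}"]
```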
......
......@@ -245,6 +245,8 @@ else()
find_package(hip)
list(APPEND transformer_engine_LINKER_LIBS hip::host hip::device roctx64)
find_package(rccl)
list(APPEND transformer_engine_LINKER_LIBS rccl)
if(USE_HIPBLASLT)
find_package(hipblaslt)
find_package(hipblas REQUIRED PATHS ${ROCM_PATH})
......
......@@ -880,7 +880,6 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa,
_ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(),
cudaMemcpyDeviceToDevice, _stream_send[0]));
}
NVTE_CHECK_CUDA(cudaDeviceSynchronize());
}
}
......
......@@ -9,6 +9,7 @@
#include <cuda_runtime_api.h>
#ifdef __HIP_PLATFORM_AMD__
#include <rccl.h>
#ifdef USE_HIPBLASLT
#include <hipblaslt/hipblaslt.h>
#endif
......@@ -41,6 +42,15 @@
} \
} while (false)
#define NCCLCHECK(cmd) do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("NCCL error %s:%d: '%s'\n", __FILE__, __LINE__, \
ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while(0)
#define NVTE_CHECK_CUDA(expr) \
do { \
const cudaError_t status_NVTE_CHECK_CUDA = (expr); \
......