Commit b9ec4909 authored by yuguo
Browse files

[DCU] tmp fix overlap test

parent 7e1270f7
...@@ -3,8 +3,6 @@ ...@@ -3,8 +3,6 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
# UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=4096 --batch-size=2 --num-heads=96 --head-dim=128 --layer-type LayerNormLinear --linear-parallel-mode column --num-layers 1 --overlap-rs-dgrad
# NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=1 UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=4096 --batch-size=2 --num-heads=96 --head-dim=128 --layer-type MultiheadAttention --num-layers 1 --overlap-rs-dgrad
import os import os
import sys import sys
...@@ -376,8 +374,6 @@ def _train(opts): ...@@ -376,8 +374,6 @@ def _train(opts):
ub_cfgs = { ub_cfgs = {
"qkv_dgrad": {"method": "ring_exchange"}, "qkv_dgrad": {"method": "ring_exchange"},
"fc1_dgrad": {"method": "ring_exchange"}, "fc1_dgrad": {"method": "ring_exchange"},
"proj_fprop": {"method": "ring_exchange"},
"fc2_fprop": {"method": "ring_exchange"},
} }
te.module.base.initialize_ub( te.module.base.initialize_ub(
[opts.seq_length * opts.batch_size, opts.num_heads * opts.head_dim], [opts.seq_length * opts.batch_size, opts.num_heads * opts.head_dim],
...@@ -505,13 +501,13 @@ def _train(opts): ...@@ -505,13 +501,13 @@ def _train(opts):
test_graph.replay() test_graph.replay()
else: else:
test_out = run_fwd_bwd(ref_model, ref_x) test_out = run_fwd_bwd(ref_model, ref_x)
torch.cuda.cudart().cudaProfilerStart() torch.cuda.synchronize()
for _ in range(opts.benchmark_iter): for _ in range(opts.benchmark_iter):
if opts.use_cuda_graphs: if opts.use_cuda_graphs:
test_graph.replay() test_graph.replay()
else: else:
test_out = run_fwd_bwd(test_model, test_x) test_out = run_fwd_bwd(test_model, test_x)
torch.cuda.cudart().cudaProfilerStop() torch.cuda.synchronize()
if opts.use_cuda_graphs: if opts.use_cuda_graphs:
del test_graph del test_graph
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# #
# See LICENSE for license information. # See LICENSE for license information.
# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=2048 --batch-size=2 --num-heads=96 --head-dim=128 --comm-type=AG --p2p # mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --comm-type=AG --p2p
# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=2048 --batch-size=2 --num-heads=96 --head-dim=128 --comm-type=RS --p2p # mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --comm-type=RS --p2p
# UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --layer-type LayerNormLinear --linear-parallel-mode column --num-layers 1 --overlap-rs-dgrad
# NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=1 UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --layer-type MultiheadAttention --num-layers 1 --overlap-rs-dgrad
import os import os
import subprocess import subprocess
from pathlib import Path from pathlib import Path
...@@ -20,10 +23,10 @@ if torch.cuda.device_count() < 2: ...@@ -20,10 +23,10 @@ if torch.cuda.device_count() < 2:
fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available() fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
RNG_SEED: int = 42 RNG_SEED: int = 42
SEQ_LENGTH: int = 2048 SEQ_LENGTH: int = 1024
BATCH_SIZE: int = 2 BATCH_SIZE: int = 2
NUM_HEADS: int = 96 NUM_HEADS: int = 48
HEAD_DIM: int = 128 HEAD_DIM: int = 64
TE_LAYERS = [ TE_LAYERS = [
te.Linear, te.Linear,
te.LayerNormLinear, te.LayerNormLinear,
...@@ -40,7 +43,7 @@ TEST_ROOT = Path(__file__).parent.resolve() ...@@ -40,7 +43,7 @@ TEST_ROOT = Path(__file__).parent.resolve()
NUM_PROCS: int = min(torch.cuda.device_count(), MAX_GPUS_TO_USE) NUM_PROCS: int = min(torch.cuda.device_count(), MAX_GPUS_TO_USE)
LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"] LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"]
if tex.ubuf_built_with_mpi(): if tex.ubuf_built_with_mpi():
LAUNCH_CMD = ["mpirun", "-np", str(NUM_PROCS), "--allow-run-as-root", "--oversubscribe", "--quiet", "python3"] LAUNCH_CMD = ["mpirun", "-np", str(NUM_PROCS), "-x", "NVTE_FUSED_ATTN=0", "-x", "NVTE_FLASH_ATTN=1", "-x", "UB_SKIPMC=1", "--allow-run-as-root", "--oversubscribe", "--quiet", "python3"]
# Fall back on CUDA IPC if the platform does not support CUDA multicast # Fall back on CUDA IPC if the platform does not support CUDA multicast
if not tex.device_supports_multicast(): if not tex.device_supports_multicast():
......
...@@ -245,6 +245,8 @@ else() ...@@ -245,6 +245,8 @@ else()
find_package(hip) find_package(hip)
list(APPEND transformer_engine_LINKER_LIBS hip::host hip::device roctx64) list(APPEND transformer_engine_LINKER_LIBS hip::host hip::device roctx64)
find_package(rccl)
list(APPEND transformer_engine_LINKER_LIBS rccl)
if(USE_HIPBLASLT) if(USE_HIPBLASLT)
find_package(hipblaslt) find_package(hipblaslt)
find_package(hipblas REQUIRED PATHS ${ROCM_PATH}) find_package(hipblas REQUIRED PATHS ${ROCM_PATH})
......
...@@ -880,7 +880,6 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa, ...@@ -880,7 +880,6 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa,
_ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(), _ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(),
cudaMemcpyDeviceToDevice, _stream_send[0])); cudaMemcpyDeviceToDevice, _stream_send[0]));
} }
NVTE_CHECK_CUDA(cudaDeviceSynchronize());
} }
} }
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#ifdef __HIP_PLATFORM_AMD__ #ifdef __HIP_PLATFORM_AMD__
#include <rccl.h>
#ifdef USE_HIPBLASLT #ifdef USE_HIPBLASLT
#include <hipblaslt/hipblaslt.h> #include <hipblaslt/hipblaslt.h>
#endif #endif
...@@ -41,6 +42,15 @@ ...@@ -41,6 +42,15 @@
} \ } \
} while (false) } while (false)
// Evaluates an NCCL/RCCL call exactly once and aborts the process if it fails.
// On any result other than ncclSuccess, the file, line and the
// ncclGetErrorString() text are reported on stderr and the process exits with
// EXIT_FAILURE.
// The result is stored under a macro-specific name (the same convention
// NVTE_CHECK_CUDA uses) so that an argument expression referring to a caller
// variable named `r` is not shadowed by the macro's own local.
#define NCCLCHECK(cmd)                                                \
  do {                                                                \
    const ncclResult_t status_NCCLCHECK = (cmd);                      \
    if (status_NCCLCHECK != ncclSuccess) {                            \
      fprintf(stderr, "NCCL error %s:%d: '%s'\n", __FILE__, __LINE__, \
              ncclGetErrorString(status_NCCLCHECK));                  \
      exit(EXIT_FAILURE);                                             \
    }                                                                 \
  } while (false)
#define NVTE_CHECK_CUDA(expr) \ #define NVTE_CHECK_CUDA(expr) \
do { \ do { \
const cudaError_t status_NVTE_CHECK_CUDA = (expr); \ const cudaError_t status_NVTE_CHECK_CUDA = (expr); \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment