OpenDAS / TransformerEngine

Commit b9ec4909
Authored Apr 17, 2025 by yuguo
Parent: 7e1270f7

[DCU] tmp fix overlap test
Showing 5 changed files with 23 additions and 13 deletions.
tests/pytorch/distributed/run_layer_with_overlap.py (+2, -6)
tests/pytorch/distributed/test_comm_gemm_overlap.py (+9, -6)
transformer_engine/common/CMakeLists.txt (+2, -0)
transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp (+0, -1)
transformer_engine/common/util/logging.h (+10, -0)
tests/pytorch/distributed/run_layer_with_overlap.py

@@ -3,8 +3,6 @@
 # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
-# UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=4096 --batch-size=2 --num-heads=96 --head-dim=128 --layer-type LayerNormLinear --linear-parallel-mode column --num-layers 1 --overlap-rs-dgrad
-# NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=1 UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=4096 --batch-size=2 --num-heads=96 --head-dim=128 --layer-type MultiheadAttention --num-layers 1 --overlap-rs-dgrad
 import os
 import sys

@@ -376,8 +374,6 @@ def _train(opts):
     ub_cfgs = {
         "qkv_dgrad": {"method": "ring_exchange"},
         "fc1_dgrad": {"method": "ring_exchange"},
-        "proj_fprop": {"method": "ring_exchange"},
-        "fc2_fprop": {"method": "ring_exchange"},
     }
     te.module.base.initialize_ub(
         [opts.seq_length * opts.batch_size, opts.num_heads * opts.head_dim],

@@ -505,13 +501,13 @@ def _train(opts):
             test_graph.replay()
         else:
             test_out = run_fwd_bwd(ref_model, ref_x)
-    torch.cuda.cudart().cudaProfilerStart()
+    torch.cuda.synchronize()
     for _ in range(opts.benchmark_iter):
         if opts.use_cuda_graphs:
             test_graph.replay()
         else:
             test_out = run_fwd_bwd(test_model, test_x)
-    torch.cuda.cudart().cudaProfilerStop()
+    torch.cuda.synchronize()
     if opts.use_cuda_graphs:
         del test_graph
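Note on the hunk above: torch.cuda.cudart().cudaProfilerStart()/cudaProfilerStop() are thin Python wrappers over the CUDA runtime's profiler hooks, which this temporary DCU fix replaces with plain torch.cuda.synchronize() calls. For reference only (not part of this commit), a minimal C++ sketch of those underlying hooks, assuming a CUDA toolchain with cuda_profiler_api.h:

#include <cuda_profiler_api.h>  // cudaProfilerStart / cudaProfilerStop
#include <cuda_runtime.h>

int main() {
  cudaProfilerStart();        // begin capture when running under a profiler
  // ... launch the kernels to be profiled here ...
  cudaDeviceSynchronize();    // ensure the timed work has completed
  cudaProfilerStop();         // end capture
  return 0;
}

On the DCU stack these hooks are presumably unavailable or a no-op, so the test keeps only the synchronization that the profiler bracket implied.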
tests/pytorch/distributed/test_comm_gemm_overlap.py

 # Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # See LICENSE for license information.
-# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=2048 --batch-size=2 --num-heads=96 --head-dim=128 --comm-type=AG --p2p
+# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --comm-type=AG --p2p
-# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=2048 --batch-size=2 --num-heads=96 --head-dim=128 --comm-type=RS --p2p
+# mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_gemm_with_overlap.py --check-numerics --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --comm-type=RS --p2p
+# UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --layer-type LayerNormLinear --linear-parallel-mode column --num-layers 1 --overlap-rs-dgrad
+# NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=1 UB_SKIPMC=1 mpirun -np 8 --allow-run-as-root --oversubscribe --quiet python3 /home/TransformerEngine/tests/pytorch/distributed/run_layer_with_overlap.py --seed=42 --seq-length=1024 --batch-size=2 --num-heads=48 --head-dim=64 --layer-type MultiheadAttention --num-layers 1 --overlap-rs-dgrad
 import os
 import subprocess
 from pathlib import Path

@@ -20,10 +23,10 @@ if torch.cuda.device_count() < 2:
 fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
 RNG_SEED: int = 42
-SEQ_LENGTH: int = 2048
+SEQ_LENGTH: int = 1024
 BATCH_SIZE: int = 2
-NUM_HEADS: int = 96
+NUM_HEADS: int = 48
-HEAD_DIM: int = 128
+HEAD_DIM: int = 64
 TE_LAYERS = [
     te.Linear,
     te.LayerNormLinear,

@@ -40,7 +43,7 @@ TEST_ROOT = Path(__file__).parent.resolve()
 NUM_PROCS: int = min(torch.cuda.device_count(), MAX_GPUS_TO_USE)
 LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"]
 if tex.ubuf_built_with_mpi():
-    LAUNCH_CMD = ["mpirun", "-np", str(NUM_PROCS), "--allow-run-as-root", "--oversubscribe", "--quiet", "python3"]
+    LAUNCH_CMD = ["mpirun", "-np", str(NUM_PROCS), "-x", "NVTE_FUSED_ATTN=0", "-x", "NVTE_FLASH_ATTN=1", "-x", "UB_SKIPMC=1", "--allow-run-as-root", "--oversubscribe", "--quiet", "python3"]

 # Fall back on CUDA IPC if the platform does not support CUDA multicast
 if not tex.device_supports_multicast():
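The rewritten LAUNCH_CMD relies on OpenMPI's -x flag, which exports the named environment variables to every launched rank. A minimal C++ sketch (illustration only, not part of this commit) of how a rank observes such a variable:

#include <cstdio>
#include <cstdlib>

int main() {
  // Launched as: mpirun -np 8 -x NVTE_FUSED_ATTN=0 ./a.out
  // OpenMPI places the exported variable in each rank's environment.
  const char *v = std::getenv("NVTE_FUSED_ATTN");
  std::printf("NVTE_FUSED_ATTN=%s\n", v ? v : "(unset)");
  return 0;
}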
transformer_engine/common/CMakeLists.txt

@@ -245,6 +245,8 @@ else()
     find_package(hip)
     list(APPEND transformer_engine_LINKER_LIBS hip::host hip::device roctx64)
+    find_package(rccl)
+    list(APPEND transformer_engine_LINKER_LIBS rccl)
     if(USE_HIPBLASLT)
         find_package(hipblaslt)
         find_package(hipblas REQUIRED PATHS ${ROCM_PATH})
transformer_engine/common/comm_gemm_overlap/comm_gemm_overlap.cpp

@@ -880,7 +880,6 @@ void CommOverlapP2PBase::split_overlap_ag(const TensorWrapper &A, bool transa,
                                   _ubufs[_tp_id].numel() * _ubufs[_tp_id].element_size(),
                                   cudaMemcpyDeviceToDevice, _stream_send[0]));
     }
-    NVTE_CHECK_CUDA(cudaDeviceSynchronize());
   }
 }
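The deleted line was a device-wide barrier after the send loop in split_overlap_ag. For contrast, a standalone sketch (plain CUDA runtime, not the TransformerEngine code) of the difference between stream-scoped and device-wide synchronization:

#include <cuda_runtime.h>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // ... enqueue async copies or kernels on `stream` ...
  cudaStreamSynchronize(stream);  // waits only for work queued on this stream
  // cudaDeviceSynchronize() (the call removed above) would instead block
  // until every stream on the device is idle.
  cudaStreamDestroy(stream);
  return 0;
}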
transformer_engine/common/util/logging.h

@@ -9,6 +9,7 @@
 #include <cuda_runtime_api.h>

 #ifdef __HIP_PLATFORM_AMD__
+#include <rccl.h>
 #ifdef USE_HIPBLASLT
 #include <hipblaslt/hipblaslt.h>
 #endif

@@ -41,6 +42,15 @@
   } \
 } while (false)

+#define NCCLCHECK(cmd) do { \
+    ncclResult_t r = cmd; \
+    if (r != ncclSuccess) { \
+      printf("NCCL error %s:%d: '%s'\n", __FILE__, __LINE__, \
+             ncclGetErrorString(r)); \
+      exit(EXIT_FAILURE); \
+    } \
+  } while(0)
+
 #define NVTE_CHECK_CUDA(expr) \
   do { \
     const cudaError_t status_NVTE_CHECK_CUDA = (expr); \
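The new NCCLCHECK macro mirrors the existing NVTE_CHECK_CUDA pattern for RCCL/NCCL calls: it aborts with file and line on any result other than ncclSuccess. A minimal usage sketch (assuming an NCCL or RCCL toolchain; ncclGetVersion is a standard API call and not part of this commit):

#include <cstdio>
#include <cstdlib>
#include <nccl.h>  // <rccl.h> on ROCm; both expose the same ncclResult_t API

#define NCCLCHECK(cmd) do { \
    ncclResult_t r = cmd; \
    if (r != ncclSuccess) { \
      printf("NCCL error %s:%d: '%s'\n", __FILE__, __LINE__, \
             ncclGetErrorString(r)); \
      exit(EXIT_FAILURE); \
    } \
  } while(0)

int main() {
  int version = 0;
  NCCLCHECK(ncclGetVersion(&version));  // any call returning ncclResult_t works
  printf("NCCL/RCCL version: %d\n", version);
  return 0;
}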