Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
2a64c9a6
Commit
2a64c9a6
authored
Oct 16, 2025
by
tabuchixiangcai3
Browse files
[DCU]Fix memory overflow and test-didistributed in L1_pytorch_istributed_unittest
Signed-off-by:
Tangao
<
2205747538@qq.com
>
parent
a26a0c30
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
108 additions
and
87 deletions
+108
-87
tests/pytorch/debug/run_distributed.py
tests/pytorch/debug/run_distributed.py
+48
-77
tests/pytorch/distributed/run_numerics.py
tests/pytorch/distributed/run_numerics.py
+40
-8
tests/pytorch/distributed/test_numerics.py
tests/pytorch/distributed/test_numerics.py
+13
-1
transformer_engine/common/transpose/transpose.cu
transformer_engine/common/transpose/transpose.cu
+7
-1
No files found.
tests/pytorch/debug/run_distributed.py
View file @
2a64c9a6
...
...
@@ -16,7 +16,7 @@ import transformer_engine
import
transformer_engine_torch
as
tex
import
nvdlfw_inspect.api
as
debug_api
from
transformer_engine.debug
import
set_weight_tensor_tp_group_reduce
from
t
ransformer_engine.pytorch.fp8
import
FP8GlobalStateManager
from
t
orch.utils.cpp_extension
import
IS_HIP_EXTENSION
from
test_numerics
import
(
_emulate_linear
,
...
...
@@ -45,8 +45,6 @@ FEATURE_DIRS = None
all_boolean
=
[
True
,
False
]
TEST_NR
=
0
fp8_available
,
_
=
FP8GlobalStateManager
.
is_fp8_available
()
def
_get_tensors
(
parallel_mode
,
weight_seed
=
SEED
,
data_seed
=
SEED
,
tp_size
=
None
,
tp_rank
=
None
):
if
tp_size
is
None
:
...
...
@@ -72,13 +70,23 @@ def _get_tensors(parallel_mode, weight_seed=SEED, data_seed=SEED, tp_size=None,
def
_init_model
(
weight
,
parallel_mode
=
None
,
tp_group
=
None
,
name
=
"linear"
):
model
=
transformer_engine
.
pytorch
.
Linear
(
if
IS_HIP_EXTENSION
:
model
=
transformer_engine
.
pytorch
.
Linear
(
IN_SIZE
,
OUT_SIZE
,
name
=
name
,
bias
=
False
,
parallel_mode
=
parallel_mode
,
tp_group
=
(
tp_group
or
NCCL_WORLD
if
parallel_mode
else
None
),
)
)
else
:
model
=
transformer_engine
.
pytorch
.
Linear
(
IN_SIZE
,
OUT_SIZE
,
name
=
name
,
parallel_mode
=
parallel_mode
,
tp_group
=
(
tp_group
or
NCCL_WORLD
if
parallel_mode
else
None
),
)
with
torch
.
no_grad
():
model
.
weight
.
copy_
(
weight
)
return
model
...
...
@@ -223,7 +231,7 @@ def run_debug_test(func):
return
wrapper
CONFIG_LOG_TEST_DISTRIBUTED
_FP8
=
"""log_distributed:
CONFIG_LOG_TEST_DISTRIBUTED
=
"""log_distributed:
layers:
layer_types: [linear]
enabled:
...
...
@@ -243,27 +251,11 @@ CONFIG_LOG_TEST_DISTRIBUTED_FP8 = """log_distributed:
end_step: 1
"""
CONFIG_LOG_TEST_DISTRIBUTED_NO_FP8
=
"""log_distributed:
layers:
layer_types: [linear]
enabled:
True
transformer_engine:
LogTensorStats:
enabled: True
tensors: [activation, gradient, weight, output, wgrad, dgrad]
stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range]
start_step : 0
end_step: 1
"""
def
_prepare_config_test_log_distributed
(
config_file
):
if
WORLD_RANK
!=
0
:
return
config_file
.
write
(
CONFIG_LOG_TEST_DISTRIBUTED_FP8
if
fp8_available
else
CONFIG_LOG_TEST_DISTRIBUTED_NO_FP8
)
config_file
.
write
(
CONFIG_LOG_TEST_DISTRIBUTED
)
config_file
.
flush
()
...
...
@@ -364,40 +356,6 @@ def test_log_distributed(parallel_mode, gather_weight, **kwargs):
set_weight_tensor_tp_group_reduce
(
True
)
# reset
@
run_debug_test
def
sanity_test_log_quantized_stats
(
parallel_mode
,
gather_weight
,
**
kwargs
):
from
test_log
import
LOG_QUANTIZED_CONFIG
kwargs
[
"config_file"
].
write
(
LOG_QUANTIZED_CONFIG
)
kwargs
[
"config_file"
].
flush
()
_init_debug
(
kwargs
[
"config_file"
].
name
,
kwargs
[
"log_dir"
],
FEATURE_DIRS
)
set_weight_tensor_tp_group_reduce
(
gather_weight
)
if
WORLD_SIZE
%
2
!=
0
:
return
# skip
TP_SIZE
=
WORLD_SIZE
//
2
DP_SIZE
=
2
TP_RANK
=
WORLD_RANK
%
TP_SIZE
DP_RANK
=
(
WORLD_RANK
-
TP_RANK
)
//
TP_SIZE
debug_api
.
set_tensor_reduction_group
(
NCCL_WORLD
)
x
,
weight
=
_get_tensors
(
parallel_mode
,
weight_seed
=
TP_RANK
*
1234
,
data_seed
=
DP_RANK
*
1234
,
tp_size
=
TP_SIZE
,
tp_rank
=
TP_RANK
,
)
tp_group_ranks
=
[
i
for
i
in
range
(
DP_RANK
*
TP_SIZE
,
(
DP_RANK
+
1
)
*
TP_SIZE
)]
tp_group
=
dist
.
new_group
(
ranks
=
tp_group_ranks
)
model
=
_init_model
(
weight
,
parallel_mode
=
parallel_mode
,
tp_group
=
tp_group
)
_run_forward_backward
(
x
,
model
,
parallel_mode
=
parallel_mode
,
group
=
tp_group
)
set_weight_tensor_tp_group_reduce
(
True
)
# reset
@
run_debug_test
def
test_log_expert_parallel
(
**
kwargs
):
"""
...
...
@@ -413,13 +371,13 @@ def test_log_expert_parallel(**kwargs):
)
# data parallel
model
=
_init_model
(
weight
,
parallel_mode
=
None
,
name
=
"linear1"
)
model1
=
_init_model
(
weight
,
parallel_mode
=
None
,
name
=
"linear2"
)
with
transformer_engine
.
pytorch
.
fp8_autocast
(
enabled
=
fp8_availabl
e
,
fp8_recipe
=
FP8_RECIPE
):
with
transformer_engine
.
pytorch
.
fp8_autocast
(
enabled
=
Tru
e
,
fp8_recipe
=
FP8_RECIPE
):
y1
=
model
(
x
)
y2
=
model1
(
x
)
y
=
y1
+
y2
y
.
sum
().
backward
()
debug_api
.
step
()
with
transformer_engine
.
pytorch
.
fp8_autocast
(
enabled
=
fp8_availabl
e
,
fp8_recipe
=
FP8_RECIPE
):
with
transformer_engine
.
pytorch
.
fp8_autocast
(
enabled
=
Tru
e
,
fp8_recipe
=
FP8_RECIPE
):
y
=
model
(
x
)
if
WORLD_RANK
!=
0
:
y
=
y
+
model1
(
x
)
...
...
@@ -580,6 +538,9 @@ def test_fake_quant_fp8(
"dgrad_fp8"
:
not
(
dgrad_weight
or
dgrad_grad
),
"wgrad_fp8"
:
not
(
wgrad_grad
or
wgrad_input
),
}
if
IS_HIP_EXTENSION
:
if
fp8_kwargs
[
"fprop_fp8"
]
or
fp8_kwargs
[
"dgrad_fp8"
]
or
fp8_kwargs
[
"wgrad_fp8"
]:
return
# Output type 32 (FP32) does not support int8 simulation.
if
WORLD_RANK
==
0
:
fake_quant_fp8_create_config
(
fprop_inp
,
...
...
@@ -667,30 +628,40 @@ if __name__ == "__main__":
random
.
seed
(
SEED
)
_init_distributed
()
test_log_expert_parallel
()
for
parallel_mode
in
[
"column"
,
"row"
]:
for
gather_weight
in
[
True
,
False
]:
test_log_distributed
(
parallel_mode
,
gather_weight
)
if
IS_HIP_EXTENSION
:
# Output type 32 (FP32) does not support int8 simulation.
pass
else
:
test_log_expert_parallel
()
for
parallel_mode
in
[
"column"
,
"row"
]:
for
gather_weight
in
[
True
,
False
]:
test_log_distributed
(
parallel_mode
,
gather_weight
)
if
fp8_available
:
for
parallel_mode
in
[
"row"
,
"column"
]:
test_disable_fp8_layer
(
parallel_mode
)
for
parallel_mode
in
[
"row"
,
"column"
]:
test_disable_fp8_layer
(
parallel_mode
)
# test_disable_fp8_gemms
if
IS_HIP_EXTENSION
:
# Output type 32 (FP32) does not support int8 simulation.
pass
else
:
# test_disable_fp8_gemms
_run_test_with_combinations
(
test_disable_fp8_gemms
,
all_boolean
,
num_repeat
=
3
,
extra_args
=
[
"column"
,
"row"
]
)
# test_fake_quant_fp8
dtype_options
=
[
tex
.
DType
.
kFloat8E4M3
,
tex
.
DType
.
kFloat8E5M2
,
None
]
_run_test_with_combinations
(
test_fake_quant_fp8
,
dtype_options
,
num_repeat
=
6
,
extra_args
=
[
"column"
,
"row"
],
sample_size
=
20
,
)
# test_fake_quant_fp8
dtype_options
=
[
tex
.
DType
.
kFloat8E4M3
,
tex
.
DType
.
kFloat8E5M2
,
None
]
_run_test_with_combinations
(
test_fake_quant_fp8
,
dtype_options
,
num_repeat
=
6
,
extra_args
=
[
"column"
,
"row"
],
sample_size
=
20
,
)
if
IS_HIP_EXTENSION
:
# Output type 32 (FP32) does not support int8 simulation.
pass
else
:
_run_test_with_combinations
(
test_per_tensor_scaling
,
all_boolean
,
...
...
tests/pytorch/distributed/run_numerics.py
View file @
2a64c9a6
...
...
@@ -509,7 +509,7 @@ def _test_linear(parallel_mode=None, sequence_parallel=False, **kwargs):
def
test_linear
():
"""Run linear layer tests with various configurations."""
kwargs_list
=
[
base_
kwargs_list
=
[
{},
{
"bias"
:
False
},
{
"init_method"
:
_constant
},
...
...
@@ -517,11 +517,17 @@ def test_linear():
{
"return_bias"
:
True
},
{
"params_dtype"
:
torch
.
float16
},
{
"delay_wgrad_compute"
:
True
},
{
"save_original_input"
:
True
},
]
#TODO:The blockwise recipe does not currently support calculations with bias set to true.
"""
For AMD platforms, when the quantization recipe is fp8_block_scaling, iterate through base_kwargs_list,
and if the bias value is not set in kwargs or the bias value is true, set bias to false.
"""
if
IS_HIP_EXTENSION
and
QUANTIZATION
==
"fp8_block_scaling"
:
kwargs_list
=
[
kwargs
for
kwargs
in
base_kwargs_list
if
kwargs
.
get
(
"bias"
,
True
)
is
False
]
else
:
kwargs_list
=
base_kwargs_list
for
kwargs
in
kwargs_list
:
if
kwargs
.
get
(
"save_original_input"
,
False
)
and
QUANTIZATION
==
"fp8"
:
continue
for
parallel_mode
in
[
"column"
,
"row"
]:
for
sequence_parallel
in
[
False
,
True
]:
_test_linear
(
parallel_mode
,
sequence_parallel
,
**
kwargs
)
...
...
@@ -688,7 +694,7 @@ def _test_layernorm_linear(parallel_mode=None, sequence_parallel=False, **kwargs
def
test_layernorm_linear
():
kwargs_list
=
[
base_
kwargs_list
=
[
{},
{
"bias"
:
False
},
{
"init_method"
:
_constant
},
...
...
@@ -699,6 +705,15 @@ def test_layernorm_linear():
{
"return_layernorm_output"
:
True
},
{
"delay_wgrad_compute"
:
True
},
]
#TODO:The blockwise recipe does not currently support calculations with bias set to true.
"""
For AMD platforms, when the quantization recipe is fp8_block_scaling, iterate through base_kwargs_list,
and if the bias value is not set in kwargs or the bias value is true, set bias to false.
"""
if
IS_HIP_EXTENSION
and
QUANTIZATION
==
"fp8_block_scaling"
:
kwargs_list
=
[
kwargs
for
kwargs
in
base_kwargs_list
if
kwargs
.
get
(
"bias"
,
True
)
is
False
]
else
:
kwargs_list
=
base_kwargs_list
for
kwargs
in
kwargs_list
:
for
parallel_mode
in
[
"column"
]:
for
sequence_parallel
in
[
False
,
True
]:
...
...
@@ -793,7 +808,7 @@ def _test_layernorm_mlp(set_parallel_mode=None, sequence_parallel=False, **kwarg
def
test_layernorm_mlp
():
kwargs_list
=
[
base_
kwargs_list
=
[
{},
{
"init_method"
:
_constant
},
{
"output_layer_init_method"
:
_constant
},
...
...
@@ -807,7 +822,15 @@ def test_layernorm_mlp():
{
"return_layernorm_output"
:
True
},
{
"delay_wgrad_compute"
:
True
},
]
#TODO:The blockwise recipe does not currently support calculations with bias set to true.
"""
For AMD platforms, when the quantization recipe is fp8_block_scaling, iterate through base_kwargs_list,
and if the bias value is not set in kwargs or the bias value is true, set bias to false.
"""
if
IS_HIP_EXTENSION
and
QUANTIZATION
==
"fp8_block_scaling"
:
kwargs_list
=
[
kwargs
for
kwargs
in
base_kwargs_list
if
kwargs
.
get
(
"bias"
,
True
)
is
False
]
else
:
kwargs_list
=
base_kwargs_list
for
kwargs
in
kwargs_list
:
for
set_parallel_mode
in
[
True
]:
for
sequence_parallel
in
[
False
,
True
]:
...
...
@@ -882,7 +905,7 @@ def _test_transformer_layer_parallel(sequence_parallel=False, **kwargs):
def
test_transformer_layer
():
kwargs_list
=
[
base_
kwargs_list
=
[
{},
{
"num_gqa_groups"
:
4
},
{
"init_method"
:
_constant
},
...
...
@@ -902,6 +925,15 @@ def test_transformer_layer():
{
"fuse_qkv_params"
:
True
},
{
"activation"
:
"relu"
},
]
#TODO:The blockwise recipe does not currently support calculations with bias set to true.
"""
For AMD platforms, when the quantization recipe is fp8_block_scaling, iterate through base_kwargs_list,
and if the bias value is not set in kwargs or the bias value is true, set bias to false.
"""
if
IS_HIP_EXTENSION
and
QUANTIZATION
==
"fp8_block_scaling"
:
kwargs_list
=
[
kwargs
for
kwargs
in
base_kwargs_list
if
kwargs
.
get
(
"bias"
,
True
)
is
False
]
else
:
kwargs_list
=
base_kwargs_list
for
kwargs
in
kwargs_list
:
for
sequence_parallel
in
[
False
,
True
]:
...
...
tests/pytorch/distributed/test_numerics.py
View file @
2a64c9a6
...
...
@@ -9,7 +9,8 @@ from pathlib import Path
import
pytest
import
torch
from
transformer_engine.pytorch.fp8
import
FP8GlobalStateManager
from
torch.utils.cpp_extension
import
IS_HIP_EXTENSION
import
transformer_engine
as
te
"""
Distributed numerics tests
...
...
@@ -61,4 +62,15 @@ def test_distributed(quantization):
pytest
.
skip
(
reason_for_no_mxfp8
)
if
quantization
==
"fp8_block_scaling"
and
not
fp8_block_scaling_available
:
pytest
.
skip
(
reason_for_no_fp8_block_scaling
)
if
IS_HIP_EXTENSION
and
quantization
==
"fp8_block_scaling"
:
import
importlib
ori_int8_sim_fp8
=
os
.
environ
.
get
(
"NVTE_INT8_SIM_FP8"
,
"None"
)
os
.
environ
[
"NVTE_INT8_SIM_FP8"
]
=
"1"
importlib
.
reload
(
te
.
pytorch
.
fp8
)
_run_test
(
quantization
)
if
IS_HIP_EXTENSION
and
quantization
==
"fp8_block_scaling"
:
if
ori_int8_sim_fp8
is
None
or
ori_int8_sim_fp8
==
"None"
:
os
.
environ
[
"NVTE_INT8_SIM_FP8"
]
=
"0"
else
:
del
os
.
environ
[
"NVTE_INT8_SIM_FP8"
]
importlib
.
reload
(
te
.
pytorch
.
fp8
)
transformer_engine/common/transpose/transpose.cu
View file @
2a64c9a6
...
...
@@ -217,7 +217,13 @@ void transpose(const Tensor &input, const Tensor &noop, Tensor *output_, cudaStr
// Choose between runtime-compiled or statically-compiled kernel
const
bool
aligned
=
(
row_length
%
THREADS_PER_WARP
==
0
&&
num_rows
%
THREADS_PER_WARP
==
0
);
if
(
aligned
&&
rtc
::
is_enabled
())
{
// Runtime-compiled tuned kernel
//TODO:Using RTC may cause kernel crashes. Therefore, set use_rtc to true to avoid using RTC and resolve the kernel crash issue.
#ifdef USE_ROCM
const
bool
use_rtc
=
false
;
#else
const
bool
use_rtc
=
true
;
#endif
if
(
aligned
&&
rtc
::
is_enabled
()
&&
use_rtc
)
{
// Runtime-compiled tuned kernel
// Pick kernel config
std
::
vector
<
KernelConfig
>
kernel_configs
;
kernel_configs
.
reserve
(
16
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment