Unverified Commit 2645eaec authored by Paweł Gadziński, committed by GitHub

[Pytorch] NVIDIA-DL-Framework-Inspect support – part 3 – tests (#1612)



* tests drop
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* move dir
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* tests fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Przemek Tredak <ptredak@nvidia.com>
Co-authored-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 1d903f5e
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
: ${TE_PATH:=/opt/transformerengine}
: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
: ${NVTE_TEST_NVINSPECT_CONFIGS_DIR:=$TE_PATH/tests/pytorch/debug/test_configs/}
# Config with the dummy feature which prevents nvinspect from being disabled.
# Nvinspect will be disabled if no feature is active.
: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
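# (For reference: the dummy config added in this PR enables a no-op TestDummyFeature on
# every layer via layer_name_regex_pattern: .*, which keeps the debug hooks active for
# the whole run.)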
FAIL=0
pip install pytest==8.2.1
pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
# standard numerics tests with initialized debug
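# (The extra flags below are presumably there to keep the run deterministic and
# comparable: PYTORCH_JIT=0 and NVTE_TORCH_COMPILE=0 disable JIT/compile paths, and
# NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 disallows nondeterministic kernels.)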
NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
exit $FAIL
@@ -20,6 +20,7 @@ FAILED_CASES=""
: ${XML_LOG_DIR:=/logs}
mkdir -p "$XML_LOG_DIR"
pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_numerics.xml $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "test_numerics.py"
@@ -30,6 +31,19 @@ python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops_with_use
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_attn_with_cp.xml $TE_PATH/tests/pytorch/fused_attn/test_fused_attn_with_cp.py || test_fail "test_fused_attn_with_cp.py"
python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_cast_master_weights_to_fp8.xml $TE_PATH/tests/pytorch/distributed/test_cast_master_weights_to_fp8.py || test_fail "test_cast_master_weights_to_fp8.py"
# debug tests
# Config with the dummy feature which prevents nvinspect from being disabled.
# Nvinspect will be disabled if no feature is active.
: ${NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE:=$TE_PATH/tests/pytorch/debug/test_configs/dummy_feature.yaml}
: ${NVTE_TEST_NVINSPECT_FEATURE_DIRS:=$TE_PATH/transformer_engine/debug/features}
pytest -v -s $TE_PATH/tests/pytorch/debug/test_distributed.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || test_fail "debug test_distributed.py"
# standard numerics tests with initialized debug
NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS pytest -v -s $TE_PATH/tests/pytorch/distributed/test_numerics.py || test_fail "debug test_numerics.py"
if [ "$RET" -ne 0 ]; then
echo "Error in the following test cases:$FAILED_CASES"
exit 1
......
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import pytest
def pytest_addoption(parser):
parser.addoption(
"--feature_dirs", nargs="+", action="store", default="", help="List of feature directories"
)
parser.addoption(
"--configs_dir",
action="store",
default="",
type=str,
help="Path to the directory with configs.",
)
@pytest.fixture
def feature_dirs(request):
return request.config.getoption("--feature_dirs")
@pytest.fixture
def configs_dir(request):
return request.config.getoption("--configs_dir")
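# Example invocation (illustrative only; paths follow the CI scripts in this PR):
#   pytest -v tests/pytorch/debug/test_api_features.py \
#       --feature_dirs=transformer_engine/debug/features \
#       --configs_dir=tests/pytorch/debug/test_configs/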
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import torch
from transformer_engine.pytorch.tensor.float8_tensor import Float8Tensor, Float8Quantizer
import nvdlfw_inspect.api as debug_api
try:
import transformer_engine
import transformer_engine_torch as tex
except (ImportError, ModuleNotFoundError):
print("Could not find TransformerEngine package.")
exit(1)
def test_transformer_engine_no_config(feature_dirs):
debug_api.initialize("", feature_dirs=feature_dirs)
try:
tensor = torch.rand(24, 2046).cuda()
# fp8_gemm_enabled - True by default
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
# modify_tensor_enabled - False by default
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
)
# inspect_tensor_enabled - False by default
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.1.attn.qkv", tensor_name="activation", iteration=0
)
# inspect_tensor_postquantize - False by default
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
)
finally:
debug_api.end_debug()
def test_disable_fp8_gemm(configs_dir, feature_dirs):
try:
debug_api.initialize(configs_dir + "disable_fp8_gemms.yaml", feature_dirs=feature_dirs)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
)
# caching
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
)
finally:
debug_api.end_debug()
def test_disable_fp8_layer(configs_dir, feature_dirs):
try:
debug_api.initialize(configs_dir + "disable_fp8_layer.yaml", feature_dirs=feature_dirs)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.mlp.fc1", gemm="fprop", iteration=0
)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.mlp.fc1", gemm="wgrad", iteration=0
)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="fprop", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="wgrad", iteration=0
)
assert not debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.attn.qkv", gemm="dgrad", iteration=0
)
finally:
debug_api.end_debug()
def test_per_tensor_scaling(configs_dir, feature_dirs):
try:
debug_api.initialize(configs_dir + "per_tensor_scaling.yaml", feature_dirs=feature_dirs)
tensor = torch.rand(24, 2046).cuda()
# check modify_tensor_enabled
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
)
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="weight", iteration=0
)
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="weight", iteration=0
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="wgrad", tensor_name="gradient", iteration=0
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="wgrad", tensor_name="activation", iteration=0
)
# check modify_tensor
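# (Note added for clarity: the two quantizers below act as the defaults handed to
# modify_tensor - E4M3 for the forward activation, E5M2 for the gradient - matching
# the usual FP8 training convention.)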
default_quantizer1 = Float8Quantizer(
scale=torch.tensor([1]).cuda(),
amax=torch.tensor([0]).cuda(),
fp8_dtype=tex.DType.kFloat8E4M3,
)
default_quantizer2 = Float8Quantizer(
scale=torch.tensor([1]).cuda(),
amax=torch.tensor([0]).cuda(),
fp8_dtype=tex.DType.kFloat8E5M2,
)
output1 = debug_api.transformer_engine.modify_tensor(
layer_name="decoder.1.mlp.fc1",
gemm="fprop",
tensor_name="activation",
default_quantizer=default_quantizer1,
iteration=0,
tensor=tensor,
)
assert type(output1) == Float8Tensor
assert output1._fp8_dtype == tex.DType.kFloat8E4M3
output2 = debug_api.transformer_engine.modify_tensor(
"decoder.1.mlp.fc1",
gemm="dgrad",
tensor=tensor,
tensor_name="gradient",
default_quantizer=default_quantizer2,
iteration=0,
)
assert type(output2) == Float8Tensor
assert output2._fp8_dtype == tex.DType.kFloat8E5M2
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1",
gemm="wgrad",
tensor_name="gradient",
iteration=0,
)
assert not debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc4",
gemm="fprop",
tensor_name="activation",
iteration=0,
)
finally:
debug_api.end_debug()
def test_fake_quant(configs_dir, feature_dirs):
try:
debug_api.initialize(
configs_dir + "fake_quantization_config.yaml", feature_dirs=feature_dirs
)
tensor = torch.rand(24, 2046).cuda()
# modify_tensor_enabled
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="fprop", tensor_name="activation", iteration=0
)
assert debug_api.transformer_engine.modify_tensor_enabled(
"decoder.1.mlp.fc1", gemm="dgrad", tensor_name="gradient", iteration=0
)
# modify_tensor
debug_api.transformer_engine.modify_tensor(
"decoder.1.mlp.fc1",
gemm="fprop",
tensor=tensor,
tensor_name="activation",
iteration=0,
default_quantizer=None,
)
debug_api.transformer_engine.modify_tensor(
"decoder.1.mlp.fc1",
gemm="dgrad",
tensor=tensor,
tensor_name="gradient",
iteration=0,
default_quantizer=None,
)
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.fc2", gemm="wgrad", iteration=0
)
# caching
assert debug_api.transformer_engine.fp8_gemm_enabled(
"decoder.1.fc2", gemm="wgrad", iteration=0
)
finally:
debug_api.end_debug()
def test_statistics_collection(configs_dir, feature_dirs):
try:
debug_api.initialize(
config_file=configs_dir + "stats_collection_test_config.yaml",
feature_dirs=feature_dirs,
default_logging_enabled=False,
)
tensor = torch.randn((100, 100, 5)).cuda()
tensor_fp8 = Float8Tensor(
data=tensor.to(torch.uint8).cuda(),
fp8_scale_inv=torch.full([1], 1.0).cuda(),
fp8_dtype=tex.DType.kFloat8E4M3,
shape=tensor.shape,
dtype=torch.float32,
)
def log():
from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS
return STATS_BUFFERS.log_stats()
def assert_empty():
stats = log()
assert len(stats) == 0
# TE tensor stats --
debug_api.transformer_engine.inspect_tensor(
"decoder.1.mlp.fc1",
tensor=tensor,
tensor_name="activation",
iteration=200,
tp_group=None,
)
stats = log()
assert stats[("decoder.1.mlp.fc1", "activation", "cur_amax", 200)] == tensor.abs().max()
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.1.mlp.fc1", tensor_name="activation", iteration=201
)
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.2.mlp.fc1", tensor_name="activation", iteration=200
)
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.1.mlp.fc1", tensor_name="gradient", iteration=200
)
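# (Note added for clarity: in the FP8 E4M3 encoding, byte 0 represents zero and byte 126
# is the largest finite value, so the two expressions below estimate the percentage of
# elements that underflowed to zero or saturated at the maximum.)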
expected_underflows = (tensor_fp8._data == 0).sum() * 100 / (100 * 100 * 5)
expected_overflows = (tensor_fp8._data == 126).sum() * 100 / (100 * 100 * 5)
# TE FP8 tensor stats --
assert debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.1.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
)
debug_api.transformer_engine.inspect_tensor_postquantize(
"decoder.1.mlp.fc1",
tensor=tensor_fp8,
tensor_name="gradient",
iteration=200,
rowwise=True,
tp_group=None,
)
stats = log()
torch.testing.assert_close(
stats[("decoder.1.mlp.fc1", "gradient", "underflows%", 200)], expected_underflows
)
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.1.mlp.fc1", tensor_name="activation", gemm="fprop", iteration=201
)
assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
"decoder.2.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
)
# Second config in same yaml
tensor = torch.rand((100, 100, 5))
debug_api.transformer_engine.inspect_tensor(
"decoder.6.mlp.fc1",
tensor=tensor,
tensor_name="activation",
iteration=200,
tp_group=None,
)
stats = log()
stats_names = [x[3] for x in stats.keys()]
assert all(s in stats_names for s in ["cur_amax", "dynamic_range", "mean", "std", "l1_norm"])
assert stats[("decoder.6.mlp.fc1", "activation", "mean", 200)] == tensor.mean()
debug_api.transformer_engine.inspect_tensor(
"decoder.7.mlp.fc1",
tensor=tensor,
tensor_name="weight",
iteration=200,
tp_group=None,
)
stats = log()
stats_names = [x[3] for x in stats.keys()]
assert all(s in stats_names for s in ["mean", "std", "l1_norm", "min", "max"])
assert stats[("decoder.7.mlp.fc1", "weight", "max", 200)] == tensor.max()
assert not debug_api.transformer_engine.inspect_tensor_enabled(
"decoder.7.mlp.fc1", tensor_name="weight", iteration=201
)
assert_empty()
finally:
debug_api.end_debug()
def test_statistics_multi_run(configs_dir, feature_dirs):
try:
debug_api.initialize(
config_file=configs_dir + "stats_collection_test_config.yaml",
feature_dirs=feature_dirs,
default_logging_enabled=False,
)
def feed(tensor, tensor_fp8):
debug_api.transformer_engine.inspect_tensor(
"decoder.5.mlp.fc1",
tensor=tensor,
tensor_name="activation",
iteration=1,
tp_group=None,
)
debug_api.transformer_engine.inspect_tensor_postquantize(
"decoder.5.mlp.fc1",
tensor=tensor_fp8,
tensor_name="activation",
iteration=1,
rowwise=True,
tp_group=None,
)
def log_stats():
from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS
return STATS_BUFFERS.log_stats()
def fp8_tensor(t):
return Float8Tensor(
data=t.to(torch.uint8).cuda(),
fp8_scale_inv=torch.ones([1]).cuda(),
fp8_dtype=tex.DType.kFloat8E4M3,
shape=t.shape,
dtype=torch.float32,
)
shape = [1024, 1024]
tensors = [torch.randn(shape) for _ in range(2)]
tensors_fp8 = [fp8_tensor(tensors[i]) for i in range(2)]
feed(tensors[0], tensors_fp8[0])
feed(tensors[1], tensors_fp8[1])
stats1 = log_stats()
tensor2 = torch.cat((tensors[0], tensors[1])).cuda()
fp8tensor2 = fp8_tensor(tensor2)
feed(tensor2, fp8tensor2)
stats2 = log_stats()
assert len(stats1.keys()) > 0
for k in stats1.keys():
torch.testing.assert_close(stats1[k], stats2[k])
finally:
debug_api.end_debug()
if __name__ == "__main__":
pass
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
import pathlib
from nvdlfw_inspect.config_manager import ConfigManager
import nvdlfw_inspect.api as debug_api
try:
import transformer_engine
from transformer_engine.debug.features.api import TEConfigAPIMapper
except (ImportError, ModuleNotFoundError):
print("Could not find TransformerEngine debug module.")
exit(1)
def test_transformer_engine_config_parsing(feature_dirs):
debug_api.initialize(
config_file=pathlib.Path(__file__).resolve().parent
/ "test_configs/tensor_manipulation_transformer_engine.yaml",
feature_dirs=feature_dirs,
log_dir="./log",
)
cfg_fc1 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc1")["transformer_engine"]
cfg_fc2 = ConfigManager.get_config_for_layer("decoder.1.mlp.fc2")["transformer_engine"]
assert cfg_fc1 and cfg_fc2
gemm_parsing = True
tensor_parsing = True
# Per tensor scaling set for dgrad, filter based on gemm
ret, _ = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="wgrad",
tensor_name="activation",
)
assert not ret
# per tensor scaling set for gradient, filter based on tensor name
ret, _ = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="activation",
)
assert not ret
ret, parsed_cfg_fc1 = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="gradient",
)
assert ret
assert parsed_cfg_fc1 == {"gemm": "dgrad", "tensor": "gradient"}
# Test tensor struct
ret, parsed_cfg_fc1_act = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="activation",
)
assert ret
ret, parsed_cfg_fc1_wei = TEConfigAPIMapper().parse_config_and_api(
cfg_fc1["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="weight",
)
assert ret
assert parsed_cfg_fc1_act == {
"gemm": "fprop",
"tensor": "activation",
"quant_format": "FP8E4M3",
}
assert parsed_cfg_fc1_wei == {
"gemm": "fprop",
"tensor": "weight",
"quant_format": "FP8E4M3",
}
# Test gemms struct
ret, parsed_cfg_fc2_grad = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="gradient",
)
assert ret
assert parsed_cfg_fc2_grad == {"gemm": "dgrad", "tensor": "gradient", "quant_format": "FP8E5M2"}
ret, parsed_cfg_fc2_wei = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["FakeQuant"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="dgrad",
tensor_name="weight",
)
assert ret
assert parsed_cfg_fc2_wei == {"gemm": "dgrad", "tensor": "weight", "quant_format": "FP8E5M2"}
# Test gemm + tensor struct
ret, parsed_cfg_fc2_fprop_act = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="activation",
)
assert ret
assert parsed_cfg_fc2_fprop_act == {"gemm": "fprop", "tensor": "activation"}
ret, parsed_cfg_fc2_fprop_wei = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="fprop",
tensor_name="weight",
)
assert ret
assert parsed_cfg_fc2_fprop_wei == {"gemm": "fprop", "tensor": "weight"}
ret, parsed_cfg_fc2_wgrad_act = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="wgrad",
tensor_name="activation",
)
assert ret
assert parsed_cfg_fc2_wgrad_act == {"gemm": "wgrad", "tensor": "activation"}
ret, parsed_cfg_fc2_wgrad_grad = TEConfigAPIMapper().parse_config_and_api(
cfg_fc2["PerTensorScaling"],
gemm_parsing=gemm_parsing,
tensor_parsing=tensor_parsing,
gemm="wgrad",
tensor_name="gradient",
)
assert ret
assert parsed_cfg_fc2_wgrad_grad == {"gemm": "wgrad", "tensor": "gradient"}
ConfigManager.reset()
test_disable_fp8_gemm_1:
enabled: True
layers:
layer_types: [qkv, fc2]
transformer_engine:
DisableFP8GEMM:
enabled: True
gemms: [dgrad, wgrad]
test_disable_fp8_layer:
enabled: True
layers:
layer_types: [qkv]
transformer_engine:
DisableFP8Layer:
enabled: True
dummy_feature_everywhere:
enabled: True
layers:
layer_name_regex_pattern: .*
transformer_engine:
TestDummyFeature:
enabled: True
tensors: [weight, activation, gradient, output, wgrad, dgrad]
gemms: [wgrad, dgrad, fprop]
test_fake_quant_fp8:
enabled: True
layers:
layer_numbers: [1]
layer_types: [fc1, fc2]
transformer_engine:
FakeQuant:
enabled: True
gemms: [fprop, dgrad]
tensors_struct:
- tensor: activation
quant_format: FP8E4M3
- tensor: gradient
quant_format: FP8E5M2
test_per_tensor_scaling:
enabled: True
layers:
layer_numbers: [1]
layer_types: [fc1, fc2]
transformer_engine:
DisableFP8GEMM:
enabled: True
gemms: [wgrad]
PerTensorScaling:
enabled: True
gemms_struct:
- gemm: fprop
tensors_struct:
- tensor: activation
- tensor: weight
- gemm: dgrad
tensors_struct:
- tensor: gradient
stat_collection_test_1:
enabled: True
layers:
layer_numbers: [1, 3]
LogTensorStats:
enabled: True
stats: [mean, std, l1_norm, l2_norm]
tensors: [activation]
freq: 1
start_step: 100
end_step: 500
transformer_engine:
LogTensorStats:
enabled: True
stats: [cur_amax, dynamic_range]
tensors: [activation]
freq: 2
start_step: 100
end_step: 500
LogFp8TensorStats:
enabled: True
stats: [underflows%]
tensors: [gradient]
freq: 5
start_step: 100
end_step: 500
stat_collection_test_2:
enabled: True
layers:
layer_numbers: [6, 7]
transformer_engine:
LogTensorStats:
enabled: True
tensors_struct:
- tensor: activation
stats: [cur_amax, dynamic_range, mean, std, l1_norm]
freq: 2
start_step: 100
end_step: 500
- tensor: weight
stats: [mean, std, l1_norm, min, max]
freq: 5
start_step: 100
end_step: 500
stat_collection_test_4:
enabled: True
layers:
layer_numbers: [5]
transformer_engine:
LogTensorStats:
enabled: True
tensors: [activation]
stats: [cur_amax, dynamic_range, mean, std, l1_norm]
LogFp8TensorStats:
enabled: True
stats: [underflows%]
tensors: [activation]
# This config is used when FP8 training is ON
transformer_engine_fc1_manipulation:
enabled: True
layers:
layer_name_regex_pattern: .*(fc1) # Select layers whose names end with fc1
transformer_engine: # namespace
DisableFP8GEMM: # Disable FP8 GEMM; fprop runs in high precision
enabled: True
gemms: [fprop]
PerTensorScaling: # Scale dgrad gradients using per-tensor current scaling and run the FP8 GEMM
enabled: True
gemms: [dgrad]
tensors: [gradient]
FakeQuant: # Disable FP8 GEMM for wgrad; fake-quantize the activations used by wgrad and run a high-precision GEMM
enabled: True
gemms: [fprop]
tensors_struct:
- tensor: activation
quant_format: FP8E4M3
- tensor: weight
quant_format: FP8E4M3
transformer_engine_fc2_manipulation:
enabled: True
layers:
layer_name_regex_pattern: .*(fc2) # Select layers whose names end with fc2
transformer_engine: # namespace
PerTensorScaling: # Scale wgrad and fprop inputs using per-tensor current scaling and run the FP8 GEMM
enabled: True
gemms_struct:
- gemm: fprop
tensors_struct:
- tensor: activation
- tensor: weight
- gemm: wgrad
tensors_struct:
- tensor: activation
- tensor: gradient
FakeQuant: # Disable FP8 GEMM for dgrad; fake-quantize the weights and gradients used by dgrad and run a high-precision GEMM
enabled: True
gemms_struct:
- gemm: dgrad
tensors: [weight, gradient]
quant_format: FP8E5M2
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
import subprocess
from pathlib import Path
import pytest
import torch
"""
Distributed numerics tests
These tests test the numerical corectness of the TransformerEngine layers.
Tests are parametrized by the layer and fp8 precision.
One test consists of running multiple configurations from file run_numerics.py
Such design is due to the fact the initialization of one test is long
- 2 processes need to start and load torch and TE. Multiple configurations
are run in one test - this reduces the initialization overhead.
"""
if torch.cuda.device_count() < 2:
pytest.skip("Distributed training needs at least 2 GPUs.")
TEST_ROOT = Path(__file__).parent.resolve()
NUM_PROCS: int = min(4, torch.cuda.device_count())
LAUNCH_CMD = ["torchrun", f"--nproc_per_node={NUM_PROCS}"]
def test_debug_distributed(feature_dirs):
test_path = TEST_ROOT / "run_distributed.py"
test_cmd = LAUNCH_CMD + [str(test_path), f"--feature_dirs={feature_dirs[0]}"]
result = subprocess.run(test_cmd, env=os.environ, capture_output=True, check=False)
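# check=False: a non-zero return code is surfaced below with the worker's stderr attached.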
if result.returncode != 0:
raise AssertionError(result.stderr.decode())
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import functools
import itertools
import os
import random
import tempfile
from string import Template
import pytest
import torch
import nvdlfw_inspect.api as debug_api
import transformer_engine.debug
import transformer_engine.pytorch as te
import transformer_engine_torch as tex
from transformer_engine.common.recipe import DelayedScaling, Format
from transformer_engine.pytorch.constants import TE_DType
from transformer_engine.pytorch.fp8 import _default_sf_compute
from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer
from test_numerics import create_config_file
B, S, H, D = 64, 64, 64, 64
model_keys = ["linear", "layernorm_linear", "layernorm_mlp", "mha_attention", "transformer_layer"]
configs = {
"": "",
"log": """log:
layers:
layer_types: [linear]
enabled: True
transformer_engine:
LogTensorStats:
enabled: True
tensors: [activation, gradient, weight, output, wgrad, dgrad]
stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range]
start_step: 0
end_step: 1
LogFp8TensorStats:
enabled: True
tensors: [activation, gradient, weight]
stats: [underflows, overflows]
start_step: 0
end_step: 1
""",
"fake_quant": """
fake_quant_config:
enabled: True
layers:
layer_types: [linear]
transformer_engine:
FakeQuant:
enabled: True
gemms: [fprop, dgrad, wgrad]
quant_format: FP8E5M2
""",
}
def _get_model(model_key):
if model_key == "linear":
return te.Linear(D, D)
if model_key == "layernorm_linear":
return te.LayerNormLinear(D, D)
if model_key == "layernorm_mlp":
return te.LayerNormMLP(D, D, D)
if model_key == "mha_attention":
return te.MultiheadAttention(D, H)
if model_key == "transformer_layer":
return te.TransformerLayer(D, D, H)
def _run_forward_backward(model, fp8):
for _ in range(3):
inp = torch.randn((S, B, H)).cuda()
with te.fp8_autocast(enabled=fp8):
out = model(inp)
out.sum().backward()
debug_api.step()
@create_config_file
def _run_test(model_key, fp8, config, feature_dirs, config_file, log_dir):
try:
if config != "":
config_file.write(config)
config_file.flush()
config_file_name = config_file.name if config != "" else ""
debug_api.initialize(feature_dirs=feature_dirs, config_file=config_file_name)
model = _get_model(model_key)
_run_forward_backward(model, fp8)
finally:
debug_api.end_debug()
@pytest.mark.parametrize("model_key", model_keys)
@pytest.mark.parametrize("fp8", [False, True])
@pytest.mark.parametrize("config_key", configs.keys())
def test_sanity_debug(model_key, fp8, config_key, feature_dirs):
_run_test(model_key, fp8, configs[config_key], feature_dirs)
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
LOG_FILE = os.path.join("nvdlfw_inspect_logs", "nvdlfw_inspect_globalrank-0.log")
def reset_debug_log():
if os.path.isfile(LOG_FILE):
# delete all content
with open(LOG_FILE, "w") as f:
pass
def check_debug_log(msg):
with open(LOG_FILE, "r") as f:
for line in f.readlines():
if msg in line:
return True
return False
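# Minimal usage sketch (hypothetical; the actual tests import these helpers):
#   reset_debug_log()
#   ... run a debug-enabled forward/backward pass ...
#   assert check_debug_log("some expected log message")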
@@ -34,6 +34,18 @@ NCCL_WORLD = None
LOSS_FN = nn.MSELoss()
QUANTIZATION = None
if os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False):
# The numerics of all the layers should be the same
# when debug=True. A dummy feature is fed to the layers
# to prevent debug from being switched off, which happens
# when no feature is active.
import nvdlfw_inspect.api as debug_api
debug_api.initialize(
os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"],
feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"],
)
# Disable TF32
torch.backends.cuda.matmul.allow_tf32 = False
......
@@ -102,6 +102,20 @@ all_normalizations = ["LayerNorm", "RMSNorm"]
mask_types = ["causal", "no_mask"]
NVTE_TEST_NVINSPECT_ENABLED = os.environ.get("NVTE_TEST_NVINSPECT_ENABLED", False)
if NVTE_TEST_NVINSPECT_ENABLED:
# The numerics of all the layers should be the same
# when debug=True. A dummy feature is fed to the layers
# to prevent debug from being switched off, which happens
# when no feature is active.
import nvdlfw_inspect.api as debug_api
debug_api.initialize(
os.environ["NVTE_TEST_NVINSPECT_CONFIG_FILE"],
feature_dirs=os.environ["NVTE_TEST_NVINSPECT_FEATURE_DIRS"],
)
fp8_recipes = [
recipe.MXFP8BlockScaling(),
recipe.DelayedScaling(),
@@ -568,6 +582,8 @@ def test_gpt_selective_activation_recompute(dtype, bs, model, fp8, recipe, fp8_m
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -682,6 +698,8 @@ def test_gpt_full_activation_recompute(
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -1726,6 +1744,8 @@ def test_grouped_linear_accuracy(
pytest.skip(reason_for_no_fp8)
if fp8 and recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if fp8 and recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -1924,6 +1944,8 @@ def test_padding_grouped_linear_accuracy(
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if fp8_model_params and NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
@@ -2039,6 +2061,8 @@ def _test_gpt_e2e_cuda_graph(block, bs, dtype, config, graph):
@pytest.mark.parametrize("bs", batch_sizes)
@pytest.mark.parametrize("model", ["126m"])
def test_gpt_cuda_graph(dtype, bs, model):
if NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("Cuda Graphs are not supported in debug mode.")
config = model_configs[model]
sigma = 0.023
@@ -2136,6 +2160,8 @@ def test_gpt_fp8_parameters(dtype, bs, model, recipe):
pytest.skip(reason_for_no_fp8)
if recipe.mxfp8() and not mxfp8_available:
pytest.skip(reason_for_no_mxfp8)
if NVTE_TEST_NVINSPECT_ENABLED:
pytest.skip("FP8 parameters are not supported in debug mode.")
if recipe.float8_block_scaling() and not fp8_block_scaling_available:
pytest.skip(reason_for_no_fp8_block_scaling)
......
@@ -12,7 +12,7 @@ from nvdlfw_inspect.registry import Registry
import torch
from transformer_engine.debug.features.utils.stats_buffer import STATS_BUFFERS
-from transformer_engine.pytorch.tensor import all_tensor_types
+from transformer_engine.pytorch.tensor import get_all_tensor_types
from transformer_engine.debug.pytorch.debug_state import TEDebugState
from transformer_engine.pytorch.tensor import Quantizer, QuantizedTensor
@@ -424,7 +424,7 @@ class TransformerEngineAPI(BaseNamespaceAPI):
if api_name in ["inspect_tensor", "inspect_tensor_postquantize"]:
assert ret is None
if api_name == "modify_tensor":
-assert type(ret) in all_tensor_types
+assert type(ret) in get_all_tensor_types()
if (
type(ret) == torch.Tensor # pylint: disable=unidiomatic-typecheck
and "dtype" in kwargs
@@ -438,4 +438,4 @@ class TransformerEngineAPI(BaseNamespaceAPI):
def end_debug(self):
"""This function is called by the nvidia-dlframework-inspect after every debug_api.end_debug()"""
-TEDebugState.reset()
+TEDebugState._reset()