Unverified Commit 1ae1d228 authored by Paweł Gadziński, committed by GitHub

[PyTorch Debug] Skip some of debug tests if FP8 is not available. (#1902)



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* fix
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* Update tests/pytorch/debug/test_distributed.py
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com>

---------
Signed-off-by: Pawel Gadzinski <pgadzinski@nvidia.com>
Signed-off-by: Paweł Gadziński <62263673+pggPL@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tim Moon <4406448+timmoon10@users.noreply.github.com>
parent 6f4310d7
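All of the test files touched below apply the same pattern: probe FP8 support once at import time with `FP8GlobalStateManager.is_fp8_available()`, then either skip FP8-only tests or fall back to higher precision. A minimal sketch of the skip half of the pattern follows; the test name and body are hypothetical, while the probe and `pytest.skip` calls are exactly the ones used in the diff:

```python
import pytest

from transformer_engine.pytorch.fp8 import FP8GlobalStateManager

# The probe returns a boolean plus, when FP8 is unsupported, a
# human-readable reason that can be passed straight to pytest.skip.
fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()


def test_fp8_only_feature():  # hypothetical test name
    if not fp8_available:
        pytest.skip(reason_for_no_fp8)
    # FP8-specific assertions would go here.
```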
...@@ -16,7 +16,7 @@ import transformer_engine ...@@ -16,7 +16,7 @@ import transformer_engine
import transformer_engine_torch as tex import transformer_engine_torch as tex
import nvdlfw_inspect.api as debug_api import nvdlfw_inspect.api as debug_api
from transformer_engine.debug import set_weight_tensor_tp_group_reduce from transformer_engine.debug import set_weight_tensor_tp_group_reduce
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
from test_numerics import ( from test_numerics import (
_emulate_linear, _emulate_linear,
@@ -45,6 +45,8 @@ FEATURE_DIRS = None
 all_boolean = [True, False]
 TEST_NR = 0
 
+fp8_available, _ = FP8GlobalStateManager.is_fp8_available()
+
 def _get_tensors(parallel_mode, weight_seed=SEED, data_seed=SEED, tp_size=None, tp_rank=None):
     if tp_size is None:
@@ -221,7 +223,7 @@ def run_debug_test(func):
     return wrapper
 
-CONFIG_LOG_TEST_DISTRIBUTED = """log_distributed:
+CONFIG_LOG_TEST_DISTRIBUTED_FP8 = """log_distributed:
   layers:
     layer_types: [linear]
   enabled:
@@ -241,11 +243,27 @@ CONFIG_LOG_TEST_DISTRIBUTED = """log_distributed:
       end_step: 1
 """
 
+CONFIG_LOG_TEST_DISTRIBUTED_NO_FP8 = """log_distributed:
+  layers:
+    layer_types: [linear]
+  enabled:
+    True
+  transformer_engine:
+    LogTensorStats:
+      enabled: True
+      tensors: [activation, gradient, weight, output, wgrad, dgrad]
+      stats: [min, max, mean, std, l1_norm, l2_norm, cur_amax, dynamic_range]
+      start_step : 0
+      end_step: 1
+"""
+
 def _prepare_config_test_log_distributed(config_file):
     if WORLD_RANK != 0:
         return
-    config_file.write(CONFIG_LOG_TEST_DISTRIBUTED)
+    config_file.write(
+        CONFIG_LOG_TEST_DISTRIBUTED_FP8 if fp8_available else CONFIG_LOG_TEST_DISTRIBUTED_NO_FP8
+    )
     config_file.flush()
@@ -361,13 +379,13 @@ def test_log_expert_parallel(**kwargs):
     )  # data parallel
     model = _init_model(weight, parallel_mode=None, name="linear1")
     model1 = _init_model(weight, parallel_mode=None, name="linear2")
-    with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+    with transformer_engine.pytorch.fp8_autocast(enabled=fp8_available, fp8_recipe=FP8_RECIPE):
         y1 = model(x)
         y2 = model1(x)
         y = y1 + y2
         y.sum().backward()
     debug_api.step()
-    with transformer_engine.pytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+    with transformer_engine.pytorch.fp8_autocast(enabled=fp8_available, fp8_recipe=FP8_RECIPE):
         y = model(x)
         if WORLD_RANK != 0:
             y = y + model1(x)
@@ -620,28 +638,29 @@ if __name__ == "__main__":
         for gather_weight in [True, False]:
             test_log_distributed(parallel_mode, gather_weight)
-    for parallel_mode in ["row", "column"]:
-        test_disable_fp8_layer(parallel_mode)
-    # test_disable_fp8_gemms
-    _run_test_with_combinations(
-        test_disable_fp8_gemms, all_boolean, num_repeat=3, extra_args=["column", "row"]
-    )
-    # test_fake_quant_fp8
-    dtype_options = [tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None]
-    _run_test_with_combinations(
-        test_fake_quant_fp8,
-        dtype_options,
-        num_repeat=6,
-        extra_args=["column", "row"],
-        sample_size=20,
-    )
-    _run_test_with_combinations(
-        test_per_tensor_scaling,
-        all_boolean,
-        num_repeat=6,
-        extra_args=["column"],
-        sample_size=20,
-    )
+    if fp8_available:
+        for parallel_mode in ["row", "column"]:
+            test_disable_fp8_layer(parallel_mode)
+        # test_disable_fp8_gemms
+        _run_test_with_combinations(
+            test_disable_fp8_gemms, all_boolean, num_repeat=3, extra_args=["column", "row"]
+        )
+        # test_fake_quant_fp8
+        dtype_options = [tex.DType.kFloat8E4M3, tex.DType.kFloat8E5M2, None]
+        _run_test_with_combinations(
+            test_fake_quant_fp8,
+            dtype_options,
+            num_repeat=6,
+            extra_args=["column", "row"],
+            sample_size=20,
+        )
+        _run_test_with_combinations(
+            test_per_tensor_scaling,
+            all_boolean,
+            num_repeat=6,
+            extra_args=["column"],
+            sample_size=20,
+        )
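The hunk above writes whichever logging config matches the hardware: the FP8 variant when FP8 is available, a generic-stats variant otherwise. A self-contained sketch of that selection follows; the YAML placeholders and the tempfile handling are illustrative, only the availability check and the write/flush pattern come from the diff:

```python
import tempfile

from transformer_engine.pytorch.fp8 import FP8GlobalStateManager

fp8_available, _ = FP8GlobalStateManager.is_fp8_available()

# Illustrative stand-ins for CONFIG_LOG_TEST_DISTRIBUTED_FP8 / _NO_FP8.
CONFIG_FP8 = "log_distributed:\n  enabled: True\n"
CONFIG_NO_FP8 = "log_distributed:\n  enabled: True\n"

with tempfile.NamedTemporaryFile("w", suffix=".yaml") as config_file:
    # In the real test only rank 0 writes the file; the selection itself
    # is a plain conditional on the availability probe.
    config_file.write(CONFIG_FP8 if fp8_available else CONFIG_NO_FP8)
    config_file.flush()  # make contents visible before anything reads the file
```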
@@ -5,7 +5,6 @@
 import os
 import subprocess
 from pathlib import Path
-
 import pytest
 import torch
@@ -21,7 +20,6 @@ import torch
     """
     if torch.cuda.device_count() < 2:
         pytest.skip("Distributed training needs at least 2 GPUs.")
-
@@ -27,6 +27,9 @@ from transformer_engine.pytorch.module.base import (
     _2X_ACC_FPROP,
     _2X_ACC_WGRAD,
 )
+from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
+
+fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
 
 all_boolean = [True, False]
 FP8_FORMAT = Format.HYBRID
@@ -246,8 +249,8 @@ def _init_model(weight):
     return model
 
-def _run_forward_backward(x, model, loss_scale=1.0, is_first_microbatch=None):
-    with tepytorch.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+def _run_forward_backward(x, model, loss_scale=1.0, is_first_microbatch=None, fp8=True):
+    with tepytorch.fp8_autocast(enabled=fp8, fp8_recipe=FP8_RECIPE):
         y = model(x, is_first_microbatch=is_first_microbatch)
         (y.sum() * loss_scale).backward()
     debug_api.step()
@@ -297,8 +300,8 @@ def run_logging_zero_numel_tensor(feature_dirs, **kwargs):
     x, weight = _get_tensors()
     x1 = x[:0, :]
     model = _init_model(weight)
-    _ = _run_forward_backward(x1, model)
-    _ = _run_forward_backward(x, model)
+    _ = _run_forward_backward(x1, model, fp8=False)
+    _ = _run_forward_backward(x, model, fp8=False)
 
 def test_logging_zero_numel_tensor(feature_dirs):
@@ -309,6 +312,8 @@ def test_logging_zero_numel_tensor(feature_dirs):
 @pytest.mark.parametrize("dgrad_fp8", all_boolean)
 @pytest.mark.parametrize("wgrad_fp8", all_boolean)
 def test_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8)
@@ -348,6 +353,8 @@ def run_disable_fp8_gemms(feature_dirs, fprop_fp8, dgrad_fp8, wgrad_fp8, **kwargs):
 
 def test_disable_fp8_layer(feature_dirs):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     run_disable_fp8_layer(feature_dirs)
@@ -393,6 +400,8 @@ subset_combinations = random.sample(all_combinations, 20)
 def test_per_tensor_scaling(
     feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
 ):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]):
         pytest.skip("Skipping test because all parameters are False")
     run_per_tensor_scaling(
@@ -565,6 +574,8 @@ def run_per_tensor_scaling(
 def test_microbatching_per_tensor_scaling(
     feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
 ):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     if not any([fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad]):
         pytest.skip("Skipping test because all parameters are False")
@@ -654,6 +665,8 @@ subset_combinations = random.sample(all_combinations, 10)
 def test_fake_quant_fp8(
     feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
 ):
+    if not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     run_fake_quant_fp8(
         feature_dirs, fprop_inp, fprop_weight, dgrad_weight, dgrad_grad, wgrad_input, wgrad_grad
     )
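In the hunks above, `_run_forward_backward` gains an `fp8` keyword so callers such as the zero-numel logging test can force non-FP8 execution and still run on GPUs without FP8 support. A hedged sketch of the same idea in isolation; the helper, model, and recipe here are illustrative, while `fp8_autocast(enabled=...)` is the real Transformer Engine entry point:

```python
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling

FP8_RECIPE = DelayedScaling()  # illustrative; the test module defines its own recipe


def run_forward_backward(x, model, loss_scale=1.0, fp8=True):
    # Mirrors the diff: enabled=fp8 turns the FP8 path into an opt-out,
    # so one helper serves both FP8 and non-FP8 test cases.
    with te.fp8_autocast(enabled=fp8, fp8_recipe=FP8_RECIPE):
        y = model(x)
        (y.sum() * loss_scale).backward()
    return y


model = te.Linear(64, 64).cuda()
x = torch.randn(128, 64, device="cuda")
run_forward_backward(x, model, fp8=False)  # runs on any GPU, FP8 or not
```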
@@ -2,27 +2,17 @@
 #
 # See LICENSE for license information.
 
-import functools
-import itertools
-import os
-import random
-import tempfile
-from string import Template
-
 import pytest
 import torch
 
 import nvdlfw_inspect.api as debug_api
-import transformer_engine.debug
 import transformer_engine.pytorch as te
-import transformer_engine_torch as tex
-from transformer_engine.common.recipe import DelayedScaling, Format
-from transformer_engine.pytorch.constants import TE_DType
-from transformer_engine.pytorch.fp8 import _default_sf_compute
-from transformer_engine.pytorch.tensor.float8_tensor import Float8Quantizer
+from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
 from test_numerics import create_config_file
 
+fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()
 
 B, S, H, D = 64, 64, 64, 64
 
 model_keys = ["linear", "layernorm_linear", "layernorm_mlp", "mha_attention", "transformer_layer"]
@@ -104,4 +94,6 @@ def _run_test(model_key, fp8, config, feature_dirs, config_file, log_dir):
 @pytest.mark.parametrize("fp8", [False, True])
 @pytest.mark.parametrize("config_key", configs.keys())
 def test_sanity_debug(model_key, fp8, config_key, feature_dirs):
+    if fp8 and not fp8_available:
+        pytest.skip(reason_for_no_fp8)
     _run_test(model_key, fp8, configs[config_key], feature_dirs)
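The sanity test keeps its full parameter matrix and skips only the FP8 half of it. A short sketch of that selective skip; the test name and body are placeholders:

```python
import pytest

from transformer_engine.pytorch.fp8 import FP8GlobalStateManager

fp8_available, reason_for_no_fp8 = FP8GlobalStateManager.is_fp8_available()


@pytest.mark.parametrize("fp8", [False, True])
def test_sanity(fp8):  # placeholder name for the parametrized pattern
    # Skip only when this parametrization actually needs FP8, so the
    # fp8=False cases still exercise the debug features on older GPUs.
    if fp8 and not fp8_available:
        pytest.skip(reason_for_no_fp8)
```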