"vscode:/vscode.git/clone" did not exist on "24cde76a152fbffde30fa2be0d08dcbad490530e"
test_numerics_exact.py 2.21 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

import os
import subprocess
from pathlib import Path

import pytest
import torch
11
import transformer_engine.pytorch as te
12
13
14
15
16
17
18
19
20
21
22
23
24
25

"""
    Distributed numerics tests

    This numerical test aims for zero tolerance test for absolute confidence in numerics.
    In the case of NVFP4, with the experimental NVFP4 quantization, we matched bitwise
    result with the native silicon. For distrbuted test cases, we can do the same by thing
    by comparing BF16 AG results with the low precision AG results at layer level.
"""


# Skip the entire module when fewer than 2 GPUs are present.
# allow_module_level=True is required: calling pytest.skip at import time
# without it raises a UsageError instead of skipping the module.
if torch.cuda.device_count() < 2:
    pytest.skip("Distributed training needs at least 2 GPUs.", allow_module_level=True)

26
27
28
29
# Probe which low-precision recipes this device supports; each call returns
# (available: bool, reason: str) so unsupported configs can be skipped with
# a human-readable message.
fp8_available, reason_for_no_fp8 = te.is_fp8_available(return_reason=True)
mxfp8_available, reason_for_no_mxfp8 = te.is_mxfp8_available(return_reason=True)
fp8_block_scaling_available, reason_for_no_fp8_block_scaling = te.is_fp8_block_scaling_available(
    return_reason=True
)
nvfp4_available, reason_for_no_nvfp4 = te.is_nvfp4_available(return_reason=True)
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

# Directory holding this file; the worker script lives next to it.
TEST_ROOT = Path(__file__).parent.resolve()
# Cap the launch at four ranks, or fewer if the machine has fewer GPUs.
NUM_PROCS: int = min(torch.cuda.device_count(), 4)
# Base torchrun invocation; each test appends its script path and arguments.
LAUNCH_CMD = ["torchrun", "--nproc_per_node=" + str(NUM_PROCS)]


def _run_test(quantization, batch_size, hidden_size, out_size):
    """Launch run_numerics_exact.py under torchrun and assert it succeeds.

    Args:
        quantization: recipe name forwarded to the worker (e.g. "nvfp4").
        batch_size: batch dimension forwarded to the worker.
        hidden_size: hidden dimension forwarded to the worker.
        out_size: output dimension forwarded to the worker.

    Raises:
        AssertionError: if the torchrun subprocess exits with a nonzero code.
    """
    test_path = TEST_ROOT / "run_numerics_exact.py"
    test_cmd = LAUNCH_CMD + [
        str(test_path),
        "--quantization", quantization,
        "--batch-size", str(batch_size),
        "--hidden-size", str(hidden_size),
        "--out-size", str(out_size),
    ]

    # check=False so we can attach a readable message instead of letting
    # subprocess raise CalledProcessError with no context.
    result = subprocess.run(test_cmd, env=os.environ, check=False)
    assert result.returncode == 0, (
        f"Distributed numerics test failed with exit code {result.returncode}: "
        + " ".join(test_cmd)
    )


all_boolean = [True, False]


@pytest.mark.parametrize("quantization", ["nvfp4"])
@pytest.mark.parametrize(
    "batch_size, hidden_size, out_size",
    [
        (64, 128, 128),
        (128, 128, 128),
        (128, 256, 256),
        (512, 1024, 768),
        (512, 256, 1024),
        (2048, 2048, 2048),
    ],
)
def test_distributed(quantization, batch_size, hidden_size, out_size):
    if quantization == "nvfp4" and not nvfp4_available:
        pytest.skip(reason_for_no_nvfp4)

    _run_test(quantization, batch_size, hidden_size, out_size)