OpenDAS / TransformerEngine · Commit 62550505
Authored Aug 25, 2025 by wenjh · Parent: 11864d3d

Fix some test problems in the PyTorch unit tests

Showing 6 changed files with 37 additions and 180 deletions:
qa/L0_pytorch_unittest/test.sh                         +0   −1
tests/cpp/test_common.h                                +2   −0
tests/pytorch/test_float8_blockwise_scaling_exact.py   +25  −2
tests/pytorch/test_float8blockwisetensor.py            +5   −1
tests/pytorch/test_int8_blockwise_layers.py (deleted)  +0   −175
tests/pytorch/test_numerics.py                         +5   −1
qa/L0_pytorch_unittest/test.sh (+0 −1)

```diff
@@ -40,7 +40,6 @@ NVTE_INT8_SIM_FP8=1 NVTE_INT8_SIM_FP8_TENSORWISE=1 python3 -m pytest -v -s test_
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_scaling_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_scaling_exact.py || test_fail "test_float8_blockwise_scaling_exact.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise_gemm_exact.xml $TE_PATH/tests/pytorch/test_float8_blockwise_gemm_exact.py || test_fail "test_float8_blockwise_gemm_exact.py"
 python3 $TE_PATH/tests/pytorch/test_int8_blockwise_gemm_exact.py
-NVTE_INT8_SIM_FP8=1 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_int8_blockwise_layers.xml $TE_PATH/tests/pytorch/test_int8_blockwise_layers.py || test_fail "test_int8_blockwise_layers.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
 python3 -m pytest -v -s --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
```
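The deleted line is the invocation of tests/pytorch/test_int8_blockwise_layers.py, which this commit removes entirely (see the deleted file below). For readers unfamiliar with the script's convention: each `... || test_fail "<name>"` records a failing file without aborting the run. A minimal Python sketch of that run-and-collect pattern, assuming `run_pytest` and the `failures` list as illustrative stand-ins (they are not part of the repository):

```python
import os
import subprocess
import sys

TE_PATH = os.environ.get("TE_PATH", ".")
XML_LOG_DIR = os.environ.get("XML_LOG_DIR", ".")

failures = []  # stands in for the script's test_fail bookkeeping (assumed)


def run_pytest(test_file, extra_env=None):
    """Run one pytest file with a JUnit XML report; remember it on failure."""
    cmd = [
        sys.executable, "-m", "pytest", "-v", "-s",
        "--junitxml={}/pytest_{}.xml".format(XML_LOG_DIR, test_file.replace(".py", "")),
        os.path.join(TE_PATH, "tests/pytorch", test_file),
    ]
    env = dict(os.environ, **(extra_env or {}))
    if subprocess.run(cmd, env=env).returncode != 0:
        failures.append(test_file)


run_pytest("test_gqa.py")
run_pytest("test_multi_tensor.py")
if failures:
    sys.exit("failed: " + ", ".join(failures))
```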
tests/cpp/test_common.h (+2 −0)

```diff
@@ -10,7 +10,9 @@
 #include <vector>
 #include <array>
 #include <random>
+#ifndef __HIP_PLATFORM_AMD__
 #include <cudaTypedefs.h>
+#endif
 #define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
 #include <cuda_runtime_api.h>
```
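An editorial note on this change: cudaTypedefs.h ships with the CUDA toolkit and is not available in a ROCm environment, so the new `#ifndef __HIP_PLATFORM_AMD__` guard keeps the header compilable under HIP. The Python tests in this commit make the equivalent platform distinction via `IS_HIP_EXTENSION`; a small sketch of how that flag behaves:

```python
# How the Python tests in this commit detect a ROCm/HIP build of PyTorch.
import torch
from torch.utils.cpp_extension import IS_HIP_EXTENSION  # True on ROCm builds

# torch.version.hip is a version string on ROCm builds and None on CUDA builds.
print("HIP build:", IS_HIP_EXTENSION, "| torch.version.hip =", torch.version.hip)
```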
tests/pytorch/test_float8_blockwise_scaling_exact.py (+25 −2)

```diff
@@ -25,6 +25,7 @@ from test_float8_current_scaling_exact import (
     TestFP8RecipeLinearBase,
     TestFP8RecipeLayerNormLinearBase,
 )
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
 
 # read env variable NVTE_TEST_FLOAT8_BLOCK_SCALING_EXACT_TENSOR_DUMP_DIR to override the default tensor dump directory
 TENSOR_DUMP_DIR = pathlib.Path(__file__).resolve().parent.parent.parent / "tensor_dumps"
@@ -529,6 +530,11 @@ class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):
         dtype,
         use_bias=True,
     ):
+        if IS_HIP_EXTENSION:
+            import importlib
+            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
+            os.environ["NVTE_INT8_SIM_FP8"] = "1"
+            importlib.reload(te.pytorch.fp8)
         fp8_zero_tolerance_tensor_dumps_recipe2 = None
         # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
         # if we cannot get all four tensors, then still set the tensor dump to None
@@ -554,6 +560,12 @@ class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):
             recipe1_golden_tensors=None,
             recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
         )
+        if IS_HIP_EXTENSION:
+            if ori_int8_sim_fp8 is not None:
+                os.environ["NVTE_INT8_SIM_FP8"] = ori_int8_sim_fp8
+            else:
+                del os.environ["NVTE_INT8_SIM_FP8"]
+            importlib.reload(te.pytorch.fp8)
 
 @pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
@@ -589,6 +601,11 @@ class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase)
         dtype,
         use_bias=True,
     ):
+        if IS_HIP_EXTENSION:
+            import importlib
+            ori_int8_sim_fp8 = os.environ.get("NVTE_INT8_SIM_FP8", None)
+            os.environ["NVTE_INT8_SIM_FP8"] = "1"
+            importlib.reload(te.pytorch.fp8)
         fp8_zero_tolerance_tensor_dumps_recipe2 = None
         # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
         # if we cannot get all four tensors, then still set the tensor dump to None
@@ -612,11 +629,17 @@ class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase)
             use_bias,
             seed=torch.initial_seed(),
             dtype=dtype,
-            y_error=0.5,
+            y_error=0.5 if not IS_HIP_EXTENSION else 0.9,
             ln_out_error=0.5,
-            dgrad_error=1.6,
+            dgrad_error=1.6 if not IS_HIP_EXTENSION else 1.0,
             wgrad_error=1,
             bgrad_error=0.5,
             recipe1_golden_tensors=None,
             recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
         )
+        if IS_HIP_EXTENSION:
+            if ori_int8_sim_fp8 is not None:
+                os.environ["NVTE_INT8_SIM_FP8"] = ori_int8_sim_fp8
+            else:
+                del os.environ["NVTE_INT8_SIM_FP8"]
+            importlib.reload(te.pytorch.fp8)
```
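Both test classes repeat the same save/set/reload/restore dance around NVTE_INT8_SIM_FP8. A minimal sketch (not part of the commit) of that pattern as a reusable context manager, mirroring the imports the test file uses; the premise that te.pytorch.fp8 reads the variable at import time is an inference from the diff's reload calls, not something the commit states:

```python
import contextlib
import importlib
import os

import transformer_engine as te


@contextlib.contextmanager
def int8_sim_fp8_enabled():
    """Temporarily set NVTE_INT8_SIM_FP8=1, reloading te.pytorch.fp8.

    The reload appears necessary because flipping the environment variable
    alone would not affect a module that read it at import time (inferred).
    """
    original = os.environ.get("NVTE_INT8_SIM_FP8", None)
    os.environ["NVTE_INT8_SIM_FP8"] = "1"
    importlib.reload(te.pytorch.fp8)
    try:
        yield
    finally:
        if original is not None:
            os.environ["NVTE_INT8_SIM_FP8"] = original
        else:
            del os.environ["NVTE_INT8_SIM_FP8"]
        importlib.reload(te.pytorch.fp8)
```

With such a helper, each HIP branch of the two test bodies could collapse to a single `with int8_sim_fp8_enabled(): ...` block instead of duplicating the setup and teardown code.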
tests/pytorch/test_float8blockwisetensor.py (+5 −1)

```diff
@@ -17,6 +17,7 @@ from transformer_engine.pytorch.tensor.float8_blockwise_tensor import (
     Float8BlockwiseQTensor,
 )
 from transformer_engine.pytorch.utils import get_device_compute_capability
+from torch.utils.cpp_extension import IS_HIP_EXTENSION
 import transformer_engine_torch as tex
 
 # PyTorch tensor dtypes
@@ -43,7 +44,10 @@ def _to_list(x: Union[Iterable, Any]) -> List:
 DimsType = Union[Iterable[int], int]
 # TODO replace with call to fp8.py when recipe added.
-recipe_available = get_device_compute_capability() >= (9, 0) and float(torch.version.cuda) >= 12.8
+if IS_HIP_EXTENSION:
+    recipe_available = get_device_compute_capability() >= (9, 0)
+else:
+    recipe_available = get_device_compute_capability() >= (9, 0) and float(torch.version.cuda) >= 12.8
 reason_for_no_recipe = "Quantize kernels require TMA and are only relevant with GEMMS."
```
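Why the branch is needed (editorial note): on a ROCm build, `torch.version.cuda` is `None`, so the unconditional `float(torch.version.cuda) >= 12.8` check would raise a `TypeError` at import time, before any test could be skipped. A sketch of the gating logic, with `blockwise_recipe_available` as a hypothetical helper name:

```python
import torch
from torch.utils.cpp_extension import IS_HIP_EXTENSION


def blockwise_recipe_available(compute_capability):
    """Hypothetical helper mirroring the diff's availability check."""
    if IS_HIP_EXTENSION:
        # torch.version.cuda is None on ROCm; gate on compute capability alone.
        return compute_capability >= (9, 0)
    return compute_capability >= (9, 0) and float(torch.version.cuda) >= 12.8
```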
tests/pytorch/test_int8_blockwise_layers.py (deleted, 100644 → 0, −175)

The entire file is removed. Its contents at parent 11864d3d:

```python
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

from typing import Tuple
import math
import os
import pathlib

import pytest
import torch

import transformer_engine as te
import transformer_engine_torch as tex
from transformer_engine.pytorch.fp8 import FP8GlobalStateManager
from transformer_engine.common.recipe import Float8BlockScaling
from transformer_engine.pytorch.constants import TE_DType
from transformer_engine.pytorch.tensor.float8_blockwise_tensor import (
    Float8BlockQuantizer,
    Float8BlockwiseQTensor,
)
from references.blockwise_quantizer_reference import (
    BlockwiseQuantizerReference,
    QuantizeResult,
)
from test_float8_current_scaling_exact import (
    TestFP8RecipeLinearBase,
    TestFP8RecipeLayerNormLinearBase,
)
import logging

# read env variable NVTE_TEST_FLOAT8_BLOCK_SCALING_EXACT_TENSOR_DUMP_DIR to override the default tensor dump directory
TENSOR_DUMP_DIR = pathlib.Path(__file__).resolve().parent.parent.parent / "tensor_dumps"
tensor_dump_dir_env = os.getenv("NVTE_TEST_BLOCK_CURRENT_SCALING_EXACT_TENSOR_DUMP_DIR")
if tensor_dump_dir_env is not None:
    TENSOR_DUMP_DIR = pathlib.Path(tensor_dump_dir_env)

recipe_available, reason_for_no_recipe = FP8GlobalStateManager.is_fp8_block_scaling_available()


class GetRecipes:

    @staticmethod
    def none():
        return None

    @staticmethod
    def fp8_blockwise():
        # return default configs
        return Float8BlockScaling()


# FP8 per tensor current scaling
@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLinear(TestFP8RecipeLinearBase):

    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
        # if we cannot get all four tensors, then still set the tensor dump to None
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR, recipe2, (batch_size, hidden_size, out_size), dtype, use_bias
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        assert recipe1 == GetRecipes.none, "Only None recipe is supported for recipe1"
        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )


@pytest.mark.skipif(not recipe_available, reason=reason_for_no_recipe)
class TestFP8BlockScalingRecipeLayerNormLinear(TestFP8RecipeLayerNormLinearBase):

    @staticmethod
    def setup_class(cls) -> None:
        # Configure RNG
        seed = 1234
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)

    @pytest.mark.parametrize(
        "batch_size, hidden_size, out_size",
        [
            (16, 256, 128),
        ],
    )
    @pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"])
    @pytest.mark.parametrize(
        "recipe1, recipe2",
        [
            (GetRecipes.none, GetRecipes.fp8_blockwise),
        ],
    )
    def test_fp8_current_scaling_with_layernorm_linear_module(
        self,
        recipe1,
        recipe2,
        batch_size,
        hidden_size,
        out_size,
        dtype,
        use_bias=False,
    ):
        fp8_zero_tolerance_tensor_dumps_recipe2 = None
        # check tensor dumps dir, if the dir exists, then read files to get y, dgrad, wgrad, bgrad
        # if we cannot get all four tensors, then still set the tensor dump to None
        tensor_map = self._check_golden_tensor_dumps(
            TENSOR_DUMP_DIR,
            recipe2,
            (batch_size, hidden_size, out_size),
            dtype,
            use_bias,
            "LayerNorm",
        )
        if tensor_map is not None:
            fp8_zero_tolerance_tensor_dumps_recipe2 = tensor_map

        self.compare_recipe(
            recipe1,
            recipe2,
            batch_size,
            hidden_size,
            out_size,
            use_bias,
            seed=torch.initial_seed(),
            dtype=dtype,
            y_error=0.9,
            ln_out_error=0.5,
            dgrad_error=1,
            wgrad_error=1,
            bgrad_error=0.5,
            recipe1_golden_tensors=None,
            recipe2_golden_tensors=fp8_zero_tolerance_tensor_dumps_recipe2,
        )
```
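With this file deleted, its coverage does not simply disappear: the FP8 block-scaling tests above now set NVTE_INT8_SIM_FP8=1 themselves on HIP builds, and the test.sh line that ran this file under that variable is removed in the same commit. (This reading is inferred from the diffs; the commit message does not state it explicitly.)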
tests/pytorch/test_numerics.py (+5 −1)

```diff
@@ -2515,6 +2515,7 @@ def test_grouped_gemm(shape, dtype, layout, accumulate):
     # Force the sequential_linear and grouped_linear to use hipblaslt rather than hipblas
     if IS_HIP_EXTENSION:
+        ori_force_rocm_gemm = os.environ.get("NVTE_FORCE_ROCM_GEMM", None)
         os.environ["NVTE_FORCE_ROCM_GEMM"] = "1"
     for i in range(z):
@@ -2544,7 +2545,10 @@ def test_grouped_gemm(shape, dtype, layout, accumulate):
         single_output=single_output,
     )
     if IS_HIP_EXTENSION:
-        os.environ["NVTE_FORCE_ROCM_GEMM"] = "0"
+        if ori_force_rocm_gemm is not None:
+            os.environ["NVTE_FORCE_ROCM_GEMM"] = ori_force_rocm_gemm
+        else:
+            del os.environ["NVTE_FORCE_ROCM_GEMM"]
     # should be bit-wise match
     for o, o_ref in zip(out, out_ref):
```
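The fix here is behavioral, not cosmetic: the old cleanup forced NVTE_FORCE_ROCM_GEMM to "0", clobbering whatever value the caller had exported, while the new code restores the exact prior state, including unsetting the variable if it was absent. The standard library offers the same semantics; a sketch (the test itself restores manually, as the diff shows, and `run_grouped_gemm_case` is a hypothetical stand-in for the test body):

```python
import os
from unittest import mock


def run_grouped_gemm_case():
    # hypothetical stand-in for the test body that needs hipBLASLt forced on
    print("NVTE_FORCE_ROCM_GEMM =", os.environ.get("NVTE_FORCE_ROCM_GEMM"))


# mock.patch.dict snapshots os.environ and restores it on exit: a pre-existing
# value comes back, and a key that did not exist before is removed again.
with mock.patch.dict(os.environ, {"NVTE_FORCE_ROCM_GEMM": "1"}):
    run_grouped_gemm_case()
print("after:", os.environ.get("NVTE_FORCE_ROCM_GEMM"))
```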