Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
f8c2af4c
Commit
f8c2af4c
authored
May 21, 2025
by
yuguo
Browse files
Merge commit '
1d903f5e
' of...
Merge commit '
1d903f5e
' of
https://github.com/NVIDIA/TransformerEngine
parents
e92773a3
1d903f5e
Changes
211
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
154 additions
and
63 deletions
+154
-63
qa/L1_pytorch_mcore_integration/test.sh
qa/L1_pytorch_mcore_integration/test.sh
+1
-1
qa/L2_jax_unittest/test.sh
qa/L2_jax_unittest/test.sh
+2
-2
qa/L3_pytorch_FA_versions_test/test.sh
qa/L3_pytorch_FA_versions_test/test.sh
+5
-3
setup.py
setup.py
+12
-9
tests/cpp/operator/test_act.cu
tests/cpp/operator/test_act.cu
+8
-8
tests/cpp/operator/test_cast_dbias.cu
tests/cpp/operator/test_cast_dbias.cu
+1
-1
tests/cpp/operator/test_cast_dbias_dgelu.cu
tests/cpp/operator/test_cast_dbias_dgelu.cu
+1
-1
tests/cpp/operator/test_cast_float8blockwise.cu
tests/cpp/operator/test_cast_float8blockwise.cu
+2
-2
tests/cpp/operator/test_cast_mxfp8.cu
tests/cpp/operator/test_cast_mxfp8.cu
+2
-2
tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu
tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu
+4
-4
tests/cpp/operator/test_cast_transpose.cu
tests/cpp/operator/test_cast_transpose.cu
+2
-2
tests/cpp/operator/test_cast_transpose_current_scaling.cu
tests/cpp/operator/test_cast_transpose_current_scaling.cu
+2
-2
tests/cpp/operator/test_cast_transpose_dbias.cu
tests/cpp/operator/test_cast_transpose_dbias.cu
+3
-3
tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
+4
-4
tests/cpp/operator/test_cast_transpose_dgeglu.cu
tests/cpp/operator/test_cast_transpose_dgeglu.cu
+3
-3
tests/cpp/operator/test_causal_softmax.cu
tests/cpp/operator/test_causal_softmax.cu
+5
-5
tests/cpp/operator/test_dequantize_mxfp8.cu
tests/cpp/operator/test_dequantize_mxfp8.cu
+7
-7
tests/cpp/operator/test_memset.cu
tests/cpp/operator/test_memset.cu
+86
-0
tests/cpp/operator/test_multi_cast_transpose.cu
tests/cpp/operator/test_multi_cast_transpose.cu
+2
-2
tests/cpp/operator/test_multi_padding.cu
tests/cpp/operator/test_multi_padding.cu
+2
-2
No files found.
qa/L1_pytorch_mcore_integration/test.sh
View file @
f8c2af4c
...
...
@@ -17,7 +17,7 @@ fi
# Download Megatron-LM if needed
if
[
!
-d
"
${
MCORE_PATH
}
"
]
;
then
pushd
$(
dirname
${
MCORE_PATH
}
)
git clone
-b
core_r0.
9
.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
git clone
-b
core_r0.
12
.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
popd
fi
...
...
qa/L2_jax_unittest/test.sh
View file @
f8c2af4c
...
...
@@ -20,6 +20,7 @@ FAILED_CASES=""
pip3
install
"nltk>=3.8.2"
||
error_exit
"Failed to install nltk"
pip3
install
pytest
==
8.2.1
||
error_exit
"Failed to install pytest"
:
${
TE_PATH
:
=/opt/transformerengine
}
:
${
XML_LOG_DIR
:
=/logs
}
mkdir
-p
"
$XML_LOG_DIR
"
...
...
@@ -30,10 +31,9 @@ python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/py
NVTE_JAX_UNITTEST_LEVEL
=
"L2"
NVTE_CUSTOM_CALLS_RE
=
""
python3
-m
pytest
-c
$TE_PATH
/tests/jax/pytest.ini
-v
--junitxml
=
$XML_LOG_DIR
/pytest_test_custom_call_compute.xml
$TE_PATH
/tests/jax/test_custom_call_compute.py
||
test_fail
"test_custom_call_compute.py"
pip3
install
-r
$TE_PATH
/examples/jax/mnist/requirements.txt
||
error_exit
"Failed to install mnist requirements"
pip3
install
-r
$TE_PATH
/examples/jax/encoder/requirements.txt
||
error_exit
"Failed to install encoder requirements"
python3
-m
pytest
-c
$TE_PATH
/tests/jax/pytest.ini
-v
--junitxml
=
$XML_LOG_DIR
/pytest_mnist.xml
$TE_PATH
/examples/jax/mnist
||
test_fail
"mnist"
pip3
install
-r
$TE_PATH
/examples/jax/encoder/requirements.txt
||
error_exit
"Failed to install encoder requirements"
# Make encoder tests run-to-run deterministic so that CI results are stable
export
XLA_FLAGS
=
"
${
XLA_FLAGS
}
--xla_gpu_deterministic_ops"
python3
-m
pytest
-c
$TE_PATH
/tests/jax/pytest.ini
-v
--junitxml
=
$XML_LOG_DIR
/pytest_test_single_gpu_encoder.xml
$TE_PATH
/examples/jax/encoder/test_single_gpu_encoder.py
||
test_fail
"test_single_gpu_encoder.py"
...
...
qa/L3_pytorch_FA_versions_test/test.sh
View file @
f8c2af4c
...
...
@@ -11,15 +11,17 @@ mkdir -p "$XML_LOG_DIR"
pip3
install
pytest
==
8.2.1
# Limit parallel build jobs to avoid overwhelming system resources
export
MAX_JOBS
=
4
export
MAX_JOBS
=
32
# Iterate over Flash Attention versions
sm_arch
=
`
python3
-c
"import torch; sm = torch.cuda.get_device_capability(0); print(sm[0]*10+sm[1])"
`
export
FLASH_ATTN_CUDA_ARCHS
=
$sm_arch
if
[
$sm_arch
-gt
90
]
then
FA_versions
=(
2.7.3
)
else
FA_versions
=(
2.3.0 2.4.1 2.5.7 2.7.3 3.0.0b1
)
elif
[
$sm_arch
-eq
90
]
then
FA_versions
=(
2.5.7 2.7.3 3.0.0b1
)
fi
for
fa_version
in
"
${
FA_versions
[@]
}
"
...
...
setup.py
View file @
f8c2af4c
...
...
@@ -7,7 +7,6 @@
# NVTE_FRAMEWORK=pytorch NVTE_USE_ROCM=1 NVTE_USE_HIPBLASLT=1 NVTE_USE_ROCBLAS=1 CMAKE_PREFIX_PATH=/opt/dtk/lib/cmake/amd_comgr/ MPI_HOME=/opt/mpi/ NVTE_UB_WITH_MPI=1 CXX=hipcc PYTHONPATH=/home/TransformerEngine/3rdparty/hipify_torch:$PYTHONPATH python3 setup.py bdist_wheel
import
os
import
sys
import
time
from
pathlib
import
Path
from
typing
import
List
,
Tuple
...
...
@@ -26,7 +25,6 @@ from build_tools.utils import (
get_frameworks
,
install_and_import
,
remove_dups
,
uninstall_te_wheel_packages
,
)
frameworks
=
get_frameworks
()
...
...
@@ -111,7 +109,15 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
"""
# Common requirements
setup_reqs
:
List
[
str
]
=
[]
setup_reqs
:
List
[
str
]
=
[
"nvidia-cuda-runtime-cu12"
,
"nvidia-cublas-cu12"
,
"nvidia-cudnn-cu12"
,
"nvidia-cuda-cccl-cu12"
,
"nvidia-cuda-nvcc-cu12"
,
"nvidia-nvtx-cu12"
,
"nvidia-cuda-nvrtc-cu12"
,
]
install_reqs
:
List
[
str
]
=
[
"pydantic"
,
"importlib-metadata>=1.0"
,
...
...
@@ -130,6 +136,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
# Framework-specific requirements
if
not
bool
(
int
(
os
.
getenv
(
"NVTE_RELEASE_BUILD"
,
"0"
))):
if
"pytorch"
in
frameworks
:
setup_reqs
.
extend
([
"torch>=2.1"
])
install_reqs
.
extend
([
"torch>=2.1"
])
# install_reqs.append(
# "nvdlfw-inspect @"
...
...
@@ -137,8 +144,9 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
# )
# Blackwell is not supported as of Triton 3.2.0, need custom internal build
# install_reqs.append("triton")
test_reqs
.
extend
([
"numpy"
,
"torchvision"
,
"prettytable"
,
"PyYAML"
])
test_reqs
.
extend
([
"numpy"
,
"torchvision"
])
if
"jax"
in
frameworks
:
setup_reqs
.
extend
([
"jax[cuda12]"
,
"flax>=0.7.1"
])
install_reqs
.
extend
([
"jax"
,
"flax>=0.7.1"
])
test_reqs
.
extend
([
"numpy"
])
...
...
@@ -157,7 +165,6 @@ if __name__ == "__main__":
int
(
os
.
getenv
(
"NVTE_RELEASE_BUILD"
,
"0"
))
),
"NVTE_RELEASE_BUILD env must be set for metapackage build."
ext_modules
=
[]
cmdclass
=
{}
package_data
=
{}
include_package_data
=
False
setup_requires
=
[]
...
...
@@ -169,15 +176,11 @@ if __name__ == "__main__":
else
:
setup_requires
,
install_requires
,
test_requires
=
setup_requirements
()
ext_modules
=
[
setup_common_extension
()]
cmdclass
=
{
"build_ext"
:
CMakeBuildExtension
,
"bdist_wheel"
:
TimedBdist
}
package_data
=
{
""
:
[
"VERSION.txt"
]}
include_package_data
=
True
extras_require
=
{
"test"
:
test_requires
}
if
not
bool
(
int
(
os
.
getenv
(
"NVTE_RELEASE_BUILD"
,
"0"
))):
# Remove residual FW packages since compiling from source
# results in a single binary with FW extensions included.
uninstall_te_wheel_packages
()
if
"pytorch"
in
frameworks
:
from
build_tools.pytorch
import
setup_pytorch_extension
...
...
tests/cpp/operator/test_act.cu
View file @
f8c2af4c
...
...
@@ -116,10 +116,10 @@ void performTest(const size_t N, const size_t H) {
DType
itype
=
TypeInfo
<
IType
>::
dtype
;
DType
otype
=
TypeInfo
<
OType
>::
dtype
;
Tensor
input
(
"input"
,
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
{
N
,
H
},
otype
);
Tensor
igrad
(
"igrad"
,
{
N
,
H
},
itype
);
Tensor
ograd
(
"ograd"
,
{
N
,
H
},
itype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
N
,
H
},
otype
);
Tensor
igrad
(
"igrad"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
ograd
(
"ograd"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
fillUniform
(
&
input
);
fillUniform
(
&
ograd
);
...
...
@@ -171,10 +171,10 @@ void performTestGLU(const size_t N, const size_t H) {
DType
itype
=
TypeInfo
<
IType
>::
dtype
;
DType
otype
=
TypeInfo
<
OType
>::
dtype
;
Tensor
input
(
"input"
,
{
N
,
H
*
2
},
itype
);
Tensor
output
(
"output"
,
{
N
,
H
},
otype
);
Tensor
igrad
(
"igrad"
,
{
N
,
H
*
2
},
itype
);
Tensor
ograd
(
"ograd"
,
{
N
,
H
},
itype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
N
,
H
*
2
},
itype
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
N
,
H
},
otype
);
Tensor
igrad
(
"igrad"
,
std
::
vector
<
size_t
>
{
N
,
H
*
2
},
itype
);
Tensor
ograd
(
"ograd"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
fillUniform
(
&
input
);
fillUniform
(
&
ograd
);
...
...
tests/cpp/operator/test_cast_dbias.cu
View file @
f8c2af4c
...
...
@@ -70,7 +70,7 @@ void performTest(const std::vector<size_t>& shape) {
Tensor
output_c
(
"output_c"
,
shape
,
otype
);
// dbias has the same data type as "output grad"
Tensor
dbias
(
"dbias"
,
{
H
},
itype
);
Tensor
dbias
(
"dbias"
,
std
::
vector
<
size_t
>
{
H
},
itype
);
fillUniform
(
&
input
);
setRandomScale
(
&
output_c
);
...
...
tests/cpp/operator/test_cast_dbias_dgelu.cu
View file @
f8c2af4c
...
...
@@ -79,7 +79,7 @@ void performTest(const std::vector<size_t>& shape) {
Tensor
output_c
(
"output_c"
,
shape
,
otype
);
// dbias has the same data type as "output grad"
Tensor
dbias
(
"dbias"
,
{
H
},
itype
);
Tensor
dbias
(
"dbias"
,
std
::
vector
<
size_t
>
{
H
},
itype
);
fillUniform
(
&
input
);
fillUniform
(
&
grad
);
...
...
tests/cpp/operator/test_cast_float8blockwise.cu
View file @
f8c2af4c
...
...
@@ -280,7 +280,7 @@ void runTestCase(const ProcessingMethod processing_method, const std::vector<siz
Tensor
grad
(
"grad"
,
shape
,
itype
);
Tensor
output_c
(
"output_c"
,
shape
,
otype
,
rowwise
,
colwise
,
opts
.
block_scaling_dim
==
2
?
NVTE_BLOCK_SCALING_2D
:
NVTE_BLOCK_SCALING_1D
);
Tensor
output_dbias
(
"output_dbias"
,
{
cols
},
itype
);
Tensor
output_dbias
(
"output_dbias"
,
std
::
vector
<
size_t
>
{
cols
},
itype
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_t
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
...
...
@@ -355,7 +355,7 @@ void runTestCaseOneDimensionalBlocks(const ProcessingMethod processing_method,
Tensor
grad
(
"grad"
,
shape
,
itype
);
Tensor
output_c
(
"output_c"
,
shape
,
otype
,
rowwise
,
colwise
,
opts
.
block_scaling_dim
==
2
?
NVTE_BLOCK_SCALING_2D
:
NVTE_BLOCK_SCALING_1D
);
Tensor
output_dbias
(
"output_dbias"
,
{
cols
},
itype
);
Tensor
output_dbias
(
"output_dbias"
,
std
::
vector
<
size_t
>
{
cols
},
itype
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_t
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
...
...
tests/cpp/operator/test_cast_mxfp8.cu
View file @
f8c2af4c
...
...
@@ -230,7 +230,7 @@ void performTest_x1(const ProcessingMethod processing_method,
Tensor
input
(
"input"
,
shape
,
itype
);
Tensor
grad
(
"grad"
,
shape
,
itype
);
Tensor
output_c
(
"output_c"
,
shape
,
otype
,
rowwise
,
colwise
,
NVTE_MXFP8_1D_SCALING
);
Tensor
output_dbias
(
"output_dbias"
,
{
cols
},
itype
);
Tensor
output_dbias
(
"output_dbias"
,
std
::
vector
<
size_t
>
{
cols
},
itype
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_c
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
std
::
unique_ptr
<
InputType
[]
>
ref_output_dbias
=
std
::
make_unique
<
InputType
[]
>
(
cols
);
...
...
@@ -368,7 +368,7 @@ void performTest_x2(const ProcessingMethod processing_method,
Tensor
input
(
"input"
,
shape
,
itype
);
Tensor
grad
(
"grad"
,
shape
,
itype
);
Tensor
output
(
"output"
,
shape
,
otype
,
true
,
true
,
NVTE_MXFP8_1D_SCALING
);
Tensor
output_dbias
(
"output_dbias"
,
{
cols
},
itype
);
Tensor
output_dbias
(
"output_dbias"
,
std
::
vector
<
size_t
>
{
cols
},
itype
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_c_rowwise
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_c_colwise
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
...
...
tests/cpp/operator/test_cast_mxfp8_gated_swiglu.cu
View file @
f8c2af4c
...
...
@@ -204,8 +204,8 @@ void performTest_x1(const size_t rows,
// std::cout << "blocks_X: " << blocks_X << std::endl;
// std::cout << "scales_stride: " << scales_stride << std::endl;
Tensor
grad
(
"grad"
,
{
rows
,
cols
},
itype
);
Tensor
input
(
"input"
,
{
rows
,
cols
*
2
},
itype
);
Tensor
grad
(
"grad"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
itype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
rows
,
cols
*
2
},
itype
);
const
size_t
output_cols
=
(
IS_DGATED
?
2
:
1
)
*
cols
;
...
...
@@ -289,8 +289,8 @@ void performTest_x2(const size_t rows,
DType
itype
=
TypeInfo
<
IType
>::
dtype
;
DType
otype
=
TypeInfo
<
OType
>::
dtype
;
Tensor
grad
(
"grad"
,
{
rows
,
cols
},
itype
);
Tensor
input
(
"input"
,
{
rows
,
cols
*
2
},
itype
);
Tensor
grad
(
"grad"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
itype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
rows
,
cols
*
2
},
itype
);
const
size_t
output_cols
=
(
IS_DGATED
?
2
:
1
)
*
cols
;
...
...
tests/cpp/operator/test_cast_transpose.cu
View file @
f8c2af4c
...
...
@@ -47,8 +47,8 @@ void performTest(const size_t N, const size_t H) {
DType
itype
=
TypeInfo
<
InputType
>::
dtype
;
DType
otype
=
TypeInfo
<
OutputType
>::
dtype
;
Tensor
input
(
"input"
,
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
{
N
,
H
},
otype
,
true
,
true
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
N
,
H
},
otype
,
true
,
true
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_c
=
std
::
make_unique
<
OutputType
[]
>
(
N
*
H
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_t
=
std
::
make_unique
<
OutputType
[]
>
(
N
*
H
);
...
...
tests/cpp/operator/test_cast_transpose_current_scaling.cu
View file @
f8c2af4c
...
...
@@ -112,8 +112,8 @@ void performTest(const size_t N, const size_t H) {
}
}
Tensor
input
(
"input"
,
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
{
N
,
H
},
otype
,
true
,
true
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
N
,
H
},
otype
,
true
,
true
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_c
=
std
::
make_unique
<
OutputType
[]
>
(
N
*
H
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_t
=
std
::
make_unique
<
OutputType
[]
>
(
N
*
H
);
...
...
tests/cpp/operator/test_cast_transpose_dbias.cu
View file @
f8c2af4c
...
...
@@ -65,11 +65,11 @@ void performTest(const size_t N, const size_t H) {
DType
itype
=
TypeInfo
<
IType
>::
dtype
;
DType
otype
=
TypeInfo
<
OType
>::
dtype
;
Tensor
input
(
"input"
,
{
N
,
H
},
itype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
{
N
,
H
},
otype
,
true
,
true
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
N
,
H
},
otype
,
true
,
true
);
// dbias has the same data type as "output grad"
Tensor
dbias
(
"dbias"
,
{
H
},
itype
);
Tensor
dbias
(
"dbias"
,
std
::
vector
<
size_t
>
{
H
},
itype
);
fillUniform
(
&
input
);
setRandomScale
(
&
output
);
...
...
tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu
View file @
f8c2af4c
...
...
@@ -76,12 +76,12 @@ void performTest(const size_t N, const size_t H) {
DType
itype
=
TypeInfo
<
IType
>::
dtype
;
DType
otype
=
TypeInfo
<
OType
>::
dtype
;
Tensor
input
(
"input"
,
{
N
,
H
},
itype
);
Tensor
gelu_input
(
"gelu_input"
,
{
N
,
H
},
itype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
gelu_input
(
"gelu_input"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
output
(
"output"
,
{
N
,
H
},
otype
,
true
,
true
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
N
,
H
},
otype
,
true
,
true
);
// dbias has the same data type as "output grad"
Tensor
dbias
(
"dbias"
,
{
H
},
itype
);
Tensor
dbias
(
"dbias"
,
std
::
vector
<
size_t
>
{
H
},
itype
);
fillUniform
(
&
input
);
fillUniform
(
&
gelu_input
);
...
...
tests/cpp/operator/test_cast_transpose_dgeglu.cu
View file @
f8c2af4c
...
...
@@ -74,9 +74,9 @@ void performTest(const size_t N, const size_t H) {
DType
itype
=
TypeInfo
<
IType
>::
dtype
;
DType
otype
=
TypeInfo
<
OType
>::
dtype
;
Tensor
grad
(
"grad"
,
{
N
,
H
},
itype
);
Tensor
input
(
"input"
,
{
N
,
H
*
2
},
itype
);
Tensor
output
(
"output"
,
{
N
,
H
*
2
},
otype
,
true
,
true
);
Tensor
grad
(
"grad"
,
std
::
vector
<
size_t
>
{
N
,
H
},
itype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
N
,
H
*
2
},
itype
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
N
,
H
*
2
},
otype
,
true
,
true
);
fillUniform
(
&
grad
);
fillUniform
(
&
input
);
...
...
tests/cpp/operator/test_causal_softmax.cu
View file @
f8c2af4c
...
...
@@ -153,11 +153,11 @@ void performTest(
DType
itype
=
TypeInfo
<
Type
>::
dtype
;
Tensor
data_in
(
"data_in"
,
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
softmax_out
(
"softmax_out"
,
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
softmax_in
(
"softmax_in"
,
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
grads_in
(
"grads_in"
,
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
grads_out
(
"grads_out"
,
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
data_in
(
"data_in"
,
std
::
vector
<
size_t
>
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
softmax_out
(
"softmax_out"
,
std
::
vector
<
size_t
>
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
softmax_in
(
"softmax_in"
,
std
::
vector
<
size_t
>
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
grads_in
(
"grads_in"
,
std
::
vector
<
size_t
>
{
batches
,
heads
,
rows
,
cols
},
itype
);
Tensor
grads_out
(
"grads_out"
,
std
::
vector
<
size_t
>
{
batches
,
heads
,
rows
,
cols
},
itype
);
const
size_t
elements_total
=
batches
*
heads
*
rows
*
cols
;
std
::
unique_ptr
<
Type
[]
>
softmax_out_ref
=
std
::
make_unique
<
Type
[]
>
(
elements_total
);
...
...
tests/cpp/operator/test_dequantize_mxfp8.cu
View file @
f8c2af4c
...
...
@@ -214,10 +214,10 @@ void performTest_x1(const size_t rows,
const
size_t
blocks_num
=
rowwise
?
blocks_num_rowwise
:
blocks_num_colwise
;
const
size_t
scales_stride
=
rowwise
?
blocks_X_rowwise
:
blocks_X_colwise
;
Tensor
input
(
"input"
,
{
rows
,
cols
},
itype
,
rowwise
,
colwise
,
NVTE_MXFP8_1D_SCALING
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
itype
,
rowwise
,
colwise
,
NVTE_MXFP8_1D_SCALING
);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor
output
(
"output"
,
{
rows
,
cols
},
otype
,
true
,
false
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
otype
,
true
,
false
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
std
::
unique_ptr
<
fp8e8m0
[]
>
scales
=
std
::
make_unique
<
fp8e8m0
[]
>
(
blocks_num
);
...
...
@@ -267,11 +267,11 @@ void performTest_quantize_then_dequantize(const size_t rows,
// input --> quantized --> output (dequantized)
// input == output
Tensor
input
(
"input"
,
{
rows
,
cols
},
in_type
);
Tensor
quantized
(
"quantized"
,
{
rows
,
cols
},
intermed_type
,
rowwise
,
colwise
,
NVTE_MXFP8_1D_SCALING
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
in_type
);
Tensor
quantized
(
"quantized"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
intermed_type
,
rowwise
,
colwise
,
NVTE_MXFP8_1D_SCALING
);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor
output
(
"output"
,
{
rows
,
cols
},
out_type
,
true
,
false
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
out_type
,
true
,
false
);
// fillCase<EncodingType>(&input, InputsFillCase::minNorm_to_maxNorm);
fillCase
<
EncodingType
>
(
&
input
,
InputsFillCase
::
uniform
);
...
...
@@ -333,8 +333,8 @@ void performTest_x2(const size_t rows,
const
size_t
blocks_num_rowwise
=
blocks_Y_rowwise
*
blocks_X_rowwise
;
const
size_t
blocks_num_colwise
=
blocks_Y_colwise
*
blocks_X_colwise
;
Tensor
input
(
"input"
,
{
rows
,
cols
},
itype
,
true
,
true
,
NVTE_MXFP8_1D_SCALING
);
Tensor
output
(
"output"
,
{
rows
,
cols
},
otype
);
Tensor
input
(
"input"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
itype
,
true
,
true
,
NVTE_MXFP8_1D_SCALING
);
Tensor
output
(
"output"
,
std
::
vector
<
size_t
>
{
rows
,
cols
},
otype
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_rowwise
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
std
::
unique_ptr
<
OutputType
[]
>
ref_output_colwise
=
std
::
make_unique
<
OutputType
[]
>
(
rows
*
cols
);
...
...
tests/cpp/operator/test_memset.cu
0 → 100644
View file @
f8c2af4c
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include <cmath>
#include <cstring>
#include <memory>
#include <iomanip>
#include <iostream>
#include <random>
#include <type_traits>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <transformer_engine/transformer_engine.h>
#include "../test_common.h"
using
namespace
transformer_engine
;
// Value-parameterized fixture for the memset test below.
// Each parameter is a tuple of (int fill value, buffer size in bytes).
class MemsetTestSuite
    : public ::testing::TestWithParam<std::tuple<int, size_t>> {};
// Verifies that nvte_memset writes the requested byte value into every byte
// of a device buffer of the parameterized size.
TEST_P(MemsetTestSuite, TestMemset) {
  using namespace transformer_engine;
  using namespace test;

  const int fill_value = std::get<0>(GetParam());
  const size_t num_bytes = std::get<1>(GetParam());

  // Seed the host buffer with a byte that differs from the memset value, so a
  // memset that silently does nothing cannot pass the check below.
  std::vector<uint8_t> host_bytes(num_bytes, static_cast<uint8_t>(fill_value + 1));

  // Upload the seed pattern to a freshly allocated device buffer.
  char* device_bytes;
  NVTE_CHECK_CUDA(cudaMalloc(&device_bytes, num_bytes));
  NVTE_CHECK_CUDA(
      cudaMemcpy(device_bytes, host_bytes.data(), num_bytes, cudaMemcpyHostToDevice));

  nvte_memset(device_bytes, fill_value, num_bytes, 0 /* stream */);

  // Read the result back into the same host buffer, then release the device memory.
  NVTE_CHECK_CUDA(
      cudaMemcpy(host_bytes.data(), device_bytes, num_bytes, cudaMemcpyDeviceToHost));
  NVTE_CHECK_CUDA(cudaFree(device_bytes));
  NVTE_CHECK_CUDA(cudaDeviceSynchronize());

  // Every byte must now equal the fill value truncated to a single byte.
  const uint8_t expected_byte = static_cast<uint8_t>(fill_value);
  for (size_t idx = 0; idx < num_bytes; ++idx) {
    EXPECT_EQ(host_bytes[idx], expected_byte)
        << "Mismatch at index " << idx << ": expected " << fill_value << ", got "
        << static_cast<int>(host_bytes[idx]);
  }
}
namespace {

// Buffer sizes, in bytes, that the memset test suite is instantiated with.
std::vector<size_t> memset_test_sizes = {
    1, 4, 9, 16, 128, 4096, 4097, 8192,
};

}  // namespace
// Instantiates the suite for every combination of fill value (0 or 6) and
// buffer size from memset_test_sizes. Test names are "<value>X<size>".
INSTANTIATE_TEST_SUITE_P(
    OperatorTest,
    MemsetTestSuite,
    ::testing::Combine(::testing::Values(0, 6),
                       ::testing::ValuesIn(memset_test_sizes)),
    [](const testing::TestParamInfo<MemsetTestSuite::ParamType>& info) {
      const auto& param = info.param;
      return std::to_string(std::get<0>(param)) + "X" +
             std::to_string(std::get<1>(param));
    });
tests/cpp/operator/test_multi_cast_transpose.cu
View file @
f8c2af4c
...
...
@@ -81,9 +81,9 @@ void performTest() {
for
(
size_t
tensor_id
=
0
;
tensor_id
<
num_tensors
;
++
tensor_id
)
{
const
size_t
height
=
tensor_dims
[
tensor_id
].
first
;
const
size_t
width
=
tensor_dims
[
tensor_id
].
second
;
input_list
.
emplace_back
(
Tensor
(
"input_"
+
std
::
to_string
(
tensor_id
),
{
height
,
width
},
itype
));
input_list
.
emplace_back
(
Tensor
(
"input_"
+
std
::
to_string
(
tensor_id
),
std
::
vector
<
size_t
>
{
height
,
width
},
itype
));
output_list
.
emplace_back
(
Tensor
(
"output_"
+
std
::
to_string
(
tensor_id
),
{
height
,
width
},
otype
,
true
,
true
));
std
::
vector
<
size_t
>
{
height
,
width
},
otype
,
true
,
true
));
auto
&
input
=
input_list
.
back
();
auto
&
output
=
output_list
.
back
();
...
...
tests/cpp/operator/test_multi_padding.cu
View file @
f8c2af4c
...
...
@@ -85,8 +85,8 @@ void performTest() {
const
size_t
height
=
tensor_dims
[
tensor_id
].
first
;
const
size_t
width
=
tensor_dims
[
tensor_id
].
second
;
const
size_t
padded_height
=
(
height
+
align
-
1
)
/
align
*
align
;
input_list
.
emplace_back
(
Tensor
(
"input_"
+
std
::
to_string
(
tensor_id
),
{
height
,
width
},
itype
));
output_list
.
emplace_back
(
Tensor
(
"output_"
+
std
::
to_string
(
tensor_id
),
{
padded_height
,
width
},
otype
));
input_list
.
emplace_back
(
Tensor
(
"input_"
+
std
::
to_string
(
tensor_id
),
std
::
vector
<
size_t
>
{
height
,
width
},
itype
));
output_list
.
emplace_back
(
Tensor
(
"output_"
+
std
::
to_string
(
tensor_id
),
std
::
vector
<
size_t
>
{
padded_height
,
width
},
otype
));
auto
&
input
=
input_list
.
back
();
auto
&
output
=
output_list
.
back
();
...
...
Prev
1
2
3
4
5
6
…
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment