"...git@developer.sourcefind.cn:kecinstone/2024-pra-vllm.git" did not exist on "cf21a9bd5cd29b8b52a8dfceac22798ef648e6bc"
Commit f8c2af4c authored by yuguo's avatar yuguo
Browse files

Merge commit '1d903f5e' of...

Merge commit '1d903f5e' of https://github.com/NVIDIA/TransformerEngine
parents e92773a3 1d903f5e
......@@ -17,7 +17,7 @@ fi
# Download Megatron-LM if needed
if [ ! -d "${MCORE_PATH}" ]; then
pushd $(dirname ${MCORE_PATH})
git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
git clone -b core_r0.12.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
popd
fi
......
......@@ -20,6 +20,7 @@ FAILED_CASES=""
pip3 install "nltk>=3.8.2" || error_exit "Failed to install nltk"
pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
: ${TE_PATH:=/opt/transformerengine}
: ${XML_LOG_DIR:=/logs}
mkdir -p "$XML_LOG_DIR"
......@@ -30,10 +31,9 @@ python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/py
NVTE_JAX_UNITTEST_LEVEL="L2" NVTE_CUSTOM_CALLS_RE="" python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_test_custom_call_compute.xml $TE_PATH/tests/jax/test_custom_call_compute.py || test_fail "test_custom_call_compute.py"
pip3 install -r $TE_PATH/examples/jax/mnist/requirements.txt || error_exit "Failed to install mnist requirements"
pip3 install -r $TE_PATH/examples/jax/encoder/requirements.txt || error_exit "Failed to install encoder requirements"
python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_mnist.xml $TE_PATH/examples/jax/mnist || test_fail "mnist"
pip3 install -r $TE_PATH/examples/jax/encoder/requirements.txt || error_exit "Failed to install encoder requirements"
# Make encoder tests to have run-to-run deterministic to have the stable CI results
export XLA_FLAGS="${XLA_FLAGS} --xla_gpu_deterministic_ops"
python3 -m pytest -c $TE_PATH/tests/jax/pytest.ini -v --junitxml=$XML_LOG_DIR/pytest_test_single_gpu_encoder.xml $TE_PATH/examples/jax/encoder/test_single_gpu_encoder.py || test_fail "test_single_gpu_encoder.py"
......
......@@ -11,15 +11,17 @@ mkdir -p "$XML_LOG_DIR"
pip3 install pytest==8.2.1
# Limit parallel build jobs to avoid overwhelming system resources
export MAX_JOBS=4
export MAX_JOBS=32
# Iterate over Flash Attention versions
sm_arch=`python3 -c "import torch; sm = torch.cuda.get_device_capability(0); print(sm[0]*10+sm[1])"`
export FLASH_ATTN_CUDA_ARCHS=$sm_arch
if [ $sm_arch -gt 90 ]
then
FA_versions=(2.7.3)
else
FA_versions=(2.3.0 2.4.1 2.5.7 2.7.3 3.0.0b1)
elif [ $sm_arch -eq 90 ]
then
FA_versions=(2.5.7 2.7.3 3.0.0b1)
fi
for fa_version in "${FA_versions[@]}"
......
......@@ -7,7 +7,6 @@
# NVTE_FRAMEWORK=pytorch NVTE_USE_ROCM=1 NVTE_USE_HIPBLASLT=1 NVTE_USE_ROCBLAS=1 CMAKE_PREFIX_PATH=/opt/dtk/lib/cmake/amd_comgr/ MPI_HOME=/opt/mpi/ NVTE_UB_WITH_MPI=1 CXX=hipcc PYTHONPATH=/home/TransformerEngine/3rdparty/hipify_torch:$PYTHONPATH python3 setup.py bdist_wheel
import os
import sys
import time
from pathlib import Path
from typing import List, Tuple
......@@ -26,7 +25,6 @@ from build_tools.utils import (
get_frameworks,
install_and_import,
remove_dups,
uninstall_te_wheel_packages,
)
frameworks = get_frameworks()
......@@ -111,7 +109,15 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
"""
# Common requirements
setup_reqs: List[str] = []
setup_reqs: List[str] = [
"nvidia-cuda-runtime-cu12",
"nvidia-cublas-cu12",
"nvidia-cudnn-cu12",
"nvidia-cuda-cccl-cu12",
"nvidia-cuda-nvcc-cu12",
"nvidia-nvtx-cu12",
"nvidia-cuda-nvrtc-cu12",
]
install_reqs: List[str] = [
"pydantic",
"importlib-metadata>=1.0",
......@@ -130,6 +136,7 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
# Framework-specific requirements
if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
if "pytorch" in frameworks:
setup_reqs.extend(["torch>=2.1"])
install_reqs.extend(["torch>=2.1"])
# install_reqs.append(
# "nvdlfw-inspect @"
......@@ -137,8 +144,9 @@ def setup_requirements() -> Tuple[List[str], List[str], List[str]]:
# )
# Blackwell is not supported as of Triton 3.2.0, need custom internal build
# install_reqs.append("triton")
test_reqs.extend(["numpy", "torchvision", "prettytable", "PyYAML"])
test_reqs.extend(["numpy", "torchvision"])
if "jax" in frameworks:
setup_reqs.extend(["jax[cuda12]", "flax>=0.7.1"])
install_reqs.extend(["jax", "flax>=0.7.1"])
test_reqs.extend(["numpy"])
......@@ -157,7 +165,6 @@ if __name__ == "__main__":
int(os.getenv("NVTE_RELEASE_BUILD", "0"))
), "NVTE_RELEASE_BUILD env must be set for metapackage build."
ext_modules = []
cmdclass = {}
package_data = {}
include_package_data = False
setup_requires = []
......@@ -169,15 +176,11 @@ if __name__ == "__main__":
else:
setup_requires, install_requires, test_requires = setup_requirements()
ext_modules = [setup_common_extension()]
cmdclass = {"build_ext": CMakeBuildExtension, "bdist_wheel": TimedBdist}
package_data = {"": ["VERSION.txt"]}
include_package_data = True
extras_require = {"test": test_requires}
if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
# Remove residual FW packages since compiling from source
# results in a single binary with FW extensions included.
uninstall_te_wheel_packages()
if "pytorch" in frameworks:
from build_tools.pytorch import setup_pytorch_extension
......
......@@ -116,10 +116,10 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", { N, H }, itype);
Tensor output("output", { N, H }, otype);
Tensor igrad("igrad", { N, H }, itype);
Tensor ograd("ograd", { N, H }, itype);
Tensor input("input", std::vector<size_t>{ N, H }, itype);
Tensor output("output", std::vector<size_t>{ N, H }, otype);
Tensor igrad("igrad", std::vector<size_t>{ N, H }, itype);
Tensor ograd("ograd", std::vector<size_t>{ N, H }, itype);
fillUniform(&input);
fillUniform(&ograd);
......@@ -171,10 +171,10 @@ void performTestGLU(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", {N, H * 2}, itype);
Tensor output("output", {N, H}, otype);
Tensor igrad("igrad", { N, H * 2 }, itype);
Tensor ograd("ograd", { N, H }, itype);
Tensor input("input", std::vector<size_t>{N, H * 2}, itype);
Tensor output("output", std::vector<size_t>{N, H}, otype);
Tensor igrad("igrad", std::vector<size_t>{ N, H * 2 }, itype);
Tensor ograd("ograd", std::vector<size_t>{ N, H }, itype);
fillUniform(&input);
fillUniform(&ograd);
......
......@@ -70,7 +70,7 @@ void performTest(const std::vector<size_t>& shape) {
Tensor output_c("output_c", shape, otype);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
setRandomScale(&output_c);
......
......@@ -79,7 +79,7 @@ void performTest(const std::vector<size_t>& shape) {
Tensor output_c("output_c", shape, otype);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
fillUniform(&grad);
......
......@@ -280,7 +280,7 @@ void runTestCase(const ProcessingMethod processing_method, const std::vector<siz
Tensor grad("grad", shape, itype);
Tensor output_c("output_c", shape, otype, rowwise, colwise,
opts.block_scaling_dim == 2 ? NVTE_BLOCK_SCALING_2D : NVTE_BLOCK_SCALING_1D);
Tensor output_dbias("output_dbias", {cols}, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{cols}, itype);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(rows * cols);
......@@ -355,7 +355,7 @@ void runTestCaseOneDimensionalBlocks(const ProcessingMethod processing_method,
Tensor grad("grad", shape, itype);
Tensor output_c("output_c", shape, otype, rowwise, colwise,
opts.block_scaling_dim == 2 ? NVTE_BLOCK_SCALING_2D : NVTE_BLOCK_SCALING_1D);
Tensor output_dbias("output_dbias", {cols}, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{cols}, itype);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(rows * cols);
......
......@@ -230,7 +230,7 @@ void performTest_x1(const ProcessingMethod processing_method,
Tensor input("input", shape, itype);
Tensor grad("grad", shape, itype);
Tensor output_c("output_c", shape, otype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
Tensor output_dbias("output_dbias", { cols }, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{ cols }, itype);
std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<InputType[]> ref_output_dbias = std::make_unique<InputType[]>(cols);
......@@ -368,7 +368,7 @@ void performTest_x2(const ProcessingMethod processing_method,
Tensor input("input", shape, itype);
Tensor grad("grad", shape, itype);
Tensor output("output", shape, otype, true, true, NVTE_MXFP8_1D_SCALING);
Tensor output_dbias("output_dbias", { cols }, itype);
Tensor output_dbias("output_dbias", std::vector<size_t>{ cols }, itype);
std::unique_ptr<OutputType[]> ref_output_c_rowwise = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_c_colwise = std::make_unique<OutputType[]>(rows * cols);
......
......@@ -204,8 +204,8 @@ void performTest_x1(const size_t rows,
// std::cout << "blocks_X: " << blocks_X << std::endl;
// std::cout << "scales_stride: " << scales_stride << std::endl;
Tensor grad("grad", { rows, cols }, itype);
Tensor input("input", { rows, cols * 2 }, itype);
Tensor grad("grad", std::vector<size_t>{ rows, cols }, itype);
Tensor input("input", std::vector<size_t>{ rows, cols * 2 }, itype);
const size_t output_cols = (IS_DGATED ? 2 : 1) * cols;
......@@ -289,8 +289,8 @@ void performTest_x2(const size_t rows,
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor grad("grad", { rows, cols }, itype);
Tensor input("input", { rows, cols * 2 }, itype);
Tensor grad("grad", std::vector<size_t>{ rows, cols }, itype);
Tensor input("input", std::vector<size_t>{ rows, cols * 2 }, itype);
const size_t output_cols = (IS_DGATED ? 2 : 1) * cols;
......
......@@ -47,8 +47,8 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<InputType>::dtype;
DType otype = TypeInfo<OutputType>::dtype;
Tensor input("input", { N, H }, itype);
Tensor output("output", { N, H }, otype, true, true);
Tensor input("input", std::vector<size_t>{ N, H }, itype);
Tensor output("output", std::vector<size_t>{ N, H }, otype, true, true);
std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(N * H);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(N * H);
......
......@@ -112,8 +112,8 @@ void performTest(const size_t N, const size_t H) {
}
}
Tensor input("input", { N, H }, itype);
Tensor output("output", { N, H }, otype, true, true);
Tensor input("input", std::vector<size_t>{ N, H }, itype);
Tensor output("output", std::vector<size_t>{ N, H }, otype, true, true);
std::unique_ptr<OutputType[]> ref_output_c = std::make_unique<OutputType[]>(N * H);
std::unique_ptr<OutputType[]> ref_output_t = std::make_unique<OutputType[]>(N * H);
......
......@@ -65,11 +65,11 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", {N, H}, itype);
Tensor input("input", std::vector<size_t>{N, H}, itype);
Tensor output("output", {N, H}, otype, true, true);
Tensor output("output", std::vector<size_t>{N, H}, otype, true, true);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
setRandomScale(&output);
......
......@@ -76,12 +76,12 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor input("input", {N, H}, itype);
Tensor gelu_input("gelu_input", {N, H}, itype);
Tensor input("input", std::vector<size_t>{N, H}, itype);
Tensor gelu_input("gelu_input", std::vector<size_t>{N, H}, itype);
Tensor output("output", {N, H}, otype, true, true);
Tensor output("output", std::vector<size_t>{N, H}, otype, true, true);
// dbias has the same data type with "output grad"
Tensor dbias("dbias", {H}, itype);
Tensor dbias("dbias", std::vector<size_t>{H}, itype);
fillUniform(&input);
fillUniform(&gelu_input);
......
......@@ -74,9 +74,9 @@ void performTest(const size_t N, const size_t H) {
DType itype = TypeInfo<IType>::dtype;
DType otype = TypeInfo<OType>::dtype;
Tensor grad("grad", {N, H}, itype);
Tensor input("input", {N, H * 2}, itype);
Tensor output("output", {N, H * 2}, otype, true, true);
Tensor grad("grad", std::vector<size_t>{N, H}, itype);
Tensor input("input", std::vector<size_t>{N, H * 2}, itype);
Tensor output("output", std::vector<size_t>{N, H * 2}, otype, true, true);
fillUniform(&grad);
fillUniform(&input);
......
......@@ -153,11 +153,11 @@ void performTest(
DType itype = TypeInfo<Type>::dtype;
Tensor data_in("data_in", { batches, heads, rows, cols }, itype);
Tensor softmax_out("softmax_out", { batches, heads, rows, cols }, itype);
Tensor softmax_in("softmax_in", { batches, heads, rows, cols }, itype);
Tensor grads_in("grads_in", { batches, heads, rows, cols }, itype);
Tensor grads_out("grads_out", { batches, heads, rows, cols }, itype);
Tensor data_in("data_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor softmax_out("softmax_out", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor softmax_in("softmax_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor grads_in("grads_in", std::vector<size_t>{ batches, heads, rows, cols }, itype);
Tensor grads_out("grads_out", std::vector<size_t>{ batches, heads, rows, cols }, itype);
const size_t elements_total = batches * heads * rows * cols;
std::unique_ptr<Type[]> softmax_out_ref = std::make_unique<Type[]>(elements_total);
......
......@@ -214,10 +214,10 @@ void performTest_x1(const size_t rows,
const size_t blocks_num = rowwise ? blocks_num_rowwise : blocks_num_colwise;
const size_t scales_stride = rowwise ? blocks_X_rowwise : blocks_X_colwise;
Tensor input("input", { rows, cols }, itype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
Tensor input("input", std::vector<size_t>{ rows, cols }, itype, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor output("output", { rows, cols }, otype, true, false);
Tensor output("output", std::vector<size_t>{ rows, cols }, otype, true, false);
std::unique_ptr<OutputType[]> ref_output = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<fp8e8m0[]> scales = std::make_unique<fp8e8m0[]>(blocks_num);
......@@ -267,11 +267,11 @@ void performTest_quantize_then_dequantize(const size_t rows,
// input --> quantized --> output (dequantized)
// input == output
Tensor input("input", { rows, cols }, in_type);
Tensor quantized("quantized", { rows, cols }, intermed_type, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
Tensor input("input", std::vector<size_t>{ rows, cols }, in_type);
Tensor quantized("quantized", std::vector<size_t>{ rows, cols }, intermed_type, rowwise, colwise, NVTE_MXFP8_1D_SCALING);
// Output data are written to the rowwise ptr regardless of the scaling direction
Tensor output("output", { rows, cols }, out_type, true, false);
Tensor output("output", std::vector<size_t>{ rows, cols }, out_type, true, false);
// fillCase<EncodingType>(&input, InputsFillCase::minNorm_to_maxNorm);
fillCase<EncodingType>(&input, InputsFillCase::uniform);
......@@ -333,8 +333,8 @@ void performTest_x2(const size_t rows,
const size_t blocks_num_rowwise = blocks_Y_rowwise * blocks_X_rowwise;
const size_t blocks_num_colwise = blocks_Y_colwise * blocks_X_colwise;
Tensor input("input", { rows, cols }, itype, true, true, NVTE_MXFP8_1D_SCALING);
Tensor output("output", { rows, cols }, otype);
Tensor input("input", std::vector<size_t>{ rows, cols }, itype, true, true, NVTE_MXFP8_1D_SCALING);
Tensor output("output", std::vector<size_t>{ rows, cols }, otype);
std::unique_ptr<OutputType[]> ref_output_rowwise = std::make_unique<OutputType[]>(rows * cols);
std::unique_ptr<OutputType[]> ref_output_colwise = std::make_unique<OutputType[]>(rows * cols);
......
/*************************************************************************
* Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include <cmath>
#include <cstring>
#include <memory>
#include <iomanip>
#include <iostream>
#include <random>
#include <type_traits>
#include <cuda_bf16.h>
#include <cuda_runtime.h>
#include <gtest/gtest.h>
#include <transformer_engine/transformer_engine.h>
#include "../test_common.h"
using namespace transformer_engine;
// Parameterized fixture for nvte_memset tests. Parameters:
//   <0> int    — byte value written by the memset,
//   <1> size_t — buffer size in bytes.
class MemsetTestSuite : public ::testing::TestWithParam<std::tuple<int,
size_t>> {};
// Verifies that nvte_memset fills exactly size_in_bytes bytes of device
// memory with the requested value.
TEST_P(MemsetTestSuite, TestMemset) {
  using namespace transformer_engine;
  using namespace test;

  const int value = std::get<0>(GetParam());
  const size_t size_in_bytes = std::get<1>(GetParam());

  // Initialize the host buffer to a value different from the memset value so
  // a no-op memset cannot accidentally pass the verification loop below.
  std::vector<uint8_t> h_buffer(size_in_bytes, static_cast<uint8_t>(value + 1));

  char* d_ptr = nullptr;
  NVTE_CHECK_CUDA(cudaMalloc(&d_ptr, size_in_bytes));
  NVTE_CHECK_CUDA(cudaMemcpy(d_ptr, h_buffer.data(), size_in_bytes, cudaMemcpyHostToDevice));

  nvte_memset(d_ptr, value, size_in_bytes, 0 /* stream */);

  NVTE_CHECK_CUDA(cudaMemcpy(
      h_buffer.data(), d_ptr, size_in_bytes, cudaMemcpyDeviceToHost));
  // Fix: synchronize BEFORE freeing the device buffer. The original freed
  // first and synchronized afterwards, which relied solely on the implicit
  // synchronization of the blocking cudaMemcpy above.
  NVTE_CHECK_CUDA(cudaDeviceSynchronize());
  NVTE_CHECK_CUDA(cudaFree(d_ptr));

  for (size_t i = 0; i < size_in_bytes; ++i) {
    EXPECT_EQ(h_buffer[i], static_cast<uint8_t>(value))
        << "Mismatch at index " << i << ": expected " << static_cast<int>(value)
        << ", got " << static_cast<int>(h_buffer[i]);
  }
}
namespace {

// Buffer sizes (in bytes) exercised by the memset tests: single/small sizes,
// non-power-of-two sizes (9, 4097), and larger power-of-two sizes, to cover
// both unaligned tails and bulk fills.
std::vector<size_t> memset_test_sizes = {
  1,
  4,
  9,
  16,
  128,
  4096,
  4097,
  8192,
};

}  // namespace
// Instantiate TestMemset over the cross product of memset values {0, 6} and
// all buffer sizes; generated test names have the form "<value>X<size>".
INSTANTIATE_TEST_SUITE_P(
    OperatorTest,
    MemsetTestSuite,
    ::testing::Combine(
        ::testing::Values(0, 6),
        ::testing::ValuesIn(memset_test_sizes)),
    [](const testing::TestParamInfo<MemsetTestSuite::ParamType>& param_info) {
      const auto& [memset_value, num_bytes] = param_info.param;
      return std::to_string(memset_value) + "X" + std::to_string(num_bytes);
    });
......@@ -81,9 +81,9 @@ void performTest() {
for (size_t tensor_id = 0; tensor_id < num_tensors; ++tensor_id) {
const size_t height = tensor_dims[tensor_id].first;
const size_t width = tensor_dims[tensor_id].second;
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), std::vector<size_t>{ height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id),
{ height, width }, otype, true, true));
std::vector<size_t>{ height, width }, otype, true, true));
auto& input = input_list.back();
auto& output = output_list.back();
......
......@@ -85,8 +85,8 @@ void performTest() {
const size_t height = tensor_dims[tensor_id].first;
const size_t width = tensor_dims[tensor_id].second;
const size_t padded_height = (height + align - 1) / align * align;
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), { height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id), { padded_height, width }, otype));
input_list.emplace_back(Tensor("input_" + std::to_string(tensor_id), std::vector<size_t>{ height, width }, itype));
output_list.emplace_back(Tensor("output_" + std::to_string(tensor_id), std::vector<size_t>{ padded_height, width }, otype));
auto& input = input_list.back();
auto& output = output_list.back();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment