Merge branch 'develop_v2.10' into release_v2.10

bd05b0dc · wenjh · 7aeb5a72 · 8fc9d8f1 · bd05b0dc · bd05b0dc
Commit bd05b0dc authored Jan 23, 2026 by wenjh
3 changed files
--- a/tests/cpp/test_common.cu
+++ b/tests/cpp/test_common.cu
@@ -21,6 +21,8 @@
 #include <transformer_engine/transformer_engine.h>
 #include "util/logging.h"
+#define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080)
 namespace test {
 size_t create_seed_from_tensor_name(const std::string& tensor_name) {
@@ -343,10 +345,15 @@ Tensor::Tensor(const std::string& name,
    }
  }
+#if FP4_TYPE_SUPPORTED
  const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
  const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type;
  tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape);
  tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape);
+#else
+  tensor_.set_rowwise_data(dptr_rowwise, type, shape);
+  tensor_.set_columnwise_data(dptr_columnwise, type, columnwise_shape);
+#endif
  if (isFp8Type(type) || isFp4Type(type)) {
    if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) {
@@ -1041,7 +1048,11 @@ bool isFp8Type(DType type) {
 }
 bool isFp4Type(DType type) {
+#if FP4_TYPE_SUPPORTED  // defined in this file as (CUDA_VERSION >= 12080)
   return type == DType::kFloat4E2M1;
+#else  // CUDA_VERSION < 12080: no DType is reported as FP4
+  return false;
+#endif  // FP4_TYPE_SUPPORTED
 }
 int32_t getDeviceComputeCapability() {

--- a/tests/cpp/test_common.h
+++ b/tests/cpp/test_common.h
@@ -112,7 +112,9 @@ struct TypeInfo {
    struct Helper {
        constexpr static DType getType() {
            constexpr int i = static_cast<int>(current);
-            if (std::is_same<U, typename std::tuple_element<i, types>::type>::value) {
+	    if constexpr (i >= std::tuple_size_v<types>) {
+                return DType::kNumTypes;
+	    } else if (std::is_same<U, typename std::tuple_element<i, types>::type>::value) {
                return current;
            } else {
                return Helper<U, static_cast<DType>(i + 1)>::getType();

--- a/transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
+++ b/transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
@@ -1519,6 +1519,7 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
                    size_t smem_bytes = kSMemSize_Colwise * sizeof(InputType);
                    const size_t num_blocks_x = DIVUP(row_length, (size_t)(block_len / 2));
                    const size_t num_blocks_y = DIVUP(num_rows, (size_t)(block_len));
+                    dim3 grid(num_blocks_x, num_blocks_y, 1);
                    if (smem_bytes >= 48 * 1024) {
                      cudaError_t err = cudaFuncSetAttribute(
                          (const void*)&block_scaled_1d_cast_transpose_kernel_colwise<