Unverified commit eb34783c authored by Przemyslaw Tredak, committed by GitHub

Overhaul the compilation for the arch-specific features (#2279)



* Added sm_120f to the build
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Change the arch-specific handling
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Support for CUDA<12.9
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Moved through the rest of the files
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Common cases
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Remove pure 100 from the list
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* CMake changes (not yet working)
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Do not pass the arch-specific architectures from build_tools
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Moved some of the files to arch-specific compilation
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix, and change the order of compilation to hopefully lower the compilation time
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Fix for the files overwriting custom compile properties
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Actually make this whole thing work
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* Add space to the error message
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: Przemyslaw Tredak <ptrendx@gmail.com>

* Apply suggestions from code review
Co-authored-by: Oleg Goncharov <64355998+Oleg-Goncharov@users.noreply.github.com>
Signed-off-by: Przemyslaw Tredak <ptrendx@gmail.com>

* Fixes from review
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* Changing the naming to be more intuitive
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



* Add missing cassert include for device-side asserts
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci



---------
Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
Signed-off-by: Przemyslaw Tredak <ptrendx@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Oleg Goncharov <64355998+Oleg-Goncharov@users.noreply.github.com>
parent 66acb8e9
@@ -257,11 +257,9 @@ def cuda_archs() -> str:
     if archs is None:
         version = cuda_version()
         if version >= (13, 0):
-            archs = "75;80;89;90;100;100a;103a;120"
-        elif version >= (12, 9):
-            archs = "70;80;89;90;100;100a;103a;120"
+            archs = "75;80;89;90;100;120"
         elif version >= (12, 8):
-            archs = "70;80;89;90;100;100a;120"
+            archs = "70;80;89;90;100;120"
         else:
             archs = "70;80;89;90"
     return archs
......
@@ -5,15 +5,6 @@
 cmake_minimum_required(VERSION 3.21)

 # Language options
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
-    set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
-  elseif (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
-    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90 100 120)
-  else ()
-    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
-  endif()
-endif()
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CUDA_STANDARD_REQUIRED ON)
@@ -30,8 +21,62 @@ project(transformer_engine LANGUAGES CUDA CXX)

 # CUDA Toolkit
 find_package(CUDAToolkit REQUIRED)
-if (CUDAToolkit_VERSION VERSION_LESS 12.0)
-  message(FATAL_ERROR "CUDA 12.0+ is required, but found CUDA ${CUDAToolkit_VERSION}")
+if (CUDAToolkit_VERSION VERSION_LESS 12.1)
+  message(FATAL_ERROR "CUDA 12.1+ is required, but found CUDA ${CUDAToolkit_VERSION}")
 endif()

+# Process GPU architectures
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
+  elseif (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
+    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90 100 120)
+  else ()
+    set(CMAKE_CUDA_ARCHITECTURES 70 80 89 90)
+  endif()
+endif()
+
+# Process CMAKE_CUDA_ARCHITECTURES to separate generic and specific architectures
+set(NVTE_GENERIC_ARCHS)
+set(NVTE_SPECIFIC_ARCHS)
+
+# Check for architecture 100
+list(FIND CMAKE_CUDA_ARCHITECTURES "100" arch_100_index)
+if(NOT arch_100_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "100")
+  list(APPEND NVTE_GENERIC_ARCHS "100")
+  list(APPEND NVTE_SPECIFIC_ARCHS "100a")
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
+    list(APPEND NVTE_SPECIFIC_ARCHS "103a")
+  endif()
+endif()
+
+# Check for architecture 101 (if we see this we are in toolkit <= 12.9)
+list(FIND CMAKE_CUDA_ARCHITECTURES "101" arch_101_index)
+if(NOT arch_101_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "101")
+  list(APPEND NVTE_GENERIC_ARCHS "101")
+  list(APPEND NVTE_SPECIFIC_ARCHS "101a")
+endif()
+
+# Check for architecture 110 (if we see this we are in toolkit >= 13.0)
+list(FIND CMAKE_CUDA_ARCHITECTURES "110" arch_110_index)
+if(NOT arch_110_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "110")
+  list(APPEND NVTE_GENERIC_ARCHS "110")
+  list(APPEND NVTE_SPECIFIC_ARCHS "110f")
+endif()
+
+# Check for architecture 120
+list(FIND CMAKE_CUDA_ARCHITECTURES "120" arch_120_index)
+if(NOT arch_120_index EQUAL -1)
+  list(REMOVE_ITEM CMAKE_CUDA_ARCHITECTURES "120")
+  list(APPEND NVTE_GENERIC_ARCHS "120")
+  if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9)
+    list(APPEND NVTE_SPECIFIC_ARCHS "120f")
+  else()
+    list(APPEND NVTE_SPECIFIC_ARCHS "120a")
+  endif()
+endif()
+
 # cuDNN frontend API
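
With this split, generic sources are built only for the base targets in NVTE_GENERIC_ARCHS, while arch-specific sources additionally get the `a`/`f` variants in NVTE_SPECIFIC_ARCHS (e.g. `100` becomes `100a` plus, on CUDA 12.9+, `103a`). On the device side, the kernels later in this diff then branch on compile-time flags such as `ARCH_HAS_STOCHASTIC_ROUNDING` and `ARCH_BLACKWELL_FAMILY`, whose definitions are not part of this diff. A minimal sketch of how such flags can be derived — an assumption for illustration, not the repo's actual header; the `__CUDA_ARCH_FEAT_SM*_ALL` macros are the ones nvcc defines only when compiling for an arch-specific target like sm_100a (family-specific `f` targets have analogous macros on CUDA 12.9+):

```cpp
// Hedged sketch: plausible definitions for the constexpr-usable arch flags
// used by the CUDA diffs below. The real definitions live elsewhere in the
// repo's common headers.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && (__CUDA_ARCH__ < 1300)
#define ARCH_BLACKWELL_FAMILY true  // any Blackwell-generation target (sm_100..sm_12x)
#else
#define ARCH_BLACKWELL_FAMILY false
#endif

// nvcc defines __CUDA_ARCH_FEAT_SMxxx_ALL only for sm_xxxa builds, which is
// exactly when SM10x-only PTX such as cvt.rs (stochastic rounding) is legal.
#if defined(__CUDA_ARCH_FEAT_SM100_ALL) || defined(__CUDA_ARCH_FEAT_SM101_ALL) || \
    defined(__CUDA_ARCH_FEAT_SM103_ALL)
#define ARCH_HAS_STOCHASTIC_ROUNDING true
#else
#define ARCH_HAS_STOCHASTIC_ROUNDING false
#endif
```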
@@ -78,9 +123,28 @@ endif()

 # Configure Transformer Engine library
 include_directories(${PROJECT_SOURCE_DIR}/..)

 set(transformer_engine_SOURCES)
-list(APPEND transformer_engine_SOURCES
+set(transformer_engine_cpp_sources)
+set(transformer_engine_cuda_sources)
+set(transformer_engine_cuda_arch_specific_sources)
+
+list(APPEND transformer_engine_cpp_sources
      cudnn_utils.cpp
      transformer_engine.cpp
+     fused_attn/fused_attn.cpp
+     gemm/config.cpp
+     normalization/common.cpp
+     normalization/layernorm/ln_api.cpp
+     normalization/rmsnorm/rmsnorm_api.cpp
+     util/cuda_driver.cpp
+     util/cuda_nvml.cpp
+     util/cuda_runtime.cpp
+     util/multi_stream.cpp
+     util/rtc.cpp
+     comm_gemm_overlap/userbuffers/ipcsocket.cc
+     comm_gemm_overlap/userbuffers/userbuffers-host.cpp
+     comm_gemm_overlap/comm_gemm_overlap.cpp)
+
+list(APPEND transformer_engine_cuda_sources
      common.cu
      multi_tensor/adam.cu
      multi_tensor/compute_scale.cu
@@ -92,40 +156,23 @@ list(APPEND transformer_engine_SOURCES
      transpose/cast_transpose_fusion.cu
      transpose/transpose_fusion.cu
      transpose/multi_cast_transpose.cu
-     transpose/quantize_transpose_square_blockwise.cu
      transpose/quantize_transpose_vector_blockwise.cu
      transpose/swap_first_dims.cu
-     transpose/quantize_transpose_vector_blockwise_fp4.cu
-     activation/gelu.cu
      dropout/dropout.cu
      fused_attn/flash_attn.cu
      fused_attn/context_parallel.cu
      fused_attn/kv_cache.cu
      fused_attn/fused_attn_f16_max512_seqlen.cu
      fused_attn/fused_attn_f16_arbitrary_seqlen.cu
-     activation/relu.cu
-     activation/swiglu.cu
      fused_attn/fused_attn_fp8.cu
-     fused_attn/fused_attn.cpp
      fused_attn/utils.cu
-     gemm/config.cpp
      gemm/cublaslt_gemm.cu
-     gemm/cutlass_grouped_gemm.cu
-     normalization/common.cpp
-     normalization/layernorm/ln_api.cpp
      normalization/layernorm/ln_bwd_semi_cuda_kernel.cu
      normalization/layernorm/ln_fwd_cuda_kernel.cu
-     normalization/rmsnorm/rmsnorm_api.cpp
      normalization/rmsnorm/rmsnorm_bwd_semi_cuda_kernel.cu
      normalization/rmsnorm/rmsnorm_fwd_cuda_kernel.cu
      permutation/permutation.cu
-     util/cast.cu
      util/padding.cu
-     util/cuda_driver.cpp
-     util/cuda_nvml.cpp
-     util/cuda_runtime.cpp
-     util/multi_stream.cpp
-     util/rtc.cpp
      swizzle/swizzle.cu
      swizzle/swizzle_block_scaling.cu
      fused_softmax/scaled_masked_softmax.cu
@@ -139,12 +186,58 @@
      recipe/delayed_scaling.cu
      recipe/fp8_block_scaling.cu
      recipe/nvfp4.cu
-     hadamard_transform/hadamard_transform.cu
-     hadamard_transform/hadamard_transform_cast_fusion.cu
-     comm_gemm_overlap/userbuffers/ipcsocket.cc
-     comm_gemm_overlap/userbuffers/userbuffers-host.cpp
-     comm_gemm_overlap/userbuffers/userbuffers.cu
-     comm_gemm_overlap/comm_gemm_overlap.cpp)
+     comm_gemm_overlap/userbuffers/userbuffers.cu)
+
+list(APPEND transformer_engine_cuda_arch_specific_sources
+     gemm/cutlass_grouped_gemm.cu
+     util/cast.cu
+     activation/gelu.cu
+     activation/relu.cu
+     activation/swiglu.cu
+     transpose/quantize_transpose_square_blockwise.cu
+     transpose/quantize_transpose_vector_blockwise_fp4.cu
+     hadamard_transform/hadamard_transform.cu
+     hadamard_transform/hadamard_transform_cast_fusion.cu)
+
+# Compiling the files with the worst compilation time first to hopefully overlap
+# better with the faster-compiling cpp files
+list(APPEND transformer_engine_SOURCES ${transformer_engine_cuda_arch_specific_sources}
+                                       ${transformer_engine_cuda_sources}
+                                       ${transformer_engine_cpp_sources})
+
+# Set compile options for CUDA sources with generic architectures
+foreach(cuda_source IN LISTS transformer_engine_cuda_sources)
+  set(arch_compile_options)
+  foreach(arch IN LISTS NVTE_GENERIC_ARCHS)
+    list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
+  endforeach()
+  if(arch_compile_options)
+    set_property(
+      SOURCE ${cuda_source}
+      APPEND
+      PROPERTY
+      COMPILE_OPTIONS ${arch_compile_options}
+    )
+  endif()
+endforeach()
+
+# Set compile options for CUDA sources with specific architectures
+foreach(cuda_source IN LISTS transformer_engine_cuda_arch_specific_sources)
+  set(arch_compile_options)
+  foreach(arch IN LISTS NVTE_SPECIFIC_ARCHS)
+    list(APPEND arch_compile_options "--generate-code=arch=compute_${arch},code=sm_${arch}")
+  endforeach()
+  if(arch_compile_options)
+    set_property(
+      SOURCE ${cuda_source}
+      APPEND
+      PROPERTY
+      COMPILE_OPTIONS ${arch_compile_options}
+    )
+  endif()
+endforeach()
+
 if (NVTE_WITH_CUBLASMP)
   list(APPEND transformer_engine_SOURCES
@@ -249,28 +342,35 @@ target_include_directories(transformer_engine PRIVATE
                            "${CMAKE_CURRENT_BINARY_DIR}/string_headers")

 # Compiler options
-set_source_files_properties(fused_softmax/scaled_masked_softmax.cu
-                            fused_softmax/scaled_upper_triang_masked_softmax.cu
-                            fused_softmax/scaled_aligned_causal_masked_softmax.cu
-                            multi_tensor/adam.cu
-                            multi_tensor/compute_scale.cu
-                            multi_tensor/l2norm.cu
-                            multi_tensor/scale.cu
-                            multi_tensor/sgd.cu
-                            fused_attn/flash_attn.cu
-                            fused_attn/context_parallel.cu
-                            fused_attn/kv_cache.cu
-                            PROPERTIES
-                            COMPILE_OPTIONS "--use_fast_math")
+set(nvte_sources_with_fast_math)
+list(APPEND nvte_sources_with_fast_math fused_softmax/scaled_masked_softmax.cu
+                                        fused_softmax/scaled_upper_triang_masked_softmax.cu
+                                        fused_softmax/scaled_aligned_causal_masked_softmax.cu
+                                        multi_tensor/adam.cu
+                                        multi_tensor/compute_scale.cu
+                                        multi_tensor/l2norm.cu
+                                        multi_tensor/scale.cu
+                                        multi_tensor/sgd.cu
+                                        fused_attn/flash_attn.cu
+                                        fused_attn/context_parallel.cu
+                                        fused_attn/kv_cache.cu)
 option(NVTE_BUILD_ACTIVATION_WITH_FAST_MATH "Compile activation kernels with --use_fast_math option" OFF)
 if (NVTE_BUILD_ACTIVATION_WITH_FAST_MATH)
-  set_source_files_properties(activation/gelu.cu
-                              activation/relu.cu
-                              activation/swiglu.cu
-                              util/cast.cu
-                              PROPERTIES
-                              COMPILE_OPTIONS "--use_fast_math")
+  list(APPEND nvte_sources_with_fast_math activation/gelu.cu
+                                          activation/relu.cu
+                                          activation/swiglu.cu
+                                          util/cast.cu)
 endif()
+
+foreach(cuda_source IN LISTS nvte_sources_with_fast_math)
+  set_property(
+    SOURCE ${cuda_source}
+    APPEND
+    PROPERTY
+    COMPILE_OPTIONS "--use_fast_math")
+endforeach()

 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3")
......
@@ -97,22 +97,23 @@ cutlass::Array<cutlass::float_e2m1_t, 8>
 StochasticNumericConverterBase(cutlass::Array<float, 8> const &input, cutlass::Array<uint32_t, 2> const &rbits) {
   using result_type = cutlass::Array<cutlass::float_e2m1_t, 8>;
   result_type output;
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  auto output_ptr = reinterpret_cast<uint16_t *>(&output);
-  asm volatile( \
-      "{\n" \
-      "cvt.rs.satfinite.e2m1x4.f32 %0, {%5, %4, %3, %2}, %10;\n" \
-      "cvt.rs.satfinite.e2m1x4.f32 %1, {%9, %8, %7, %6}, %11;\n" \
-      "}" \
-      : "=h"(output_ptr[0]),
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    auto output_ptr = reinterpret_cast<uint16_t *>(&output);
+    asm volatile( \
+        "{\n" \
+        "cvt.rs.satfinite.e2m1x4.f32 %0, {%5, %4, %3, %2}, %10;\n" \
+        "cvt.rs.satfinite.e2m1x4.f32 %1, {%9, %8, %7, %6}, %11;\n" \
+        "}" \
+        : "=h"(output_ptr[0]),
         "=h"(output_ptr[1])
-      : "f"(input[0]), "f"(input[1]), "f"(input[2]), "f"(input[3]),
+        : "f"(input[0]), "f"(input[1]), "f"(input[2]), "f"(input[3]),
         "f"(input[4]), "f"(input[5]), "f"(input[6]), "f"(input[7]),
         "r"(rbits[0]), "r"(rbits[1]));
-#else
-  NVTE_DEVICE_ERROR("FP4 cvt PTX instructions are architecture-specific. "
-                    "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  } else {
+    NVTE_DEVICE_ERROR("FP4 cvt PTX instructions are architecture-specific. "
+                      "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return output;
 }
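
Why the `#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL` → `if constexpr` rewrite is safe: both branches must still parse and type-check, but only the selected branch is lowered to PTX, and an `asm` string is opaque to the C++ front end, so ptxas never sees the sm_XXXa-only `cvt.rs` opcode when the file is built for a generic target. One source path now serves both the generic and the arch-specific fatbin entries. A stripped-down illustration of the pattern, reusing the assumed flag from the sketch above:

```cpp
#include <cstdint>

// Minimal sketch of the guard pattern; ARCH_HAS_STOCHASTIC_ROUNDING is a
// compile-time constant, so exactly one branch is emitted per -arch target.
__device__ uint16_t quantize_fp4x4(float2 in01, float2 in23, uint32_t rbits) {
  uint16_t out = 0;
  if constexpr (ARCH_HAS_STOCHASTIC_ROUNDING) {
    // Same instruction as in the diff; the branch is discarded (not merely
    // skipped at runtime) on generic targets, so ptxas never parses cvt.rs.
    asm volatile("cvt.rs.satfinite.e2m1x4.f32 %0, {%3, %4, %1, %2}, %5;"
                 : "=h"(out)
                 : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x),
                   "r"(rbits));
  } else {
    __trap();  // the real code calls NVTE_DEVICE_ERROR here
  }
  return out;
}
```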
......
@@ -264,48 +264,50 @@ __device__ __forceinline__ size_t scale_factor_swizzled_offset(size_t row_idx, s

 __device__ __forceinline__ __nv_fp4x4_e2m1 cvt_fp32_to_fp4_4x_with_stochastic_rounding(
     const float2 in01, const float2 in23, const uint32_t rbits) {
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  uint16_t out_4x;
-  asm volatile(
-      "{\n"
-      "cvt.rs.satfinite.e2m1x4.f32 %0, {%3, %4, %1, %2}, %5; \n\t"
-      "}"
-      : "=h"(out_4x)
-      : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x), "r"(rbits));
-  return *reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x);
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-  uint16_t dummy = 0;
-  return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    uint16_t out_4x;
+    asm volatile(
+        "{\n"
+        "cvt.rs.satfinite.e2m1x4.f32 %0, {%3, %4, %1, %2}, %5; \n\t"
+        "}"
+        : "=h"(out_4x)
+        : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x), "r"(rbits));
+    return *reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x);
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt.rs PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+    uint16_t dummy = 0;
+    return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
+  }
 }

 __device__ __forceinline__ __nv_fp4x4_e2m1 cvt_fp32_to_fp4_4x_with_rn(const float2 in01,
                                                                       const float2 in23,
                                                                       const uint32_t rbits) {
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  // NOTE: rbits unused for rn.
-  uint32_t out_4x;  // Only need 16 bit. Using 32 bit container for packing.
-  asm volatile(
-      "{\n"
-      ".reg.b8 f0; \n\t"
-      ".reg.b8 f1; \n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f0, %1, %2;\n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f1, %3, %4;\n\t"
-      "mov.b32 %0, {f0, f1, f0, f1};\n\t"
-      "}"
-      : "=r"(out_4x)
-      : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x));
-  return reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x)[0];
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-  uint16_t dummy = 0;
-  return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_fp4 = ARCH_BLACKWELL_FAMILY;
+  if constexpr (has_fp4) {
+    // NOTE: rbits unused for rn.
+    uint32_t out_4x;  // Only need 16 bit. Using 32 bit container for packing.
+    asm volatile(
+        "{\n"
+        ".reg.b8 f0; \n\t"
+        ".reg.b8 f1; \n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f0, %1, %2;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f1, %3, %4;\n\t"
+        "mov.b32 %0, {f0, f1, f0, f1};\n\t"
+        "}"
+        : "=r"(out_4x)
+        : "f"(in01.y), "f"(in01.x), "f"(in23.y), "f"(in23.x));
+    return reinterpret_cast<__nv_fp4x4_e2m1*>(&out_4x)[0];
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+    uint16_t dummy = 0;
+    return *reinterpret_cast<__nv_fp4x4_e2m1*>(&dummy);
+  }
 }

 template <bool kApplyStochasticRounding>
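
The `template <bool kApplyStochasticRounding>` context line that ends this hunk belongs to the compile-time dispatcher over the two converters above. For orientation, a hedged sketch of what that dispatch looks like (compare `mul_cvt_fp32_to_fp4_4x` further down in this diff; this is not the verbatim repo code):

```cpp
// Sketch of the compile-time dispatch: the template parameter selects the
// rounding mode, so the unused converter is never emitted for this instantiation.
template <bool kApplyStochasticRounding>
__device__ __forceinline__ __nv_fp4x4_e2m1 cvt_fp32_to_fp4_4x(
    const float2 in01, const float2 in23, const uint32_t rbits) {
  if constexpr (kApplyStochasticRounding) {
    return cvt_fp32_to_fp4_4x_with_stochastic_rounding(in01, in23, rbits);
  } else {
    return cvt_fp32_to_fp4_4x_with_rn(in01, in23, rbits);  // rbits ignored
  }
}
```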
......
@@ -15,10 +15,9 @@
 #include <cudaTypedefs.h>
 #include <cuda_runtime.h>

-#if CUDA_VERSION > 12080
+#if FP4_TYPE_SUPPORTED
 #include <cuda_fp4.h>
-#endif  // CUDA_VERSION > 12080
-
+#endif  // FP4_TYPE_SUPPORTED
 #include <cfloat>

 #include "../common.h"
@@ -30,7 +29,7 @@
 namespace transformer_engine {

-#if CUDA_VERSION > 12080
+#if FP4_TYPE_SUPPORTED

 namespace nvfp4_transpose {

 using RNG = decltype(curanddx::Generator<curanddx::philox4_32>() + curanddx::PhiloxRounds<10>() +
@@ -152,89 +151,89 @@ __device__ __forceinline__ uint32_t get_rbits(RNG &rng, uint4 &random_uint4, int
   return rbits;
 }

-#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-
 __device__ __forceinline__ fp4e2m1x4 mul_cvt_bf16_to_fp4_4x_with_stochastic_rounding(
     const uint64_t in_4x, const float2 scale, const uint32_t rbits) {
   uint16_t out_4x = 0;
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b16 v0_bf16; \n\t"
-      ".reg.b16 v1_bf16; \n\t"
-      ".reg.b16 v2_bf16; \n\t"
-      ".reg.b16 v3_bf16; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
-      "cvt.f32.bf16 v0, v0_bf16; \n\t"
-      "cvt.f32.bf16 v1, v1_bf16; \n\t"
-      "cvt.f32.bf16 v2, v2_bf16; \n\t"
-      "cvt.f32.bf16 v3, v3_bf16; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %3; \n\t"  // mind the shuffled elements order
-      "}"
-      : "=h"(out_4x)
-      : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b16 v0_bf16; \n\t"
+        ".reg.b16 v1_bf16; \n\t"
+        ".reg.b16 v2_bf16; \n\t"
+        ".reg.b16 v3_bf16; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
+        "cvt.f32.bf16 v0, v0_bf16; \n\t"
+        "cvt.f32.bf16 v1, v1_bf16; \n\t"
+        "cvt.f32.bf16 v2, v2_bf16; \n\t"
+        "cvt.f32.bf16 v3, v3_bf16; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %3; \n\t"  // mind the shuffled elements order
+        "}"
+        : "=h"(out_4x)
+        : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return *reinterpret_cast<fp4e2m1x4 *>(&out_4x);
 }

 __device__ __forceinline__ fp4e2m1x4 mul_cvt_bf16_to_fp4_4x_with_rn(const uint64_t in_4x,
                                                                     const float2 scale,
                                                                     const uint32_t rbits) {
-  // NOTE: rbits unused for rn.
+  constexpr bool is_blackwell = ARCH_BLACKWELL_FAMILY;
   uint32_t out_4x = 0;  // Only need 16 bit. Using 32 bit container for packing.
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b16 v0_bf16; \n\t"
-      ".reg.b16 v1_bf16; \n\t"
-      ".reg.b16 v2_bf16; \n\t"
-      ".reg.b16 v3_bf16; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      ".reg.b8 f0; \n\t"
-      ".reg.b8 f1; \n\t"
-      "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
-      "cvt.f32.bf16 v0, v0_bf16; \n\t"
-      "cvt.f32.bf16 v1, v1_bf16; \n\t"
-      "cvt.f32.bf16 v2, v2_bf16; \n\t"
-      "cvt.f32.bf16 v3, v3_bf16; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
-      "mov.b32 %0, {f0, f1, f0, f1};\n\t"
-      "}"
-      : "=r"(out_4x)
-      : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  if constexpr (is_blackwell) {
+    // NOTE: rbits unused for rn.
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b16 v0_bf16; \n\t"
+        ".reg.b16 v1_bf16; \n\t"
+        ".reg.b16 v2_bf16; \n\t"
+        ".reg.b16 v3_bf16; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        ".reg.b8 f0; \n\t"
+        ".reg.b8 f1; \n\t"
+        "mov.b64 {v0_bf16, v1_bf16, v2_bf16, v3_bf16} , %1; \n\t"
+        "cvt.f32.bf16 v0, v0_bf16; \n\t"
+        "cvt.f32.bf16 v1, v1_bf16; \n\t"
+        "cvt.f32.bf16 v2, v2_bf16; \n\t"
+        "cvt.f32.bf16 v3, v3_bf16; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %2; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %2; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
+        "mov.b32 %0, {f0, f1, f0, f1};\n\t"
+        "}"
+        : "=r"(out_4x)
+        : "l"(in_4x), "l"(reinterpret_cast<const uint64_t &>(scale)));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return reinterpret_cast<fp4e2m1x4 *>(&out_4x)[0];
 }
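
Every fallback branch funnels into `NVTE_DEVICE_ERROR`, which must be callable from device code — this is what the "Add missing cassert include for device-side asserts" commit above is about. The macro's definition is not part of this diff; a hedged sketch of what a device-side error macro of this kind can look like:

```cpp
#include <cassert>
#include <cstdio>

// Hedged sketch only, not the repo's definition. Device-side printf plus
// __trap() aborts the running kernel; assert(false) is the <cassert>-based
// alternative (compiled out under -DNDEBUG). Adjacent string literals in the
// callers concatenate into a single msg argument, so one %s suffices.
#define DEVICE_ERROR_SKETCH(msg)                           \
  do {                                                     \
    printf("Device error in %s: %s\n", __func__, (msg));   \
    __trap(); /* terminate the kernel */                   \
  } while (0)
```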
@@ -252,34 +251,35 @@ __device__ __forceinline__ fp4e2m1x4 mul_cvt_bf16_to_fp4_4x(const uint64_t in_4x

 __device__ __forceinline__ fp4e2m1x4 mul_cvt_fp32_to_fp4_4x_with_stochastic_rounding(
     const float2 in01, const float2 in23, const float2 scale, const uint32_t rbits) {
   uint16_t out_4x = 0;
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      "mov.b64 {v0, v1} , %1; \n\t"
-      "mov.b64 {v2, v3} , %2; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %4; \n\t"  // mind the shuffled elements order
-      "}"
-      : "=h"(out_4x)
-      : "l"(reinterpret_cast<const uint64_t &>(in01)),
-        "l"(reinterpret_cast<const uint64_t &>(in23)),
-        "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  constexpr bool has_rs = ARCH_HAS_STOCHASTIC_ROUNDING;
+  if constexpr (has_rs) {
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        "mov.b64 {v0, v1} , %1; \n\t"
+        "mov.b64 {v2, v3} , %2; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rs.satfinite.e2m1x4.f32 %0, {v2, v3, v0, v1}, %4; \n\t"  // mind the shuffled elements order
+        "}"
+        : "=h"(out_4x)
+        : "l"(reinterpret_cast<const uint64_t &>(in01)),
+          "l"(reinterpret_cast<const uint64_t &>(in23)),
+          "l"(reinterpret_cast<const uint64_t &>(scale)), "r"(rbits));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return *reinterpret_cast<fp4e2m1x4 *>(&out_4x);
 }
@@ -287,40 +287,41 @@ __device__ __forceinline__ fp4e2m1x4 mul_cvt_fp32_to_fp4_4x_with_rn(const float2
                                                                      const float2 in23,
                                                                      const float2 scale,
                                                                      const uint32_t rbits) {
-  // NOTE: rbits unused for rn.
+  constexpr bool is_blackwell = ARCH_BLACKWELL_FAMILY;
   uint32_t out_4x = 0;  // Only need 16 bit. Using 32 bit container for packing.
-#if CUDA_ARCH_HAS_FEATURE_SM10X_ALL
-  asm volatile(
-      "{\n"
-      ".reg.b64 v01; \n\t"
-      ".reg.b64 v23; \n\t"
-      ".reg.b32 v0; \n\t"
-      ".reg.b32 v1; \n\t"
-      ".reg.b32 v2; \n\t"
-      ".reg.b32 v3; \n\t"
-      ".reg.b8 f0; \n\t"
-      ".reg.b8 f1; \n\t"
-      "mov.b64 {v0, v1} , %1; \n\t"
-      "mov.b64 {v2, v3} , %2; \n\t"
-      "mov.b64 v01, {v0, v1}; \n\t"
-      "mov.b64 v23, {v2, v3}; \n\t"
-      "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
-      "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
-      "mov.b64 {v1, v0}, v01; \n\t"
-      "mov.b64 {v3, v2}, v23; \n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
-      "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
-      "mov.b32 %0, {f0, f1, f0, f1};\n\t"
-      "}"
-      : "=r"(out_4x)
-      : "l"(reinterpret_cast<const uint64_t &>(in01)),
-        "l"(reinterpret_cast<const uint64_t &>(in23)),
-        "l"(reinterpret_cast<const uint64_t &>(scale)));
-#else
-  NVTE_DEVICE_ERROR(
-      "FP4 cvt PTX instructions are architecture-specific. "
-      "Try recompiling with sm_XXXa instead of sm_XXX.");
-#endif  // CUDA_ARCH_HAS_FEATURE_SM10X_ALL
+  if constexpr (is_blackwell) {
+    // NOTE: rbits unused for rn.
+    asm volatile(
+        "{\n"
+        ".reg.b64 v01; \n\t"
+        ".reg.b64 v23; \n\t"
+        ".reg.b32 v0; \n\t"
+        ".reg.b32 v1; \n\t"
+        ".reg.b32 v2; \n\t"
+        ".reg.b32 v3; \n\t"
+        ".reg.b8 f0; \n\t"
+        ".reg.b8 f1; \n\t"
+        "mov.b64 {v0, v1} , %1; \n\t"
+        "mov.b64 {v2, v3} , %2; \n\t"
+        "mov.b64 v01, {v0, v1}; \n\t"
+        "mov.b64 v23, {v2, v3}; \n\t"
+        "mul.f32x2 v01, v01, %3; \n\t"  // mind the shuffled elements order
+        "mul.f32x2 v23, v23, %3; \n\t"  // mind the shuffled elements order
+        "mov.b64 {v1, v0}, v01; \n\t"
+        "mov.b64 {v3, v2}, v23; \n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f0, v0, v1;\n\t"
+        "cvt.rn.satfinite.e2m1x2.f32 f1, v2, v3;\n\t"
+        "mov.b32 %0, {f0, f1, f0, f1};\n\t"
+        "}"
+        : "=r"(out_4x)
+        : "l"(reinterpret_cast<const uint64_t &>(in01)),
+          "l"(reinterpret_cast<const uint64_t &>(in23)),
+          "l"(reinterpret_cast<const uint64_t &>(scale)));
+  } else {
+    NVTE_DEVICE_ERROR(
+        "FP4 cvt PTX instructions are architecture-specific. "
+        "Try recompiling with sm_XXXa instead of sm_XXX.");
+  }
   return reinterpret_cast<fp4e2m1x4 *>(&out_4x)[0];
 }
@@ -335,8 +336,6 @@ __device__ __forceinline__ fp4e2m1x4 mul_cvt_fp32_to_fp4_4x(const float2 in01, c
   }
 }

-#endif  // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
-
 template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &),
           typename IType, bool USE_STOCHASTIC_ROUNDING, bool RETURN_TRANSPOSE>
 __global__ void __launch_bounds__(THREADS_NUM)
@@ -1380,18 +1379,13 @@
 #endif  // #if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000)
 }

 }  // namespace nvfp4_transpose

-#endif  // CUDA_VERSION > 12080
-
-// Compile-time flag to choose kernel variant
-#ifndef USE_2D_NVFP4_KERNEL
-#define USE_2D_NVFP4_KERNEL 0
-#endif
+#endif  // FP4_TYPE_SUPPORTED

 template <bool COMPUTE_ACTIVATIONS, typename ParamOP, float (*OP)(float, const ParamOP &),
           bool use_2d_quantization>
 void nvfp4_quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
                               const QuantizationConfig *quant_config, cudaStream_t stream) {
-#if CUDA_VERSION > 12080
+#if FP4_TYPE_SUPPORTED
   bool use_stochastic_rounding = quant_config ? quant_config->stochastic_rounding : false;

   // If transposed output is allocated, return the transposed data. Otherwise, it's not necesary to
@@ -1509,7 +1503,7 @@
       }););
 #else
   NVTE_ERROR("FP4 support requires CUDA 12.8+, but compile-time CUDA version is ", CUDA_VERSION);
-#endif  // CUDA_VERSION > 12080
+#endif  // FP4_TYPE_SUPPORTED
 }

 }  // namespace transformer_engine
......
This diff is collapsed.
@@ -16,6 +16,7 @@
 #endif

 #if !defined(__CUDACC_RTC__)
+#include <cassert>
 #include <cstdint>
 #else
 // Importing C++ standard headers is a pain with NVRTC
......