Commit ab122dac authored by yuguo's avatar yuguo
Browse files

[DCU] compile pass

parent 4c6a5a27
......@@ -58,6 +58,7 @@ def setup_pytorch_extension(
"-U__HIP_NO_BFLOAT16_CONVERSIONS__",
"-U__HIP_NO_BFLOAT162_OPERATORS__",
"-U__HIP_NO_BFLOAT162_CONVERSIONS__",
"-w",
]
else:
nvcc_flags = [
......
{
"custom_map" : {
"common/util/vectorized_pointwise.h" : "common/util/vectorized_pointwise_hip.h",
"common/common.h" : "common/common_hip.h",
"/userbuffers.h" : "/userbuffers_hip.h",
"/logging.h" : "/logging_hip.h",
"/system.h" : "/system_hip.h",
"<cuda_bf16.h>" : "<hip/hip_bf16.h>",
"<cuda_fp8.h>" : "\"amd_detail/hip_float8.h\"",
"CUfunc_cache" : "hipFuncCache_t",
......
......@@ -3,6 +3,7 @@
# See LICENSE for license information.
"""Installation script."""
# NVTE_FRAMEWORK=pytorch NVTE_USE_ROCM=1 NVTE_USE_HIPBLASLT=1 NVTE_USE_ROCBLAS=0 CMAKE_PREFIX_PATH=/opt/dtk/lib/cmake/amd_comgr/ MPI_HOME=/opt/mpi/ NVTE_UB_WITH_MPI=1 CXX=hipcc pip3 install . -v
import os
import sys
......@@ -43,7 +44,10 @@ elif "jax" in frameworks:
CMakeBuildExtension = get_build_ext(BuildExtension)
archs = cuda_archs()
if rocm_build():
archs = None
else:
archs = cuda_archs()
class TimedBdist(bdist_wheel):
......
......@@ -226,11 +226,9 @@ else()
add_library(transformer_engine SHARED ${te_hip_sources})
endif()
target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")
# Configure dependencies
if (USE_CUDA)
target_include_directories(transformer_engine PUBLIC
"${CMAKE_CURRENT_SOURCE_DIR}/include")
# Configure dependencies
target_link_libraries(transformer_engine PUBLIC
CUDA::cublas
......@@ -239,6 +237,7 @@ if (USE_CUDA)
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_include_directories(transformer_engine PRIVATE "${CUDNN_FRONTEND_INCLUDE_DIR}")
else()
target_include_directories(transformer_engine PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}")
# Aotriton is currently unsupported
set(AotritonAndCk_fused_attn "unsupported")
......@@ -343,7 +342,7 @@ else()
set(HIP_HCC_FLAGS "${CMAKE_HIP_FLAGS} -mavx2 -mf16c -mfma -std=c++17")
# Ask hcc to generate device code during compilation so we can use
# host linker to link.
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fno-gpu-rdc -Wno-defaulted-function-deleted")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fno-gpu-rdc -w")
foreach(rocm_arch ${CMAKE_HIP_ARCHITECTURES})
# if CMAKE_CXX_FLAGS has --offload-arch set already, better to rm first
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} --offload-arch=${rocm_arch}")
......
......@@ -4,6 +4,9 @@
* License for AMD contributions = MIT. See LICENSE for more information
************************************************************************/
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bf16.h>
namespace hip_f8_impl {
HIP_HOST_DEVICE inline int clz(uint32_t x) {
......@@ -190,7 +193,7 @@ HIP_HOST_DEVICE
T cast_from_f8(uint8_t x) {
constexpr bool is_half = std::is_same<T,__half>::value;
constexpr bool is_float = std::is_same<T,float>::value;
constexpr bool is_bf16 = std::is_same<T,hip_bfloat16>::value;
constexpr bool is_bf16 = std::is_same<T,__hip_bfloat16>::value;
static_assert(is_half || is_float, "only half and float are supported");
constexpr int weo = is_half ? 5 : 8;
......
......@@ -326,7 +326,7 @@ struct hip_f8 {
#endif // #ifdef __gfx942__
// constructor from hip_bfloat16
explicit HIP_HOST_DEVICE hip_f8(hip_bfloat16 v, hip_f8_rounding_mode r=hip_f8_rounding_mode::standard, uint32_t rng=0);
explicit HIP_HOST_DEVICE hip_f8(__hip_bfloat16 v, hip_f8_rounding_mode r=hip_f8_rounding_mode::standard, uint32_t rng=0);
// convert to float
#ifdef __gfx942__
......@@ -430,7 +430,7 @@ struct hip_f8 {
#endif // #ifdef __gfx942__
// convert to hip_bfloat16
explicit inline HIP_HOST_DEVICE operator hip_bfloat16() const;
explicit inline HIP_HOST_DEVICE operator __hip_bfloat16() const;
// check for zero
inline HIP_HOST_DEVICE bool is_zero() const {
......@@ -504,7 +504,7 @@ struct hip_f8x4 {
HIP_HOST_DEVICE hip_f8x4(halfx4 v, hip_f8_rounding_mode rm=hip_f8_rounding_mode::standard, uint32_t rng=0);
// constructor from hip_bfloat16
HIP_HOST_DEVICE hip_f8x4(hip_bfloat16 v0, hip_bfloat16 v1=hip_bfloat16(0.0f), hip_bfloat16 v2=hip_bfloat16(0.0f), hip_bfloat16 v3=hip_bfloat16(0.0f), hip_f8_rounding_mode rm=hip_f8_rounding_mode::standard, uint32_t rng=0);
HIP_HOST_DEVICE hip_f8x4(__hip_bfloat16 v0, __hip_bfloat16 v1=__hip_bfloat16(0.0f), __hip_bfloat16 v2=__hip_bfloat16(0.0f), __hip_bfloat16 v3=__hip_bfloat16(0.0f), hip_f8_rounding_mode rm=hip_f8_rounding_mode::standard, uint32_t rng=0);
HIP_HOST_DEVICE hip_f8x4(hip_bfloat16x2 v, hip_f8_rounding_mode rm=hip_f8_rounding_mode::standard, uint32_t rng=0);
HIP_HOST_DEVICE hip_f8x4(hip_bfloat16x4 v, hip_f8_rounding_mode rm=hip_f8_rounding_mode::standard, uint32_t rng=0);
......
......@@ -12,8 +12,13 @@
#include <numeric>
#include "common/common.h"
#ifdef USE_ROCM
#include "common/util/hip_driver.h"
#include "common/util/hip_runtime.h"
#else
#include "common/util/cuda_driver.h"
#include "common/util/cuda_runtime.h"
#endif
#include "common/util/logging.h"
#include "common/util/system.h"
#include "userbuffers/userbuffers.h"
......
......@@ -19,9 +19,15 @@
#include <map>
#include <utility>
#ifdef USE_ROCM
#include "common/util/hip_driver.h"
#include "common/util/hip_nvml.h"
#include "common/util/hip_runtime.h"
#else
#include "common/util/cuda_driver.h"
#include "common/util/cuda_nvml.h"
#include "common/util/cuda_runtime.h"
#endif
#include "common/util/logging.h"
#include "common/util/system.h"
#include "ipcsocket.h"
......@@ -362,7 +368,7 @@ int create_communicator_grouped2(communicator **comm, int myrank, int numranks,
NVTE_CHECK_CUDA(cudaMemset((*comm)->flags, 0, 2 * GPU_PAGE_SIZE));
(*comm)->flags =
#ifdef USE_ROCM
reinterpret_cast<int *>((reinterpret_cast<uintptr_t>((*comm)->flags) + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK
reinterpret_cast<int *>((reinterpret_cast<uintptr_t>((*comm)->flags) + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK);
#else
reinterpret_cast<int *>(((CUdeviceptr)(*comm)->flags + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK);
#endif
......
......@@ -10,7 +10,11 @@
#include "./common.h"
#include "./utils.cuh"
#ifdef __HIP_PLATFORM_AMD__
#include "common/util/hip_runtime.h"
#else
#include "common/util/cuda_runtime.h"
#endif
#include "common/util/logging.h"
namespace transformer_engine {
......
......@@ -25,7 +25,11 @@
#include <vector>
#include "./nvtx.h"
#ifdef __HIP_PLATFORM_AMD__
#include "./util/hip_driver.h"
#else
#include "./util/cuda_driver.h"
#endif
#include "./util/logging.h"
namespace transformer_engine {
......@@ -223,7 +227,7 @@ using bf16 = nv_bfloat16;
using fp8e4m3 = __nv_fp8_e4m3;
using fp8e5m2 = __nv_fp8_e5m2;
#else
using bf16 = hip_bfloat16;
using bf16 = __hip_bfloat16;
using fp8e4m3 = te_hip_fp8_e4m3;
using fp8e5m2 = te_hip_fp8_e5m2;
#endif
......@@ -247,7 +251,7 @@ TRANSFORMER_ENGINE_TYPE_NAME(int64_t)
TRANSFORMER_ENGINE_TYPE_NAME(float)
TRANSFORMER_ENGINE_TYPE_NAME(half)
#ifdef __HIP_PLATFORM_AMD__
TRANSFORMER_ENGINE_TYPE_NAME(hip_bfloat16)
TRANSFORMER_ENGINE_TYPE_NAME(__hip_bfloat16)
TRANSFORMER_ENGINE_TYPE_NAME(te_hip_fp8_e4m3)
TRANSFORMER_ENGINE_TYPE_NAME(te_hip_fp8_e5m2)
#else
......
......@@ -22,7 +22,11 @@
#include "../common.h"
#include "../util/handle_manager.h"
#include "../util/logging.h"
#ifdef __HIP_PLATFORM_AMD__
#include "common/util/hip_runtime.h"
#else
#include "common/util/cuda_runtime.h"
#endif
#ifndef __HIP_PLATFORM_AMD__
namespace {
......@@ -738,7 +742,7 @@ void nvte_multi_stream_cublas_gemm(const NVTETensor *A, const NVTETensor *B, NVT
if(NVTE_BLAS_MULSTREAM==nullptr){
NVTE_FORCE_BLASLT_MULSTREAM = true;
} elif((NVTE_BLASLT_BLAS != nullptr && NVTE_BLASLT_BLAS[0] == '1') && (NVTE_BLAS_MULSTREAM != nullptr && NVTE_BLAS_MULSTREAM[0] == '1')){
} else if((NVTE_BLASLT_BLAS != nullptr && NVTE_BLASLT_BLAS[0] == '1') && (NVTE_BLAS_MULSTREAM != nullptr && NVTE_BLAS_MULSTREAM[0] == '1')){
NVTE_ERROR("NVTE_FORCE_BLAS_MULSTREAM and NVTE_FORCE_BLASLT can't be set at the same time.");
} else{
NVTE_FORCE_BLASLT_MULSTREAM = false;
......@@ -776,8 +780,7 @@ void nvte_multi_stream_cublas_batchgemm(const NVTETensor *A, const NVTETensor *B
cudaStream_t stream) {
NVTE_API_CALL(nvte_multi_stream_cublas_batchgemm);
using namespace transformer_engine;
static_assert(num_gemms % num_batchgemm_streams == 0,
"Need num_gemms mod num_batchgemm_streams == 0.");
assert(num_gemms % num_batchgemm_streams == 0);
static int batch_count = num_gemms / num_batchgemm_streams;
// Inits streams and events (once, globally)
std::call_once(init_flag_batchgemm, init_streams_and_events_batchgemm);
......
......@@ -192,15 +192,15 @@ CudnnNormalizationPlan::CudnnNormalizationPlan(NVTE_Norm_Type NormType, NVTE_Nor
const size_t sm_count,
const bool zero_centered_gamma,
const NVTEScalingMode mode, bool training)
#ifdef USE_ROCM
{ assert(false);
#else
: _fp8_out(is_fp8_dtype(otype)),
_zero_centered(zero_centered_gamma),
_training(training),
_norm_stage(NormStage),
_norm_type(NormType) {
#ifdef USE_ROCM
static_assert(false,
"Cudnn backend is not surpported in rocm for normalization yet.");
#else
static_assert(CUDNN_FRONTEND_VERSION >= 10601,
"CUDNN_FRONTEND_VERSION should be at least 1.6.1!");
......@@ -389,8 +389,7 @@ CudnnNormalizationPlan::CudnnNormalizationPlan(NVTE_Norm_Type NormType, NVTE_Nor
void CudnnNormalizationPlan::_build() {
#ifdef USE_ROCM
static_assert(false,
"Cudnn backend is not surpported in rocm for normalization yet.");
assert(false);
#else
NVTE_CHECK(_graph.validate().is_good());
NVTE_CHECK(_graph.build_operation_graph(_handle).is_good());
......@@ -406,8 +405,8 @@ void CudnnNormalizationPlan::_build() {
std::vector<size_t> CudnnNormalizationPlan::getWorkspaceShape() const {
#ifdef USE_ROCM
static_assert(false,
"Cudnn backend is not surpported in rocm for normalization yet.");
assert(false);
return {0};
#else
return {static_cast<size_t>(_graph.get_workspace_size())};
#endif
......@@ -417,8 +416,7 @@ void CudnnNormalizationPlan::execute(Tensor* z, void* x_dptr, void* gamma_dptr,
void* mean_dptr, void* eps_dptr, void* rsigma_dptr,
void* workspace_dptr, cudaStream_t stream) {
#ifdef USE_ROCM
static_assert(false,
"Cudnn backend is not surpported in rocm for normalization yet.");
assert(false);
#else
// Binding data pointers to graph tensors
_variant_pack = {{_x, x_dptr}, {_eps, eps_dptr}};
......@@ -462,8 +460,7 @@ void CudnnNormalizationPlan::execute(void* x_dptr, void* gamma_dptr, void* mean_
void* dbeta_dptr, void* dgamma_dptr, void* workspace_dptr,
cudaStream_t stream) {
#ifdef USE_ROCM
static_assert(false,
"Cudnn backend is not surpported in rocm for normalization yet.");
assert(false);
#else
// Binding data pointers to graph tensors
_variant_pack = {
......@@ -519,7 +516,8 @@ NormalizationPlanBase* NormalizationPlanRegistry::getNormalizationPlan(
bool& _cudnn_norm_fwd_flag() {
#ifdef USE_ROCM
return false;
static bool flag = false;
return flag;
#else
static bool flag = transformer_engine::getenv<bool>("NVTE_NORM_FWD_USE_CUDNN");
return flag;
......@@ -528,7 +526,8 @@ bool& _cudnn_norm_fwd_flag() {
bool& _cudnn_norm_bwd_flag() {
#ifdef USE_ROCM
return false;
static bool flag = false;
return flag;
#else
static bool flag = transformer_engine::getenv<bool>("NVTE_NORM_BWD_USE_CUDNN");
return flag;
......@@ -544,7 +543,8 @@ bool use_cudnn_norm_bwd() { return _cudnn_norm_bwd_flag(); }
void nvte_enable_cudnn_norm_fwd(bool enable) {
NVTE_API_CALL(nvte_enable_cudnn_norm_fwd);
#ifdef USE_ROCM
transformer_engine::normalization::_cudnn_norm_bwd_flag() = false;
bool flag = false;
transformer_engine::normalization::_cudnn_norm_bwd_flag() = flag;
#else
transformer_engine::normalization::_cudnn_norm_fwd_flag() = enable;
#endif
......@@ -553,7 +553,8 @@ void nvte_enable_cudnn_norm_fwd(bool enable) {
void nvte_enable_cudnn_norm_bwd(bool enable) {
NVTE_API_CALL(nvte_enable_cudnn_norm_bwd);
#ifdef USE_ROCM
transformer_engine::normalization::_cudnn_norm_bwd_flag() = false;
bool flag = false;
transformer_engine::normalization::_cudnn_norm_bwd_flag() = flag;
#else
transformer_engine::normalization::_cudnn_norm_bwd_flag() = enable;
#endif
......
......@@ -30,7 +30,9 @@ namespace transformer_engine {
namespace normalization {
#ifndef __HIP_PLATFORM_AMD__
namespace fe = cudnn_frontend;
#endif
template <typename KernelParamsType>
struct LaunchParams {
......@@ -277,14 +279,14 @@ class CudnnNormalizationPlan : public NormalizationPlanBase {
private:
void _build() override;
#ifndef __HIP_PLATFORM_AMD__
const bool _zero_centered, _fp8_out;
int _ndim_scale_block;
const NVTE_Norm_Stage _norm_stage;
const NVTE_Norm_Type _norm_type;
std::unique_ptr<char[]> _scalar_dptr;
std::unique_ptr<float> _one_dptr = std::make_unique<float>(1.0f);
#ifndef __HIP_PLATFORM_AMD__
// FWD
std::shared_ptr<fe::graph::Tensor_attributes> _x, _gamma_zero, _scalar_offset, _gamma, _beta,
_eps, _mean, _rsigma, _z, _z_scale, _one_for_div, _z_scale_inv, _amax, _z_fp8;
......
......@@ -43,6 +43,11 @@ void launch_tuned_(LaunchParams<BackwardKernelParams> &launch_params,
NVTE_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES));
}
#else
if (Kernel_traits::SMEM_BYTES >= 48 * 1024) {
NVTE_CHECK_CUDA(cudaFuncSetAttribute((const void *)kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES));
}
#endif
auto stream = launch_params.stream;
......
......@@ -39,6 +39,11 @@ void launch_tuned_(LaunchParams<ForwardKernelParams> &launch_params,
NVTE_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES_FWD));
}
#else
if (Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024) {
NVTE_CHECK_CUDA(cudaFuncSetAttribute((const void *)kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES_FWD));
}
#endif
auto stream = launch_params.stream;
......
......@@ -42,6 +42,11 @@ void launch_tuned_(LaunchParams<BackwardKernelParams> &launch_params,
NVTE_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES));
}
#else
if (Kernel_traits::SMEM_BYTES >= 48 * 1024) {
NVTE_CHECK_CUDA(cudaFuncSetAttribute((const void *)kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES));
}
#endif
auto stream = launch_params.stream;
......
......@@ -40,6 +40,11 @@ void launch_tuned_(LaunchParams<ForwardKernelParams> &launch_params,
NVTE_CHECK_CUDA(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES_FWD));
}
#else
if (Kernel_traits::SMEM_BYTES_FWD >= 48 * 1024) {
NVTE_CHECK_CUDA(cudaFuncSetAttribute((const void *)kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
Kernel_traits::SMEM_BYTES_FWD));
}
#endif
auto stream = launch_params.stream;
......
......@@ -11,6 +11,7 @@
#ifdef __HIP_PLATFORM_AMD__
using __nv_fp8_e4m3 = hip_f8<hip_f8_type::fp8>;
using __nv_fp8_e5m2 = hip_f8<hip_f8_type::bf8>;
#define __ldlu(x) __ldg(x)
#endif
static __global__ void moe_permute_row_map(const int *sorted_row_id, int *row_id_map,
......@@ -214,7 +215,11 @@ __global__ void moe_permute_kernel(const T *input_bwd, const T *input_fwd, T *ac
if (k == topK) break;
// Warp-level reduction
for (int mask = 16; mask > 0; mask /= 2) {
#ifdef __HIP_PLATFORM_AMD__
accum[k] = accum[k] + __shfl_xor(accum[k], mask, 32);
#else
accum[k] = accum[k] + __shfl_xor_sync(0xffffffff, accum[k], mask, 32);
#endif
}
}
......
......@@ -15,6 +15,10 @@
#include "../util/vectorized_pointwise.h"
#include "recipe_common.cuh"
#ifdef __HIP_PLATFORM_AMD__
using __nv_bfloat16 = __hip_bfloat16;
#endif
namespace transformer_engine {
namespace {
......
......@@ -11,7 +11,11 @@
#include <string>
#include "../common.h"
#ifdef __HIP_PLATFORM_AMD__
#include "../util/hip_runtime.h"
#else
#include "../util/cuda_runtime.h"
#endif
#include "../util/logging.h"
namespace transformer_engine {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment