Commit 9da3621b authored by yuguo
parents 16de530e 86f2e9a9
@@ -39,6 +39,7 @@ from transformer_engine.pytorch import (
     Fp8Padding,
     Fp8Unpadding,
 )
+from transformer_engine.pytorch import torch_version
 from transformer_engine.pytorch.dot_product_attention.inference import InferenceParams
 from transformer_engine.pytorch.distributed import checkpoint as te_checkpoint
 from transformer_engine.pytorch.cpp_extensions import general_gemm, general_grouped_gemm
@@ -61,8 +62,10 @@ torch.cuda.manual_seed(seed)
 _cpu_rng_state = torch.get_rng_state()
 _cuda_rng_state = torch.cuda.get_rng_state()
-torch._dynamo.config.recompile_limit = 16
+if torch_version() >= (2, 7, 0):
+    torch._dynamo.config.recompile_limit = 16
+else:
+    torch._dynamo.config.cache_size_limit = 16
 class ModelConfig:
     def __init__(self, hidden_size, eps, num_attention_heads, embed, num_layers, seq_len):
...
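For context on the hunk above: the version gate exists because `torch._dynamo.config.recompile_limit` is the newer name for what older PyTorch releases expose as `cache_size_limit`, with the diff drawing the line at torch 2.7. A minimal sketch of the same idea using feature detection instead of a version check; `set_recompile_limit` is a hypothetical helper, not part of the repo:

```python
import torch
import torch._dynamo  # makes torch._dynamo.config available

def set_recompile_limit(limit: int) -> None:
    """Hypothetical helper: set dynamo's recompile cap under either name."""
    cfg = torch._dynamo.config
    if hasattr(cfg, "recompile_limit"):  # newer name (torch >= 2.7 per the diff)
        cfg.recompile_limit = limit
    else:                                # older name on torch < 2.7
        cfg.cache_size_limit = limit

set_recompile_limit(16)
```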
@@ -253,7 +253,11 @@ void nvte_permute_launcher(const T *input, T *output, const int *sorted_row_id,
                           num_out_tokens);
     blocks = num_rows;
+#ifdef __HIP_PLATFORM_AMD__
+    threads = std::min(num_cols / kElementsPerAccess, 256);
+#else
     threads = std::min(num_cols / kElementsPerAccess, 1024);
+#endif
     moe_permute_kernel<T, TCompute, 128, false><<<blocks, threads, 0, stream>>>(
         input, nullptr, output, nullptr, nullptr, row_id_map, num_rows, topK, num_cols);
   } else {
@@ -305,7 +309,11 @@ void nvte_unpermute_launcher(const T *input, T *output, int *row_id_map, const f
   static constexpr int kElementsPerAccess = 16 / sizeof(T);
   int blocks = num_rows;
+#ifdef __HIP_PLATFORM_AMD__
+  int threads = std::min(num_cols / kElementsPerAccess, 256);
+#else
   int threads = std::min(num_cols / kElementsPerAccess, 1024);
+#endif
   size_t smem_bytes = topK * sizeof(TCompute);
   if (prob == nullptr) {
...
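Both launcher hunks apply the same per-block thread cap: each thread moves 16 bytes per access (`kElementsPerAccess = 16 / sizeof(T)`), and the thread count is the row width divided by that, capped at 256 on ROCm versus 1024 on CUDA. A sketch of that arithmetic in Python for illustration; the caps come straight from the diff, but the occupancy reasoning behind the lower ROCm cap is an assumption:

```python
def launch_threads(num_cols: int, elem_bytes: int, is_amd: bool) -> int:
    """Mirror of the kernel-launch sizing above (illustrative only)."""
    elems_per_access = 16 // elem_bytes  # kElementsPerAccess: 16-byte vector loads
    cap = 256 if is_amd else 1024        # per-block thread cap from the diff
    return min(num_cols // elems_per_access, cap)

# A 16384-column fp16 row (2 bytes/elem) wants 2048 vectorized threads:
assert launch_threads(16384, 2, is_amd=False) == 1024  # capped on CUDA
assert launch_threads(16384, 2, is_amd=True) == 256    # tighter cap on ROCm
```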
@@ -18,7 +18,8 @@ NVTE_Fused_Attn_Backend get_fused_attn_backend(
     size_t max_seqlen_kv, size_t head_dim_qk, size_t head_dim_v, int64_t window_size_left,
     int64_t window_size_right) {
 #ifdef __HIP_PLATFORM_AMD__
-  assert(false);
+  // assert(false);
+  return NVTE_Fused_Attn_Backend::NVTE_No_Backend;
 #else
   NVTE_Fused_Attn_Backend fused_attention_backend = nvte_get_fused_attn_backend(
       static_cast<NVTEDType>(q_dtype), static_cast<NVTEDType>(kv_dtype), qkv_layout, bias_type,
...
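With the HIP branch now reporting `NVTE_No_Backend` instead of asserting, a caller can degrade gracefully rather than abort. A hypothetical caller-side dispatch pattern, sketched in Python; everything here except the `NVTE_No_Backend` sentinel is illustrative, not the extension's actual API:

```python
NO_BACKEND = "NVTE_No_Backend"  # stands in for NVTE_Fused_Attn_Backend::NVTE_No_Backend

def run_attention(q, k, v, query_backend, fused_fn, unfused_fn):
    """Illustrative dispatch: fall back instead of aborting on ROCm."""
    backend = query_backend()
    if backend == NO_BACKEND:        # what the HIP build reports after this commit
        return unfused_fn(q, k, v)   # unfused fallback path
    return fused_fn(q, k, v, backend)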
@@ -281,7 +281,7 @@ def cross_entropy_forward(
         rank=rank,
         n_cols=V,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=16 if IS_HIP_EXTENSION else 32,
     )
     world_size = 1 if dist_process_group is None else dist.get_world_size(dist_process_group)
@@ -309,7 +309,7 @@ def cross_entropy_forward(
         n_non_ignore=n_rows,
         label_smoothing=label_smoothing,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=16 if IS_HIP_EXTENSION else 32,
     )
     loss = torch.reshape(loss_1d, (B, SQ)) if not reduce_loss else (torch.sum(loss_1d) / n_rows)
@@ -335,7 +335,7 @@ def cross_entropy_backward(_input: torch.Tensor, grad_output: torch.Tensor):
         grad_output,
         V,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=16 if IS_HIP_EXTENSION else 32,
     )
     return _input
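The `num_warps` change in the three Triton launches above most likely keeps the block size constant across vendors: Triton sizes a block as `num_warps * warp_size`, NVIDIA warps are 32 threads, and AMD wavefronts are 64 lanes, so halving `num_warps` on HIP still yields 1024 threads per block. The arithmetic as a minimal sketch (assumed rationale; the 16-vs-32 values themselves are from the diff):

```python
WARP_SIZE = {"cuda": 32, "hip": 64}  # NVIDIA warp vs AMD wavefront lanes

def num_warps_for(platform: str, target_threads: int = 1024) -> int:
    """Pick num_warps so num_warps * warp_size hits the target block size."""
    return target_threads // WARP_SIZE[platform]

assert num_warps_for("cuda") == 32  # 32 warps * 32 lanes = 1024 threads
assert num_warps_for("hip") == 16   # 16 wavefronts * 64 lanes = 1024 threads
```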