Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

d2b52805 · zhuwenwen · 9a521c23 · 5438967f · d2b52805 · d2b52805
Commit d2b52805 authored Sep 07, 2025 by zhuwenwen
20 changed files
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -45,8 +45,6 @@ void moe_permute(
  auto copy_topk_ids = topk_ids.clone();  // copy topk_ids for preprocess
  auto permuted_experts_id = torch::empty_like(topk_ids);
  auto sorted_row_idx = torch::empty_like(inv_permuted_idx);
-  auto align_expert_first_token_offset =
-      torch::zeros_like(expert_first_token_offset);
  CubKeyValueSorter sorter{};
  int64_t* valid_num_ptr = nullptr;
@@ -85,12 +83,14 @@ void moe_permute(
  });
  // get m_indices and update expert_first_token_offset with align block
-  getMIndices(get_ptr<int64_t>(expert_first_token_offset),
-              get_ptr<int64_t>(align_expert_first_token_offset),
-              get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
-              stream);
+  // this is only required for DeepGemm and not required for CUTLASS group gemm
  if (align_block_size.has_value()) {
-    // update align_expert_first_token_offset
+    auto align_expert_first_token_offset =
+        torch::zeros_like(expert_first_token_offset);
+    getMIndices(get_ptr<int64_t>(expert_first_token_offset),
+                get_ptr<int64_t>(align_expert_first_token_offset),
+                get_ptr<int>(m_indices), n_local_expert, align_block_size_value,
+                stream);
    expert_first_token_offset.copy_(align_expert_first_token_offset);
  }
 }
@@ -195,19 +195,14 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                 torch::Tensor& expert_first_token_offset,
                 torch::Tensor& src_row_id2dst_row_id_map,
                 torch::Tensor& m_indices) {
-  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
+  TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
 }
-void moe_unpermute(const torch::Tensor& input,
-                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-                   const torch::Tensor& token_expert_indices,
-                   const std::optional<torch::Tensor>& expert_map,
-                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
-                   const std::optional<int64_t>& align_block_size,
-                   torch::Tensor& permuted_input,
-                   torch::Tensor& expert_first_token_offset,
-                   torch::Tensor& src_row_id2dst_row_id_map,
-                   torch::Tensor& m_indices) {
+void moe_unpermute(
+    const torch::Tensor& permuted_hidden_states,
+    const torch::Tensor& topk_weights, const torch::Tensor& inv_permuted_idx,
+    const std::optional<torch::Tensor>& expert_first_token_offset, int64_t topk,
+    torch::Tensor& hidden_states) {
  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
 }
@@ -224,4 +219,4 @@ bool moe_permute_unpermute_supported() {
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("moe_permute", &moe_permute);
  m.impl("moe_unpermute", &moe_unpermute);
 }
\ No newline at end of file
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
--- a/csrc/ops.h
+++ b/csrc/ops.h
--- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
+++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu
--- a/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh
+++ b/csrc/quantization/cutlass_w8a8/moe/get_group_starts.cuh
--- a/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
+++ b/csrc/quantization/cutlass_w8a8/moe/moe_data.cu
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
--- a/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
+++ b/csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
--- a/csrc/quantization/marlin/dense/LICENSE
+++ b/csrc/quantization/marlin/dense/LICENSE
--- a/csrc/quantization/marlin/dense/common/base.h
+++ b/csrc/quantization/marlin/dense/common/base.h
--- a/csrc/quantization/marlin/dense/common/mem.h
+++ b/csrc/quantization/marlin/dense/common/mem.h
--- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
--- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
+++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu
--- a/csrc/quantization/vectorization_utils.cuh
+++ b/csrc/quantization/vectorization_utils.cuh
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x