Removed unused marlin cuda code (#17684)

Signed-off-by: mgoin <mgoin64@gmail.com>

Removed unused marlin cuda code (#17684)
Signed-off-by: mgoin <mgoin64@gmail.com>
a17cef70 · Michael Goin · GitHub · 18dd5e01
Unverified commit a17cef70, authored May 06, 2025 by Michael Goin; committed by GitHub on May 06, 2025.
9 changed files
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.cu
-#include "marlin_moe_kernel_ku4.h"
-
-namespace marlin_moe {
-
-// We return bool so we can create these different kernel calls as a sequence
-// of if-elseif's.
-bool call_marlin_moe_kernel_ku4(
-    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
-    bool has_act_order, int group_blocks, int num_threads, int blocks,
-    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
-    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
-    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
-    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
-    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
-    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
-    int m_block, int max_par, int cfg_max_m_blocks) {
-  bool has_zp = true;
-
-  if (false) {
-  }
-  AWQ_CALL_IF_MOE(vllm::kU4, 16, 4, 256)
-  AWQ_CALL_IF_MOE(vllm::kU4, 8, 8, 256)
-  AWQ_CALL_IF_MOE(vllm::kU4, 8, 4, 128)
-  AWQ_CALL_IF_MOE(vllm::kU4, 4, 8, 128)
-  else {
-    return false;
-  }
-  return true;
-}
-
-}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4.h
-#pragma once
-
-#include "marlin_moe_kernel.h"
-
-namespace marlin_moe {
-
-// We return bool so we can create these different kernel calls as a sequence
-// of if-elseif's.
-bool call_marlin_moe_kernel_ku4(
-    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
-    bool has_act_order, int group_blocks, int num_threads, int blocks,
-    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
-    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
-    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
-    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
-    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
-    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
-    int m_block, int max_par, int cfg_max_m_blocks);
-
-}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.cu
-#include "marlin_moe_kernel_ku4b8.h"
-
-namespace marlin_moe {
-
-// We return bool so we can create these different kernel calls as a sequence
-// of if-elseif's.
-bool call_marlin_moe_kernel_ku4b8(
-    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
-    bool has_act_order, int group_blocks, int num_threads, int blocks,
-    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
-    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
-    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
-    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
-    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
-    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
-    int m_block, int max_par, int cfg_max_m_blocks) {
-  bool has_zp = false;
-
-  if (false) {
-  }
-  GPTQ_CALL_IF_MOE(vllm::kU4B8, 16, 4, 256)
-  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 8, 256)
-  GPTQ_CALL_IF_MOE(vllm::kU4B8, 8, 4, 128)
-  GPTQ_CALL_IF_MOE(vllm::kU4B8, 4, 8, 128)
-  else {
-    return false;
-  }
-  return true;
-}
-
-}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku4b8.h
-#pragma once
-
-#include "marlin_moe_kernel.h"
-
-namespace marlin_moe {
-
-// We return bool so we can create these different kernel calls as a sequence
-// of if-elseif's.
-bool call_marlin_moe_kernel_ku4b8(
-    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
-    bool has_act_order, int group_blocks, int num_threads, int blocks,
-    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
-    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
-    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
-    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
-    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
-    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
-    int m_block, int max_par, int cfg_max_m_blocks);
-
-}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.cu
-#include "marlin_moe_kernel_ku8b128.h"
-
-namespace marlin_moe {
-
-// We return bool so we can create these different kernel calls as a sequence
-// of if-elseif's.
-bool call_marlin_moe_kernel_ku8b128(
-    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
-    bool has_act_order, int group_blocks, int num_threads, int blocks,
-    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
-    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
-    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
-    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
-    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
-    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
-    int m_block, int max_par, int cfg_max_m_blocks) {
-  bool has_zp = false;
-
-  if (false) {
-  }
-  GPTQ_CALL_IF_MOE(vllm::kU8B128, 16, 4, 256)
-  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 8, 256)
-  GPTQ_CALL_IF_MOE(vllm::kU8B128, 8, 4, 128)
-  GPTQ_CALL_IF_MOE(vllm::kU8B128, 4, 8, 128)
-  else {
-    return false;
-  }
-  return true;
-}
-
-}  // namespace marlin_moe
--- a/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
+++ b/csrc/moe/marlin_kernels/marlin_moe_kernel_ku8b128.h
-#pragma once
-
-#include "marlin_moe_kernel.h"
-
-namespace marlin_moe {
-
-bool call_marlin_moe_kernel_ku8b128(
-    vllm::ScalarType const& q_type, int thread_n_blocks, int thread_k_blocks,
-    bool has_act_order, int group_blocks, int num_threads, int blocks,
-    int max_shared_mem, cudaStream_t stream, const int4* A_ptr,
-    const int4* B_ptr, int4* C_ptr, const int* sorted_ids_ptr,
-    const float* topk_weights_ptr, const int4* s_ptr, const int4* zp_ptr,
-    const int* g_idx_ptr, int* expert_offsets_ptr, int num_groups,
-    int expert_idx, int num_experts, int topk, int prob_m, int prob_n,
-    int prob_k, int tot_m, int* locks, bool replicate_input, bool apply_weights,
-    int m_block, int max_par, int cfg_max_m_blocks);
-
-}
--- a/csrc/moe/marlin_moe_ops.cu
+++ b/csrc/moe/marlin_moe_ops.cu
--- a/csrc/quantization/fp8/fp8_marlin.cu
+++ b/csrc/quantization/fp8/fp8_marlin.cu