Revert feat:optimize act_and_mul_kernel

880b2e41 · zhuwenwen · efb2f75f · 880b2e41 · 880b2e41
Commit 880b2e41 authored Aug 10, 2024 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 13 additions and 89 deletions

csrc/activation_kernels.cu csrc/activation_kernels.cu +13 -89

vllm/benchmark_throughput.py vllm/benchmark_throughput.py +0 -0

No files found.
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
 #include <ATen/cuda/CUDAContext.h>
 #include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
-#include <ATen/native/cuda/MemoryAccess.cuh>
 #include <cmath>
@@ -24,64 +23,6 @@ __global__ void act_and_mul_kernel(
  }
 }
-template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), int VEC>
-__global__ void act_and_mul_kernel_vectorize1(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., 2, d]
-    const int d) {
-  using VecType = at::native::memory::aligned_vector<scalar_t, VEC>;
-  const int token_idx = blockIdx.x;
-  int idx = threadIdx.x * VEC;
-  if (idx < d) {
-    const int x_index = token_idx * 2 * d + idx;
-    const int y_index = token_idx * d + idx;
-    VecType* x1 = (VecType*)(input + x_index);
-    VecType* x2 = (VecType*)(input + x_index + d);
-    VecType* y = (VecType*)(out + y_index);
-    scalar_t r_x1[VEC];
-    scalar_t r_x2[VEC];
-    scalar_t r_y[VEC];
-    *(VecType*)r_x1 = *x1;
-    *(VecType*)r_x2 = *x2;
-#pragma unroll
-    for (int i = 0; i < VEC; i++) {
-      const scalar_t t_x1 = VLLM_LDG(&r_x1[i]);
-      const scalar_t t_x2 = VLLM_LDG(&r_x2[i]);
-      r_y[i] = ACT_FN(t_x1) * t_x2;
-    }
-    *y = *(VecType*)r_y;
-  }
-}
-template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), int VEC>
-__global__ void act_and_mul_kernel_vectorize2(
-    scalar_t* __restrict__ out,          // [..., d]
-    const scalar_t* __restrict__ input,  // [..., 2, d]
-    const int d) {
-  using VecType = at::native::memory::aligned_vector<scalar_t, VEC>;
-  const int token_idx = blockIdx.x;
-  int idx = threadIdx.x * VEC;
-  for (; idx < d; idx += blockDim.x * VEC) {
-    const int x_index = token_idx * 2 * d + idx;
-    const int y_index = token_idx * d + idx;
-    VecType* x1 = (VecType*)(input + x_index);
-    VecType* x2 = (VecType*)(input + x_index + d);
-    VecType* y = (VecType*)(out + y_index);
-    scalar_t r_x1[VEC];
-    scalar_t r_x2[VEC];
-    scalar_t r_y[VEC];
-    *(VecType*)r_x1 = *x1;
-    *(VecType*)r_x2 = *x2;
-#pragma unroll
-    for (int i = 0; i < VEC; i++) {
-      const scalar_t t_x1 = VLLM_LDG(&r_x1[i]);
-      const scalar_t t_x2 = VLLM_LDG(&r_x2[i]);
-      r_y[i] = ACT_FN(t_x1) * t_x2;
-    }
-    *y = *(VecType*)r_y;
-  }
-}
 template <typename T>
 __device__ __forceinline__ T silu_kernel(const T& x) {
  // x * sigmoid(x)
@@ -113,6 +54,7 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
 }  // namespace vllm
+// Launch activation and gating kernel.
 #define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                            \
  int d = input.size(-1) / 2;                                            \
  int64_t num_tokens = input.numel() / input.size(-1);                   \
@@ -122,27 +64,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();          \
  VLLM_DISPATCH_FLOATING_TYPES(                                          \
      input.scalar_type(), "act_and_mul_kernel", [&] {                   \
-        if (d <= 512) {                                                     \
-          vllm::act_and_mul_kernel_vectorize1<scalar_t, KERNEL<scalar_t>, 2> \
-              <<<grid, 256, 0, stream>>>(out.data_ptr<scalar_t>(),          \
-                                         input.data_ptr<scalar_t>(), d);    \
-        } else if (d <= 1024) {                                             \
-          vllm::act_and_mul_kernel_vectorize1<scalar_t, KERNEL<scalar_t>, 8> \
-              <<<grid, 128, 0, stream>>>(out.data_ptr<scalar_t>(),          \
-                                         input.data_ptr<scalar_t>(), d);    \
-        } else if (d <= 2048) {                                             \
-          vllm::act_and_mul_kernel_vectorize1<scalar_t, KERNEL<scalar_t>, 8> \
-              <<<grid, 256, 0, stream>>>(out.data_ptr<scalar_t>(),          \
-                                         input.data_ptr<scalar_t>(), d);    \
-        } else if (d <= 4096) {                                             \
-          vllm::act_and_mul_kernel_vectorize1<scalar_t, KERNEL<scalar_t>, 8> \
-              <<<grid, 512, 0, stream>>>(out.data_ptr<scalar_t>(),          \
-                                         input.data_ptr<scalar_t>(), d);    \
-        } else {                                                            \
-          vllm::act_and_mul_kernel_vectorize2<scalar_t, KERNEL<scalar_t>, 8> \
-              <<<grid, 1024, 0, stream>>>(out.data_ptr<scalar_t>(),         \
+        vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>>             \
+            <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
                                         input.data_ptr<scalar_t>(), d); \
-        }                                                                   \
      });
 void silu_and_mul(torch::Tensor& out,    // [..., d]

--- a/vllm/benchmarks/benchmark_throughput.py
+++ b/vllm/benchmarks/benchmark_throughput.py