Merge branch 'v0.5.0-dtk24.04.1'

7462218e · zhuwenwen · 6ccd3f47 · 1cec5e62 · 7462218e · 7462218e
Commit 7462218e authored Sep 05, 2024 by zhuwenwen
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,11 +4,13 @@ project(vllm_extensions LANGUAGES CXX)

 option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")

+set(CMAKE_BUILD_TYPE "Release")
+
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
-
+add_compile_options(-w)
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@@ -123,7 +125,7 @@ endif()
 override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
-
+  
 #
 # Query torch for additional GPU compilation flags for the given
 # `VLLM_GPU_LANG`.
@@ -150,8 +152,13 @@ set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/attention_kernels.cu"
  "csrc/pos_encoding_kernels.cu"
+  "csrc/pos_encoding_tgi_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
+  "csrc/opt/transpose_kernels.cu"
+  "csrc/opt/activation_kernels_opt.cu"
+  "csrc/attention/attention_kernels_opt.cu"
+  "csrc/opt/layernorm_kernels_opt.cu"
  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"

--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 vLLM是一个快速且易于使用的LLM推理和服务库，使用PageAttention高效管理kv内存，Continuous batching传入请求，支持很多Hugging Face模型，如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。

 ## 暂不支持的官方功能
- **量化推理**：目前支持fp16的推理和gptq推理，awq-int4和mralin的权重量化、kv-cache fp8推理方案暂不支持
+- **量化推理**：目前支持fp16的推理和gptq、awq-int4推理，marlin的权重量化、kv-cache fp8推理方案暂不支持
 - **模块支持**：目前不支持Sliding window attention、 moe kernel和lora模块


@@ -15,12 +15,17 @@ vLLM是一个快速且易于使用的LLM推理和服务库，使用PageAttention
 |    LlamaForCausalLM       |    LLaMA-3        |   Yes    | Yes  |
 |    LlamaForCausalLM       |    Codellama      |   Yes    | Yes  |
 |    QWenLMHeadModel        |    QWen           |   Yes    | Yes  |
+|    Qwen2ForCausalLM       |    QWen1.5        |   Yes    | Yes  |
+|    Qwen2ForCausalLM       |    CodeQwen1.5    |   Yes    | Yes  |
+|    Qwen2ForCausalLM       |    QWen2          |   Yes    | Yes  |
+|    ChatGLMModel           |    chatglm2       |   Yes    | Yes  |
+|    ChatGLMModel           |    chatglm3       |   Yes    | Yes  |
 |    BaiChuanForCausalLM    |    Baichuan-7B    |   Yes    | Yes  |
 |    BaiChuanForCausalLM    |    Baichuan2-7B   |   Yes    | Yes  |
-|    ChatGLMModel           |    chatglm2-6b    |   Yes    | Yes  |
-|    ChatGLMModel           |    chatglm3-6b    |   Yes    | Yes  |
 |    InternLMForCausalLM    |    InternLM       |   Yes    | Yes  |
 |    InternLM2ForCausalLM   |    InternLM2      |   Yes    | Yes  |
+|    LlamaForCausalLM       |    deepseek       |   Yes    | Yes  |
+|    DeepseekV2ForCausalLM  |    DeepSeek-V2    |   Yes    | Yes  |
 |    LlamaForCausalLM       |    Yi             |   Yes    | Yes  |
 |    MixtralForCausalLM     |    Mixtral-8x7B   |   Yes    | Yes  |

@@ -49,16 +54,19 @@ pip install setuptools wheel
 ```shell
 git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的分支进行切换
 ```
-
+安装依赖：
+```shell
+pip install -r requirements-rocm.txt
+```
 - 提供2种源码编译方式（进入vllm目录）：
 ```
 1. 编译whl包并安装
-python setup.py bdist_wheel 
+VLLM_INSTALL_PUNICA_KERNELS=1 python setup.py bdist_wheel 
 cd dist
 pip install vllm*

 2. 源码编译安装
-python3 setup.py install 
+VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install 
 ```

 #### 运行基础环境准备
@@ -68,13 +76,13 @@ python3 setup.py install
 - triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
 - xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
 - flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
-
+- lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim)

 #### 注意事项
 + 若使用 pip install 下载安装过慢，可添加源：-i https://pypi.tuna.tsinghua.edu.cn/simple/

 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.5.0.post1；
+- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.5.0；

 ## Known Issue
 - 无

--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -23,10 +23,16 @@ If you have cool projects related to vLLM or LLM inference, we would love to see
 This will be a great chance for everyone in the community to get together and learn.
 Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)

+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
 ---

 *Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
 - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
 - [2024/01] Added ROCm 6.0 support to vLLM.
@@ -59,7 +65,7 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs
+- Support NVIDIA GPUs and AMD GPUs
 - (Experimental) Prefix caching support
 - (Experimental) Multi-lora support


--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -5,11 +5,13 @@ import random
 import time
 from typing import List, Optional, Tuple

+import numpy as np
 import torch
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

+from vllm.inputs import PromptStrictInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS


@@ -60,6 +62,7 @@ def sample_requests(


 def run_vllm(
+    warmup_requests: List[Tuple[str, int, int]],
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
@@ -119,6 +122,41 @@ def run_vllm(
                max_tokens=output_len,
            ))

+    # Warmup: build prompts/sampling params from warmup_requests and run them before the timed generate() call.
+    warmup_prompts = []
+    warmup_sampling_params = []
+    for prompt, _, output_len in warmup_requests:  # the middle tuple element is not used here
+        warmup_prompts.append(prompt)
+        warmup_sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
+        
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):  # NOTE(review): `args` is not among the run_vllm parameters visible here, so this looks like a module-level global — confirm it is set when run_vllm is called outside the script entry point
+        llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
+    
+    # dummy_prompt_token_ids = np.random.randint(10000,
+    #                                            size=(args.num_prompts,
+    #                                                  args.input_len))
+    # dummy_inputs: List[PromptStrictInputs] = [{
+    #     "prompt_token_ids": batch
+    # } for batch in dummy_prompt_token_ids.tolist()]
+
+    # def run_to_completion():
+    #     llm.generate(dummy_inputs,
+    #                     sampling_params=sampling_params,
+    #                     use_tqdm=False)
+
+    # print("Warming up...")
+    # for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+    #     run_to_completion()
+    
    start = time.perf_counter()
    llm.generate(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
@@ -212,6 +250,10 @@ def main(args: argparse.Namespace):
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
+        warmup_prompt = "hi" * 10  # fixed short prompt; does not depend on args.input_len
+        warmup_requests = [(warmup_prompt, 10, 10)  # same 3-tuple layout as `requests` below
+                    for _ in range(1)]  # NOTE(review): assigned only on this `args.dataset is None` branch — confirm the dataset branch also defines warmup_requests before it is passed to run_vllm
+        
        prompt = "hi" * (args.input_len - 1)
        requests = [(prompt, args.input_len, args.output_len)
                    for _ in range(args.num_prompts)]
@@ -221,7 +263,7 @@ def main(args: argparse.Namespace):

    if args.backend == "vllm":
        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
+            warmup_requests, requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.max_model_len,
            args.enforce_eager, args.kv_cache_dtype,
@@ -295,6 +337,10 @@ if __name__ == "__main__":
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,

--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -117,6 +117,10 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")

+    list(REMOVE_ITEM GPU_FLAGS
+      "-DUSE_ROCM=1"
+    )
+ 
    list(APPEND GPU_FLAGS
      "-DUSE_ROCM"
      # "-DENABLE_FP8"

--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -147,4 +147,4 @@ void gelu_fast(torch::Tensor& out,    // [..., d]
               torch::Tensor& input)  // [..., d]
 {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
-}
+}
\ No newline at end of file
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -992,4 +992,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
\ No newline at end of file
--- a/csrc/attention/attention_kernels_opt.cu
+++ b/csrc/attention/attention_kernels_opt.cu
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@@ -26,17 +26,104 @@

 namespace vllm {

-// Q*K^T operation.
-template <int THREAD_GROUP_SIZE, typename Vec, int N>
+inline __device__ void v_dot2_f32_f16(float& a, const uint32_t &  b,const uint32_t &  c) {  // a = v_dot2_f32_f16(b, c, a); presumably a += dot of the two packed fp16 pairs in b and c — confirm against the target ISA guide
+  asm volatile("v_dot2_f32_f16 %0, %1, %2, %0;": "=v"(a): "v"(b), "v"(c), "0"(a));  // "0"(a) ties the accumulator input to output register %0
+}
+
+inline __device__ void v_pk_fma_f16(uint32_t& a, const uint32_t &  b,const uint32_t &  c){  // a = v_pk_fma_f16(b, c, a); presumably a packed 2 x fp16 fused multiply-add — confirm against the target ISA guide
+   asm volatile("v_pk_fma_f16 %0, %1, %2, %3;": "=v"(a) : "v"(b), "v"(c), "v"(a));  // the old value of a is passed as the third source operand (%3)
+}
+
+inline __device__ void ds_read_b128(uint4& a, uint32_t offset){  // issues ds_read_b128 into a from the address in offset; no s_waitcnt is emitted here
+    asm volatile("ds_read_b128 %0 %1;": "=v" (a): "v" (offset));  // NOTE(review): presumably the caller must wait (e.g. lgkmcnt0) before consuming a — confirm
+}
+
+inline __device__ void ds_read_b128_sync(uint4& a, uint32_t offset){  // same read as ds_read_b128, followed in the same asm block by s_waitcnt lgkmcnt(1)
+    asm volatile("ds_read_b128 %0 %1\ns_waitcnt lgkmcnt(1);": "=v" (a): "v" (offset));  // NOTE(review): lgkmcnt(1) presumably still allows one outstanding operation, i.e. this read itself may be pending — confirm intent
+}
+
+inline __device__ void lgkmcnt0(){  // emits a bare s_waitcnt lgkmcnt(0)
+    asm volatile("s_waitcnt lgkmcnt(0);");
+}
+
+__device__ inline size_t  __nv_cvta_generic_to_shared_impl(const void *__ptr) {  // converts a generic pointer to an address_space(3) pointer and returns it as an integer
+        return (size_t)(void __attribute__((address_space(3))) *)__ptr;  // the cast also drops the const qualifier of __ptr
+}
+
+inline __device__ void v_dot2_f32_f16(float& a,const uint2 &  b,const uint2 &  c) {  // uint2 overload: applies the uint32_t overload to each 32-bit word, accumulating into a
+  v_dot2_f32_f16(a, b.x, c.x);
+  v_dot2_f32_f16(a, b.y, c.y);
+}
+
+inline __device__ void v_dot2_f32_f16(float& a,const uint4 &  b,const uint4 &  c) {  // uint4 overload: applies the uint32_t overload to x, y, z, w in turn, accumulating into a
+  v_dot2_f32_f16(a, b.x, c.x);
+  v_dot2_f32_f16(a, b.y, c.y);
+  v_dot2_f32_f16(a, b.z, c.z);
+  v_dot2_f32_f16(a, b.w, c.w);
+}
+
+inline __device__ float add_half2(uint32_t a){  // returns the sum of the two half values stored in the 32-bit word a, as float
+ union {  // reinterprets the word through a union (type punning)
+    uint32_t u32;
+    half u16[2];
+  } tmp;
+  tmp.u32=a;
+  return static_cast<float>(tmp.u16[0]+tmp.u16[1]);  // the two lanes are added first and the result is then cast to float
+}
+
+inline __device__ void v_pk_fma_f16x8(float& a,const uint4 &  b,const uint4 &  c) {  // a += lane-sum of a packed accumulator built from the four words of b and c
+  uint32_t tmp = mul<uint32_t, uint32_t, uint32_t>(b.x,c.x);  // mul<> is defined outside this hunk; presumably a packed fp16 multiply — confirm
+  v_pk_fma_f16(tmp,b.y,c.y);
+  v_pk_fma_f16(tmp,b.z,c.z);
+  v_pk_fma_f16(tmp,b.w,c.w);
+  a+=add_half2(tmp);  // folds the two packed lanes of tmp into the float accumulator
+}
+
+// Q*K^T operation, fp16 path: this overload is enabled only when scalar_t is uint16_t and accumulates in float via v_dot2_f32_f16.
+template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<std::is_same<scalar_t, uint16_t>::value, int> = 0>
+// template <int THREAD_GROUP_SIZE, typename Vec, int N>
 inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
+  
+  float qk =0;  // this thread's partial dot product
+  // uint32_t offset = __nv_cvta_generic_to_shared_impl(q);
+  // const uint4 *k_ptr= reinterpret_cast<const uint4 *>(k);
+  // // Compute the parallel products for Q*K^T (treat vector lanes separately).
+
+  // constexpr int loop=N*sizeof(Vec)/16/2;
+  // uint4 qt[2];
+  // #pragma unroll
+  // for (int ii = 0; ii < loop; ++ii) {
+  //   ds_read_b128(qt[0],offset+16*ii*2);
+  //   ds_read_b128_sync(qt[1],offset+16*(ii*2+1));
+  //   v_dot2_f32_f16(qk,qt[0],k_ptr[ii*2]);
+  //   // v_pk_fma_f16x8(qk,qt[0],k_ptr[ii*2]);
+  //   lgkmcnt0();
+  //   v_dot2_f32_f16(qk,qt[1],k_ptr[ii*2+1]);
+  //   // v_pk_fma_f16x8(qk,qt[1],k_ptr[ii*2+1]);
+  // }
+  #pragma unroll
+  for (int ii = 0; ii < N; ++ii) {
+    v_dot2_f32_f16(qk,q[ii],k[ii]);  // resolves to the v_dot2_f32_f16 overload matching Vec; accumulates into qk
+  }
+  // Finalize the reduction across lanes.
+#pragma unroll
+  for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
+    qk += VLLM_SHFL_XOR_SYNC(qk, mask);  // XOR-shuffle exchange with halving masks sums the partials across the group
+  }
+  return qk;
+}
+
+// Q*K^T operation. //bf16
+template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<!std::is_same<scalar_t, uint16_t>::value, int> = 0>
+// template <int THREAD_GROUP_SIZE, typename Vec, int N>
+inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
+
  using A_vec = typename FloatVec<Vec>::Type;
-  // Compute the parallel products for Q*K^T (treat vector lanes separately).
  A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
-#pragma unroll
+  #pragma unroll
  for (int ii = 1; ii < N; ++ii) {
    qk_vec = fma(q[ii], k[ii], qk_vec);
  }
-
  // Finalize the reduction across lanes.
  float qk = sum(qk_vec);
 #pragma unroll
@@ -46,12 +133,17 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
  return qk;
 }

+
 template <typename T, int THREAD_GROUP_SIZE>
 struct Qk_dot {
   template <typename Vec, int N>
   static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
-    return qk_dot_<THREAD_GROUP_SIZE>(q, k);
+    return qk_dot_<THREAD_GROUP_SIZE,Vec,N,T>(q, k);  // T is forwarded as scalar_t so enable_if picks the uint16_t (fp16) overload or the generic one
   }
+  // template <typename Vec, int N>
+  // static inline __device__ float qk_dot_vpack(const Vec (&q)[N], const Vec (&k)[N]) {
+  //   return qk_dot_vpack_<THREAD_GROUP_SIZE>(q, k);
+  // }
 };

 }  // namespace vllm
--- a/csrc/attention/static_switch.h
+++ b/csrc/attention/static_switch.h
+// Turns the runtime condition COND into a compile-time constant: the callable
+// passed as __VA_ARGS__ is invoked inside an immediately-invoked lambda in which
+// CONST_NAME is a `constexpr static bool` (true when COND holds, false otherwise).
+// The whole expression yields whatever the callable returns.
+#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static bool CONST_NAME = true;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static bool CONST_NAME = false; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+// Same dispatch shape as a boolean switch, but exposes a fixed name: inside the
+// callable passed as __VA_ARGS__, `opt` is a `constexpr static int` equal to 1
+// when COND holds and 2 otherwise.
+#define OPT_SWITCH(COND, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static int opt = 1;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static int opt = 2; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+// Dispatches a runtime thread count to a compile-time constant. Inside the
+// callable passed as __VA_ARGS__, `NUM_THREADS` is a `constexpr static int`:
+// 256 when NUM_THREAD == 256, and 128 for every other value.
+// NUM_THREAD is parenthesised so that an expression argument (for example
+// `c ? 128 : 256`) is compared as a whole rather than re-associated by
+// operator precedence.
+#define NUM_THREADS_SWITCH(NUM_THREAD, ...)    \
+  [&] {                                         \
+    if ((NUM_THREAD) == 256) {                 \
+      constexpr static int NUM_THREADS = 256;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static int NUM_THREADS = 128;  \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+// Dispatches a runtime head dimension to a compile-time constant. Inside the
+// callable passed as __VA_ARGS__, `HEAD_SIZE` is a `constexpr static int` equal
+// to HEADDIM for the supported values 64, 80, 96, 112, 128 and 256; any other
+// value fails with TORCH_CHECK. HEADDIM is evaluated once per comparison, so it
+// should be free of side effects; it is parenthesised so expression arguments
+// compare as a whole.
+#define HEADSIZE_SWITCH(HEADDIM, ...)     \
+  [&] {                                    \
+    if ((HEADDIM) == 64) {                 \
+      constexpr static int HEAD_SIZE = 64;  \
+      return __VA_ARGS__();                \
+    } else if ((HEADDIM) == 80) {          \
+      constexpr static int HEAD_SIZE = 80;  \
+      return __VA_ARGS__();                \
+    } else if ((HEADDIM) == 96) {          \
+      constexpr static int HEAD_SIZE = 96;  \
+      return __VA_ARGS__();                \
+    } else if ((HEADDIM) == 112) {         \
+      constexpr static int HEAD_SIZE = 112; \
+      return __VA_ARGS__();                \
+    } else if ((HEADDIM) == 128) {         \
+      constexpr static int HEAD_SIZE = 128; \
+      return __VA_ARGS__();                \
+    } else if ((HEADDIM) == 256) {         \
+      constexpr static int HEAD_SIZE = 256; \
+      return __VA_ARGS__();                \
+    }                                      \
+    else {                                 \
+      TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
+    }                                      \
+  }()
+
+// Selects the compile-time constant `REUSE_KV_TIMES` (4, 2 or 1) that is visible
+// inside the callable passed as __VA_ARGS__.
+// NOTE: besides its num_blocks argument, this macro reads `num_heads` and
+// `num_kv_heads` from the scope where it is expanded; both must be declared there.
+//   4: num_heads is even, num_heads / num_kv_heads >= 4 (integer division) and num_blocks >= 1200
+//   2: num_heads / num_kv_heads >= 2 and num_blocks >= 1200
+//   1: otherwise
+// num_blocks is parenthesised so expression arguments compare as a whole.
+#define REUSEKV_SWITCH(num_blocks , ...)      \
+[&] {                                                   \
+    if (num_heads % 2 == 0 && num_heads / num_kv_heads >= 4 && (num_blocks) >= 1200){      \
+        constexpr static int REUSE_KV_TIMES = 4;        \
+        return __VA_ARGS__();                           \
+    } else if (num_heads / num_kv_heads >= 2 && (num_blocks) >= 1200){\
+        constexpr static int REUSE_KV_TIMES = 2;        \
+        return __VA_ARGS__();                           \
+    } else {                                            \
+        constexpr static int REUSE_KV_TIMES = 1;        \
+        return __VA_ARGS__();                           \
+    }                                                   \
+}()
+
+// Two-way variant: inside the callable passed as __VA_ARGS__, `REUSE_KV_TIMES`
+// is 2 when num_heads > num_kv_heads and num_blocks >= 1200, otherwise 1.
+// NOTE: `num_heads` and `num_kv_heads` are read from the scope where the macro
+// is expanded. num_blocks is parenthesised so expression arguments compare as a whole.
+#define REUSEKV_SWITCH_V1(num_blocks , ...)      \
+[&] {                                                   \
+    if (num_heads > num_kv_heads && (num_blocks) >= 1200){      \
+        constexpr static int REUSE_KV_TIMES = 2;        \
+        return __VA_ARGS__();                           \
+    }  else {                                           \
+        constexpr static int REUSE_KV_TIMES = 1;        \
+        return __VA_ARGS__();                           \
+    }                                                   \
+}()
+
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -349,4 +349,4 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
  } else {
    LAUNCH_FUSED_ADD_RMS_NORM(0);
  }
-}
+}
\ No newline at end of file
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe_align_block_size_kernels.cu
@@ -9,6 +9,8 @@

 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

+#define MAX_SHARED_MEM_SIZE (64 * 1024)  // dynamic shared-memory budget in bytes; parenthesised so the macro expands safely inside any expression
+
 namespace vllm {

 namespace {
@@ -19,11 +21,12 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
 }
 }  // namespace

-template <typename scalar_t>
+template <typename scalar_t, bool experts_num_exceed_limit>
 __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                                            int32_t* sorted_token_ids,
                                            int32_t* expert_ids,
                                            int32_t* total_tokens_post_pad,
+                                            int32_t* global_tokens_cnts_ptr,
                                            int32_t num_experts,
                                            int32_t block_size, size_t numel) {
  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
@@ -31,11 +34,18 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,

  extern __shared__ int32_t shared_mem[];

-  int32_t* tokens_cnts =
-      shared_mem;  // 2d tensor with shape (num_experts + 1, num_experts)
-  int32_t* cumsum =
-      shared_mem + (num_experts + 1) *
-                       num_experts;  // 1d tensor with shape (num_experts + 1)
+  int32_t* tokens_cnts = nullptr;
+  int32_t* cumsum = nullptr;
+  if (experts_num_exceed_limit) {
+    // 2d tensor with shape (num_experts + 1, num_experts)
+    tokens_cnts = global_tokens_cnts_ptr;
+
+    // 1d tensor with shape (num_experts + 1)
+    cumsum = shared_mem;
+  } else {
+    tokens_cnts = shared_mem;  // 2d tensor with shape (num_experts + 1, num_experts)
+    cumsum = shared_mem + (num_experts + 1) * num_experts;  // 1d tensor with shape (num_experts + 1)
+  }

  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -115,20 +125,40 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-        // tensors
-        const int32_t shared_mem =
-            ((num_experts + 1) * num_experts + (num_experts + 1)) *
-            sizeof(int32_t);
-
-        // set dynamic shared mem
-        auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
-        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+        int32_t shared_mem_normal = ((num_experts + 1) * num_experts + (num_experts + 1)) *
+              sizeof(int32_t);
+
+        const bool experts_num_exceed_limit = shared_mem_normal > MAX_SHARED_MEM_SIZE;
+
+        // calc needed amount of shared mem for `cumsum`
+        const int32_t shared_mem = experts_num_exceed_limit ? (num_experts + 1) * sizeof(int32_t) : shared_mem_normal;
+
+        if (experts_num_exceed_limit) {
+          // set dynamic shared mem
+          auto kernel = vllm::moe_align_block_size_kernel<scalar_t, true>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+            (void*)kernel, shared_mem));
+
+          int32_t tokens_cnts[(num_experts + 1) * num_experts];
+          torch::Tensor key_cache_ptrs_tensor = torch::from_blob(tokens_cnts, {(num_experts + 1) * num_experts}, torch::kInt32)
+              .to(topk_ids.device());
+
+          kernel<<<1, num_experts, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), key_cache_ptrs_tensor.data_ptr<int32_t>(), num_experts,
+              block_size, topk_ids.numel());
+        } else {
+          // set dynamic shared mem
+          auto kernel = vllm::moe_align_block_size_kernel<scalar_t, false>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
            (void*)kernel, shared_mem));
-        kernel<<<1, num_experts, shared_mem, stream>>>(
-            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
-            experts_ids.data_ptr<int32_t>(),
-            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-            topk_ids.numel());
+          kernel<<<1, num_experts, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), nullptr, num_experts, block_size,
+              topk_ids.numel());
+        }
+
      });
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -23,12 +23,39 @@ void paged_attention_v2(
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

+void paged_attention_v1_opt(
+    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step);
+
+void paged_attention_v2_opt(
+    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step);
+
 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
              double epsilon);

 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

+void rms_norm_opt(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
+              double epsilon);
+
+void fused_add_rms_norm_opt(torch::Tensor& input, torch::Tensor& residual,
+                        torch::Tensor& weight, double epsilon);
+
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);
@@ -38,6 +65,13 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                              torch::Tensor& cos_sin_cache, bool is_neox,
                              int64_t rot_dim,
                              torch::Tensor& cos_sin_cache_offsets);
+void rotary_embedding_tgi(
+  torch::Tensor& query,
+  torch::Tensor& key,
+  int64_t head_size,
+  torch::Tensor& cos_cache,
+  torch::Tensor& sin_cache,
+  bool is_neox);

 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

@@ -49,6 +83,14 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input);

 void gelu_fast(torch::Tensor& out, torch::Tensor& input);

+void silu_and_mul_opt(torch::Tensor& out, torch::Tensor& input);
+
+void gelu_and_mul_opt(torch::Tensor& out, torch::Tensor& input);
+
+void gelu_tanh_and_mul_opt(torch::Tensor& out, torch::Tensor& input);
+
+void trans_w16_gemm(torch::Tensor dst, torch::Tensor src, int64_t row, int64_t col);
+
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,

--- a/csrc/opt/activation_kernels_opt.cu
+++ b/csrc/opt/activation_kernels_opt.cu
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+
+#include <cmath>
+
+#include "cuda_compat.h"
+#include "../dispatch_utils.h"
+
+namespace vllm {
+
+// Scalar fallback for the gated activations. For the token selected by
+// blockIdx.x it writes out[i] = ACT_FN(gate[i]) * up[i] for every i in
+// [0, d), where gate and up are the first and second d-element halves of
+// that token's input row.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+__global__ void act_and_mul_kernel(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const int d) {
+  const int64_t row = blockIdx.x;
+  const scalar_t* gate_row = input + row * 2 * d;
+  const scalar_t* up_row = gate_row + d;
+  scalar_t* out_row = out + row * d;
+  // The threads of the block stride across the row together.
+  int64_t col = threadIdx.x;
+  while (col < d) {
+    const scalar_t gate = VLLM_LDG(gate_row + col);
+    const scalar_t up = VLLM_LDG(up_row + col);
+    out_row[col] = ACT_FN(gate) * up;
+    col += blockDim.x;
+  }
+}
+
+// Vectorized gated activation, one pass per thread. The block selected by
+// blockIdx.x handles one token; each thread loads VEC consecutive gate values
+// and the matching VEC up values, then stores the VEC products
+// ACT_FN(gate) * up.
+//
+// There is no loop: a thread whose first element index (threadIdx.x * VEC) is
+// at or beyond d does nothing, and elements at or beyond blockDim.x * VEC are
+// never written, so the launch must satisfy blockDim.x * VEC >= d. Every
+// access is VecType-wide, so d must be a multiple of VEC; the addresses are
+// assumed to be aligned for VecType.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), int VEC>
+__global__ void act_and_mul_kernel_opt1(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const int d) {
+  using VecType = at::native::memory::aligned_vector<scalar_t, VEC>;
+  const int64_t token_idx = blockIdx.x;
+  const int idx = threadIdx.x * VEC;
+  if (idx >= d) {
+    return;
+  }
+  const int64_t x_index = token_idx * 2 * d + idx;
+  const int64_t y_index = token_idx * d + idx;
+  // Stage the data in real VecType objects (correctly aligned by their type)
+  // instead of reinterpreting plain scalar arrays, and keep `input` const.
+  const VecType gate = *reinterpret_cast<const VecType*>(input + x_index);
+  const VecType up = *reinterpret_cast<const VecType*>(input + x_index + d);
+  VecType result;
+#pragma unroll
+  for (int i = 0; i < VEC; i++) {
+    result.val[i] = ACT_FN(gate.val[i]) * up.val[i];
+  }
+  *reinterpret_cast<VecType*>(out + y_index) = result;
+}
+
+// Vectorized gated activation for long rows. The block selected by blockIdx.x
+// handles one token; each thread processes VEC consecutive elements per
+// iteration and advances by blockDim.x * VEC until the row of d elements is
+// covered, storing ACT_FN(gate) * up.
+//
+// Every access is VecType-wide, so d must be a multiple of VEC; the addresses
+// are assumed to be aligned for VecType.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), int VEC>
+__global__ void act_and_mul_kernel_opt2(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const int d) {
+  using VecType = at::native::memory::aligned_vector<scalar_t, VEC>;
+  const int64_t token_idx = blockIdx.x;
+  for (int idx = threadIdx.x * VEC; idx < d; idx += blockDim.x * VEC) {
+    const int64_t x_index = token_idx * 2 * d + idx;
+    const int64_t y_index = token_idx * d + idx;
+    // Stage the data in real VecType objects (correctly aligned by their
+    // type) instead of reinterpreting plain scalar arrays, and keep `input`
+    // const.
+    const VecType gate = *reinterpret_cast<const VecType*>(input + x_index);
+    const VecType up = *reinterpret_cast<const VecType*>(input + x_index + d);
+    VecType result;
+#pragma unroll
+    for (int i = 0; i < VEC; i++) {
+      result.val[i] = ACT_FN(gate.val[i]) * up.val[i];
+    }
+    *reinterpret_cast<VecType*>(out + y_index) = result;
+  }
+}
+
+// SiLU activation, x * sigmoid(x), evaluated in float and converted back to T.
+template <typename T>
+__device__ __forceinline__ T silu_kernel(const T& x) {
+  const float value = static_cast<float>(x);
+  const float denom = 1.0f + expf(-value);
+  return static_cast<T>(value / denom);
+}
+
+// Exact (erf-based) GELU, matching PyTorch GELU with approximate='none':
+//   gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
+// Reference:
+// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
+// The math is done in float and the result converted back to T.
+template <typename T>
+__device__ __forceinline__ T gelu_kernel(const T& x) {
+  constexpr float kInvSqrt2 = M_SQRT1_2;
+  const float v = static_cast<float>(x);
+  const float erf_term = ::erf(v * kInvSqrt2);
+  return static_cast<T>(v * 0.5f * (1.0f + erf_term));
+}
+
+// Tanh-approximated GELU, matching PyTorch GELU with approximate='tanh':
+//   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
+// Reference:
+// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
+// The math is done in float and the result converted back to T.
+template <typename T>
+__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
+  // sqrt(2) * (2 / sqrt(pi)) * 0.5 == sqrt(2 / pi)
+  constexpr float kSqrt2OverPi = M_SQRT2 * M_2_SQRTPI * 0.5f;
+  constexpr float kCubicCoeff = 0.044715;
+  const float v = static_cast<float>(x);
+  const float v_cubed = v * v * v;
+  const float tanh_arg = kSqrt2OverPi * (v + kCubicCoeff * v_cubed);
+  return static_cast<T>(0.5f * v * (1.0f + ::tanhf(tanh_arg)));
+}
+
+}  // namespace vllm
+
+// Launches the gated-activation kernel for activation KERNEL, reading `input`
+// ([..., 2 * d]) and writing `out` ([..., d]). Both names must be in scope at
+// the expansion site, which must be the body of a function returning void.
+//
+// One thread block is launched per token, on the current CUDA stream of
+// input's device. When d is a multiple of 8 and at most 16384 a vectorized
+// kernel is picked by d:
+//   d <= 512  : act_and_mul_kernel_opt1, VEC = 2, 256 threads
+//   d <= 1024 : act_and_mul_kernel_opt1, VEC = 8, 128 threads
+//   d <= 2048 : act_and_mul_kernel_opt1, VEC = 8, 256 threads
+//   d <= 4096 : act_and_mul_kernel_opt1, VEC = 8, 512 threads
+//   otherwise : act_and_mul_kernel_opt2, VEC = 8, 1024 threads
+// so threads * VEC >= d holds for every opt1 launch (opt1 has no loop). Any
+// other d uses the scalar act_and_mul_kernel with min(d, 1024) threads.
+//
+// An empty input returns before anything else: it would otherwise divide by
+// a zero-sized last dimension or request a grid of zero blocks, which is not
+// a valid launch configuration.
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                                  \
+  if (input.numel() == 0) {                                                    \
+    return;                                                                    \
+  }                                                                            \
+  int d = input.size(-1) / 2;                                                  \
+  int64_t num_tokens = input.numel() / input.size(-1);                         \
+  dim3 grid(num_tokens);                                                       \
+  dim3 block(std::min(d, 1024));                                               \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
+  VLLM_DISPATCH_FLOATING_TYPES(                                                \
+      input.scalar_type(), "act_and_mul_kernel", [&] {                         \
+        if (0 == d % 8 && d <= 16384) {                                        \
+          if (d <= 512) {                                                      \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 2>       \
+                <<<grid, 256, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else if (d <= 1024) {                                              \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 8>       \
+                <<<grid, 128, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else if (d <= 2048) {                                              \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 8>       \
+                <<<grid, 256, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else if (d <= 4096) {                                              \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 8>       \
+                <<<grid, 512, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else {                                                             \
+            vllm::act_and_mul_kernel_opt2<scalar_t, KERNEL<scalar_t>, 8>       \
+                <<<grid, 1024, 0, stream>>>(out.data_ptr<scalar_t>(),          \
+                                            input.data_ptr<scalar_t>(), d);    \
+          }                                                                    \
+        } else {                                                               \
+          vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>>                 \
+              <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+        }                                                                      \
+      });
+
+void silu_and_mul_opt(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);  // out = silu(first half of input) * second half of input
+}
+
+void gelu_and_mul_opt(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);  // out = erf-based gelu(first half of input) * second half of input
+}
+
+void gelu_tanh_and_mul_opt(torch::Tensor& out,    // [..., d]
+                       torch::Tensor& input)  // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);  // out = tanh-approximated gelu(first half of input) * second half of input
+}
+
--- a/csrc/opt/layernorm_kernels_opt.cu
+++ b/csrc/opt/layernorm_kernels_opt.cu
--- a/csrc/opt/transpose_kernels.cu
+++ b/csrc/opt/transpose_kernels.cu
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+
+namespace vllm {
+// Transposing copy, one element per thread. The thread with flat index `id`
+// (id < num_kernels) writes dst[id] = src[(id % row) * col + id / row], so
+// dst viewed as a row-major [col, row] matrix receives the transpose of src
+// viewed as a row-major [row, col] matrix.
+template <typename T>
+__global__ void trans_w16_gemm_cudakernel(int64_t num_kernels, T* dst,
+                                          const T* src, int64_t row,
+                                          int64_t col) {
+  const int64_t flat = blockIdx.x * blockDim.x + threadIdx.x;
+  if (flat < num_kernels) {
+    const int64_t dst_row = flat / row;
+    const int64_t dst_col = flat % row;
+    dst[dst_row * row + dst_col] = src[dst_col * col + dst_row];
+  }
+}
+
+
+// Enqueues the transposing copy on the current CUDA stream, using one thread
+// per element of the row x col matrix in blocks of 256 threads.
+void trans_w16_gemm_cuda(half* dst, const half* src, int64_t row, int64_t col) {
+  constexpr int kThreadsPerBlock = 256;
+  const int64_t total = row * col;
+  const int64_t num_blocks = (total + kThreadsPerBlock - 1) / kThreadsPerBlock;
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  trans_w16_gemm_cudakernel<<<num_blocks, kThreadsPerBlock, 0, stream>>>(
+      total, dst, src, row, col);
+}
+}   // namespace vllm
+
+// Transposes a matrix of 16-bit elements on the GPU.
+//
+// `src` is read through its raw data pointer as a row-major [row, col]
+// matrix and `dst` is written through its raw data pointer as the row-major
+// [col, row] transpose. Elements are only copied, never interpreted. The
+// work is enqueued on the current CUDA stream of src's device.
+//
+// Raises (TORCH_CHECK) when row or col is negative, when either tensor is not
+// a CUDA tensor, or when either tensor holds fewer than row * col 16-bit
+// elements. A matrix with no elements is a no-op.
+void trans_w16_gemm(torch::Tensor dst,torch::Tensor src,int64_t row,int64_t col){
+  TORCH_CHECK(row >= 0 && col >= 0,
+              "trans_w16_gemm: row and col must be non-negative, got row=",
+              row, ", col=", col);
+  TORCH_CHECK(src.is_cuda() && dst.is_cuda(),
+              "trans_w16_gemm: src and dst must be CUDA tensors");
+  // The kernel indexes the raw buffers directly, so make sure both really
+  // hold row * col 16-bit elements before handing out the pointers.
+  const int64_t required_bytes = row * col * static_cast<int64_t>(sizeof(half));
+  TORCH_CHECK(static_cast<int64_t>(src.nbytes()) >= required_bytes,
+              "trans_w16_gemm: src is smaller than row * col 16-bit elements");
+  TORCH_CHECK(static_cast<int64_t>(dst.nbytes()) >= required_bytes,
+              "trans_w16_gemm: dst is smaller than row * col 16-bit elements");
+  if (required_bytes == 0) {
+    // Nothing to copy; a grid of zero blocks is not a valid launch.
+    return;
+  }
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(src));
+  vllm::trans_w16_gemm_cuda(
+              (half*)dst.data_ptr(),
+              (const half*)src.data_ptr(),
+              row,
+              col
+            );
+}
\ No newline at end of file
--- a/csrc/pos_encoding_tgi_kernels.cu
+++ b/csrc/pos_encoding_tgi_kernels.cu
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+
+namespace vllm {
+
+// Rotates one (x, y) element pair of a single head in place.
+//   arr        : first element of the head
+//   cos_ptr    : cosines for the current token, read at index rot_offset
+//   sin_ptr    : sines for the current token, read at index rot_offset
+//   rot_offset : which pair of the head to rotate
+//   embed_dim  : distance between the two elements of a pair in NeoX layout
+// With IS_NEOX the pair is (rot_offset, embed_dim + rot_offset) (GPT-NeoX
+// style); otherwise it is (2 * rot_offset, 2 * rot_offset + 1) (GPT-J style).
+template<typename scalar_t, bool IS_NEOX>
+inline __device__ void apply_token_rotary_embedding_tgi(
+  scalar_t* __restrict__ arr,
+  const float* __restrict__ cos_ptr,
+  const float* __restrict__ sin_ptr,
+  int rot_offset,
+  int embed_dim)
+{
+  // Both layouts read the same cache slot; only the paired elements differ.
+  const float cos = VLLM_LDG(cos_ptr + rot_offset);
+  const float sin = VLLM_LDG(sin_ptr + rot_offset);
+  const int x_index = IS_NEOX ? rot_offset : 2 * rot_offset;
+  const int y_index = IS_NEOX ? embed_dim + rot_offset : 2 * rot_offset + 1;
+
+  const scalar_t x = arr[x_index];
+  const scalar_t y = arr[y_index];
+  arr[x_index] = x * cos - y * sin;
+  arr[y_index] = y * cos + x * sin;
+}
+
+// Rotates every query head and every key head of one token in place. The
+// threads of the block share the (head, pair) work items: item w maps to head
+// w / rot_dim and pair w % rot_dim, first over the num_heads query heads and
+// then over the num_kv_heads key heads.
+template<typename scalar_t, bool IS_NEOX>
+inline __device__ void apply_rotary_embedding_tgi(
+  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
+  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
+  const float* __restrict__ cos_ptr,   // cosines of this token, rot_dim values
+  const float* __restrict__ sin_ptr,   // sines of this token, rot_dim values
+  const int head_size,
+  const int num_heads,
+  const int num_kv_heads,
+  const int rot_dim,
+  const int token_idx,
+  const int64_t query_stride,
+  const int64_t key_stride)
+{
+  // Query heads.
+  const int query_items = num_heads * rot_dim;
+  for (int item = threadIdx.x; item < query_items; item += blockDim.x) {
+    const int head = item / rot_dim;
+    const int pair = item % rot_dim;
+    const int64_t head_start = token_idx * query_stride + head * head_size;
+    apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(
+        query + head_start, cos_ptr, sin_ptr, pair, rot_dim);
+  }
+
+  // Key heads.
+  const int key_items = num_kv_heads * rot_dim;
+  for (int item = threadIdx.x; item < key_items; item += blockDim.x) {
+    const int head = item / rot_dim;
+    const int pair = item % rot_dim;
+    const int64_t head_start = token_idx * key_stride + head * head_size;
+    apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(
+        key + head_start, cos_ptr, sin_ptr, pair, rot_dim);
+  }
+}
+
+// Kernel entry point: the block with index blockIdx.x rotates the query and
+// key heads of token blockIdx.x, using the rot_dim cosines and sines stored
+// at offset blockIdx.x * rot_dim of cos_cache / sin_cache.
+template<typename scalar_t, bool IS_NEOX>
+__global__ void rotary_embedding_tgi_kernel(
+  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
+  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
+  const float* __restrict__ cos_cache,   // rot_dim values per token
+  const float* __restrict__ sin_cache,   // rot_dim values per token
+  const int rot_dim,
+  const int64_t query_stride,
+  const int64_t key_stride,
+  const int num_heads,
+  const int num_kv_heads,
+  const int head_size) {
+  const int token_idx = blockIdx.x;
+  const int cache_offset = token_idx * rot_dim;
+
+  apply_rotary_embedding_tgi<scalar_t, IS_NEOX>(
+      query, key, cos_cache + cache_offset, sin_cache + cache_offset,
+      head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride,
+      key_stride);
+}
+
+} // namespace vllm
+
+// Applies rotary position embedding in place to `query` and `key`, using
+// cosine/sine values the caller has already laid out per token.
+//
+// What this function reads from its arguments:
+//   query     : size(0) = num_tokens, size(1) = num_heads, stride(0) = token
+//               stride; heads of a token are head_size elements apart.
+//   key       : size(1) = num_kv_heads, stride(0) = token stride; same head
+//               spacing as query.
+//   cos_cache : accessed as float; size(2) = rot_dim, the number of rotation
+//               pairs per head. Token t uses the rot_dim values starting at
+//               element t * rot_dim.
+//   sin_cache : same layout as cos_cache.
+//   is_neox   : true pairs element i with element rot_dim + i (GPT-NeoX
+//               style); false pairs elements 2i and 2i + 1 (GPT-J style).
+//
+// One thread block per token is launched on the current CUDA stream of
+// query's device. An input with zero tokens is a no-op.
+void rotary_embedding_tgi(
+  torch::Tensor& query,             // [num_tokens, num_heads, head_size]
+  torch::Tensor& key,               // [num_tokens, num_kv_heads, head_size]
+  int64_t head_size,
+  torch::Tensor& cos_cache,
+  torch::Tensor& sin_cache,
+  bool is_neox) {
+  const int num_tokens = query.size(0);
+  const int rot_dim = cos_cache.size(2);
+  const int num_heads = query.size(1);
+  const int num_kv_heads = key.size(1);
+  // Strides stay 64-bit: the kernel takes int64_t and multiplies them by the
+  // token index.
+  const int64_t query_stride = query.stride(0);
+  const int64_t key_stride = key.stride(0);
+
+  if (num_tokens == 0) {
+    // A grid of zero blocks is not a valid launch configuration.
+    return;
+  }
+
+  dim3 grid(num_tokens);
+  // The kernel loops over its work items, so any positive block size is
+  // correct; clamp to at least one thread so tiny shapes still launch.
+  dim3 block(std::max(1, std::min(num_heads * rot_dim / 2, 512)));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+    query.scalar_type(),
+    "rotary_embedding_tgi",
+    [&] {
+      if (is_neox) {
+        vllm::rotary_embedding_tgi_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
+          query.data_ptr<scalar_t>(),
+          key.data_ptr<scalar_t>(),
+          cos_cache.data_ptr<float>(),
+          sin_cache.data_ptr<float>(),
+          rot_dim,
+          query_stride,
+          key_stride,
+          num_heads,
+          num_kv_heads,
+          head_size);
+      } else {
+        vllm::rotary_embedding_tgi_kernel<scalar_t, false><<<grid, block, 0, stream>>>(
+          query.data_ptr<scalar_t>(),
+          key.data_ptr<scalar_t>(),
+          cos_cache.data_ptr<float>(),
+          sin_cache.data_ptr<float>(),
+          rot_dim,
+          query_stride,
+          key_stride,
+          num_heads,
+          num_kv_heads,
+          head_size);
+      }
+    });
+}
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -1542,6 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
  }
 }

+
 __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
  int n = blockIdx.x * THREADS_X + threadIdx.x;
@@ -1847,6 +1848,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
  return c;
 }

+
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
  vllm::gptq::shuffle_exllama_weight(

--- a/csrc/quantization/gptq/setup.py
+++ b/csrc/quantization/gptq/setup.py
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+import torch
+
+# Compiler flags.
+CXX_FLAGS = ["-g", "-O3", "-std=c++17"]
+NVCC_FLAGS = ["-O3", "-std=c++17","-DUSE_ROCM","-U__HIP_NO_HALF_CONVERSIONS__","-U__HIP_NO_HALF_OPERATORS__"]
+# Compiling with --gpu-max-threads-per-block=1024 degrades GPTQ multi-batch performance.
+# NVCC_FLAGS = ["-O3", "-std=c++17","-DUSE_ROCM","--gpu-max-threads-per-block=1024","-U__HIP_NO_HALF_CONVERSIONS__","-U__HIP_NO_HALF_OPERATORS__"]
+
+
+ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0  # mirror the C++11 ABI flag reported by the installed torch
+CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
+NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
+
+extra_compile_args={
+    "cxx": CXX_FLAGS,
+    "nvcc": NVCC_FLAGS,
+}
+
+# Builds the GPTQ kernels as a standalone extension module named "gptq_kernels".
+setup(
+    name="gptq_kernels",
+    ext_modules=[
+        CUDAExtension(
+            name="gptq_kernels",
+            # NOTE(review): these paths start at "csrc/", so the build presumably
+            # has to be run from the repository root - confirm.
+            sources=[
+                "csrc/quantization/gptq/torch_bindings.cpp",
+                "csrc/quantization/gptq/q_gemm.cu",
+            ],
+            extra_compile_args=extra_compile_args,
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
--- a/csrc/quantization/gptq/torch_bindings.cpp
+++ b/csrc/quantization/gptq/torch_bindings.cpp
+#include <torch/extension.h>
+
+torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
+                        torch::Tensor b_gptq_qzeros,
+                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
+                        bool use_exllama, int64_t bit);
+
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
+
+// Python bindings for the standalone GPTQ extension module. The third
+// argument of each m.def is the docstring shown to Python users, so it has to
+// describe the function actually bound under that name.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ.");
+    m.def("gptq_shuffle", &gptq_shuffle,
+          "Shuffle GPTQ quantized weights into the exllama kernel layout.");
+}