Merge branch 'v0.5.0-dtk24.04.1'

7462218e · zhuwenwen · 6ccd3f47 · 1cec5e62 · 7462218e · 7462218e
Commit 7462218e authored Sep 05, 2024 by zhuwenwen
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,11 +4,13 @@ project(vllm_extensions LANGUAGES CXX)

 option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")

+set(CMAKE_BUILD_TYPE "Release")
+
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")

 include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
-
+add_compile_options(-w)
 #
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
@@ -123,7 +125,7 @@ endif()
 override_gpu_arches(VLLM_GPU_ARCHES
  ${VLLM_GPU_LANG}
  "${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
-
+  
 #
 # Query torch for additional GPU compilation flags for the given
 # `VLLM_GPU_LANG`.
@@ -150,8 +152,13 @@ set(VLLM_EXT_SRC
  "csrc/cache_kernels.cu"
  "csrc/attention/attention_kernels.cu"
  "csrc/pos_encoding_kernels.cu"
+  "csrc/pos_encoding_tgi_kernels.cu"
  "csrc/activation_kernels.cu"
  "csrc/layernorm_kernels.cu"
+  "csrc/opt/transpose_kernels.cu"
+  "csrc/opt/activation_kernels_opt.cu"
+  "csrc/attention/attention_kernels_opt.cu"
+  "csrc/opt/layernorm_kernels_opt.cu"
  "csrc/quantization/squeezellm/quant_cuda_kernel.cu"
  "csrc/quantization/gptq/q_gemm.cu"
  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"

--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 vLLM是一个快速且易于使用的LLM推理和服务库，使用PageAttention高效管理kv内存，Continuous batching传入请求，支持很多Hugging Face模型，如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。

 ## 暂不支持的官方功能
- **量化推理**：目前支持fp16的推理和gptq推理，awq-int4和mralin的权重量化、kv-cache fp8推理方案暂不支持
+- **量化推理**：目前支持fp16的推理和gptq,awq-int4推理，mralin的权重量化、kv-cache fp8推理方案暂不支持
 - **模块支持**：目前不支持Sliding window attention、 moe kernel和lora模块


@@ -15,12 +15,17 @@ vLLM是一个快速且易于使用的LLM推理和服务库，使用PageAttention
 |    LlamaForCausalLM       |    LLaMA-3        |   Yes    | Yes  |
 |    LlamaForCausalLM       |    Codellama      |   Yes    | Yes  |
 |    QWenLMHeadModel        |    QWen           |   Yes    | Yes  |
+|    Qwen2ForCausalLM       |    QWen1.5        |   Yes    | Yes  |
+|    Qwen2ForCausalLM       |    CodeQwen1.5    |   Yes    | Yes  |
+|    Qwen2ForCausalLM       |    QWen2          |   Yes    | Yes  |
+|    ChatGLMModel           |    chatglm2       |   Yes    | Yes  |
+|    ChatGLMModel           |    chatglm3       |   Yes    | Yes  |
 |    BaiChuanForCausalLM    |    Baichuan-7B    |   Yes    | Yes  |
 |    BaiChuanForCausalLM    |    Baichuan2-7B   |   Yes    | Yes  |
-|    ChatGLMModel           |    chatglm2-6b    |   Yes    | Yes  |
-|    ChatGLMModel           |    chatglm3-6b    |   Yes    | Yes  |
 |    InternLMForCausalLM    |    InternLM       |   Yes    | Yes  |
 |    InternLM2ForCausalLM   |    InternLM2      |   Yes    | Yes  |
+|    LlamaForCausalLM       |    deepseek       |   Yes    | Yes  |
+|    DeepseekV2ForCausalLM  |    DeepSeek-V2    |   Yes    | Yes  |
 |    LlamaForCausalLM       |    Yi             |   Yes    | Yes  |
 |    MixtralForCausalLM     |    Mixtral-8x7B   |   Yes    | Yes  |

@@ -49,16 +54,19 @@ pip install setuptools wheel
 ```shell
 git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的分支进行切换
 ```
-
+安装依赖：
+```shell
+pip install -r requirements-rocm.txt
+```
 - 提供2种源码编译方式（进入vllm目录）：
 ```
 1. 编译whl包并安装
-python setup.py bdist_wheel 
+VLLM_INSTALL_PUNICA_KERNELS=1 python setup.py bdist_wheel 
 cd dist
 pip install vllm*

 2. 源码编译安装
-python3 setup.py install 
+VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install 
 ```

 #### 运行基础环境准备
@@ -68,13 +76,13 @@ python3 setup.py install
 - triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
 - xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
 - flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
-
+- lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim)

 #### 注意事项
 + 若使用 pip install 下载安装过慢，可添加源：-i https://pypi.tuna.tsinghua.edu.cn/simple/

 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.5.0.post1；
+- python -c "import vllm; print(vllm.\_\_version__)"，版本号与官方版本同步，查询该软件的版本号，例如0.5.0；

 ## Known Issue
 - 无

--- a/README_ORIGIN.md
+++ b/README_ORIGIN.md
@@ -23,10 +23,16 @@ If you have cool projects related to vLLM or LLM inference, we would love to see
 This will be a great chance for everyone in the community to get together and learn.
 Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)

+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
 ---

 *Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
 - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
 - [2024/01] Added ROCm 6.0 support to vLLM.
@@ -59,7 +65,7 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
- Support NVIDIA GPUs, AMD GPUs, and Intel CPUs
+- Support NVIDIA GPUs and AMD GPUs
 - (Experimental) Prefix caching support
 - (Experimental) Multi-lora support


--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -5,11 +5,13 @@ import random
 import time
 from typing import List, Optional, Tuple

+import numpy as np
 import torch
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

+from vllm.inputs import PromptStrictInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS


@@ -60,6 +62,7 @@ def sample_requests(


 def run_vllm(
+    warmup_requests: List[Tuple[str, int, int]],
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
@@ -119,6 +122,41 @@ def run_vllm(
                max_tokens=output_len,
            ))

+    # warmup
+    warmup_prompts = []
+    warmup_sampling_params = []
+    for prompt, _, output_len in warmup_requests:
+        warmup_prompts.append(prompt)
+        warmup_sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
+        
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
+    
+    # dummy_prompt_token_ids = np.random.randint(10000,
+    #                                            size=(args.num_prompts,
+    #                                                  args.input_len))
+    # dummy_inputs: List[PromptStrictInputs] = [{
+    #     "prompt_token_ids": batch
+    # } for batch in dummy_prompt_token_ids.tolist()]
+
+    # def run_to_completion():
+    #     llm.generate(dummy_inputs,
+    #                     sampling_params=sampling_params,
+    #                     use_tqdm=False)
+
+    # print("Warming up...")
+    # for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+    #     run_to_completion()
+    
    start = time.perf_counter()
    llm.generate(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
@@ -212,6 +250,10 @@ def main(args: argparse.Namespace):
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
+        warmup_prompt = "hi" * 10
+        warmup_requests = [(warmup_prompt, 10, 10)
+                    for _ in range(1)]
+        
        prompt = "hi" * (args.input_len - 1)
        requests = [(prompt, args.input_len, args.output_len)
                    for _ in range(args.num_prompts)]
@@ -221,7 +263,7 @@ def main(args: argparse.Namespace):

    if args.backend == "vllm":
        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
+            warmup_requests, requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.max_model_len,
            args.enforce_eager, args.kv_cache_dtype,
@@ -295,6 +337,10 @@ if __name__ == "__main__":
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,

--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -117,6 +117,10 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")

+    list(REMOVE_ITEM GPU_FLAGS
+      "-DUSE_ROCM=1"
+    )
+ 
    list(APPEND GPU_FLAGS
      "-DUSE_ROCM"
      # "-DENABLE_FP8"

--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -147,4 +147,4 @@ void gelu_fast(torch::Tensor& out,    // [..., d]
               torch::Tensor& input)  // [..., d]
 {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
-}
+}
\ No newline at end of file
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -992,4 +992,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
\ No newline at end of file
--- a/csrc/attention/attention_kernels_opt.cu
+++ b/csrc/attention/attention_kernels_opt.cu
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <algorithm>
+
+#include "attention/attention_dtypes.h"
+#include "attention/attention_utils.cuh"
+
+#ifdef USE_ROCM
+  #include <hip/hip_bf16.h>
+  #include "quantization/fp8/amd/quant_utils.cuh"
+typedef __hip_bfloat16 __nv_bfloat16;
+#else
+  #include "quantization/fp8/nvidia/quant_utils.cuh"
+#endif
+
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif
+
+
+#include "static_switch.h"
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
+
+namespace vllm {
+
+// Utility function for attention softmax.
+template <int NUM_WARPS>
+inline __device__ float block_sum(float* red_smem, float sum) {
+  // Decompose the thread index into warp / lane.
+  int warp = threadIdx.x / WARP_SIZE;
+  int lane = threadIdx.x % WARP_SIZE;
+
+  // Compute the sum per warp.
+#pragma unroll
+  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+    sum += VLLM_SHFL_XOR_SYNC(sum, mask);
+  }
+
+  // Warp leaders store the data to shared memory.
+  if (lane == 0) {
+    red_smem[warp] = sum;
+  }
+
+  // Make sure the data is in shared memory.
+  __syncthreads();
+
+  // The warps compute the final sums.
+  if (lane < NUM_WARPS) {
+    sum = red_smem[lane];
+  }
+
+  // Parallel reduction inside the warp.
+#pragma unroll
+  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+    sum += VLLM_SHFL_XOR_SYNC(sum, mask);
+  }
+
+  // Broadcast to other threads.
+  return VLLM_SHFL_SYNC(sum, 0);
+}
+
+// TODO(woosuk): Merge the last two dimensions of the grid.
+// Grid: (num_heads, num_seqs, max_num_partitions).
+template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
+          int NUM_THREADS, vllm::Fp8KVCacheDataType KV_DTYPE,
+          bool IS_BLOCK_SPARSE, int REUSE_KV_TIMES = 1, bool odd_nheads = false,
+          int PARTITION_SIZE = 0>  // Zero means no partitioning.
+__device__ void paged_attention_kernel_opt(
+    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,  // [num_seqs, num_heads,
+                                     // max_num_partitions]
+    scalar_t* __restrict__ out,  // [num_seqs, num_heads, max_num_partitions,
+                                 // head_size]
+    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size, block_size]
+    const int num_heads,                   // [num_heads]
+    const int num_kv_heads,               // [num_kv_heads]
+    const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ seq_lens,      // [num_seqs]
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride, const int kv_block_stride, const int kv_head_stride,
+    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
+  const int seq_idx = blockIdx.z;
+  const int partition_idx = blockIdx.y;
+  const int max_num_partitions = gridDim.y;
+  constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
+  const int seq_len = seq_lens[seq_idx];
+  if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) {
+    // No work to do. Terminate the thread block.
+    return;
+  }
+  if constexpr (sizeof(scalar_t)==2){
+
+  const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
+  const int num_blocks_per_partition =
+      USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
+  const int partition_size = USE_PARTITIONING ? PARTITION_SIZE : num_seq_blocks * BLOCK_SIZE;
+  // [start_block_idx, end_block_idx) is the range of blocks to process.
+  const int start_block_idx =
+      USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+  const int end_block_idx =
+      MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
+  const int num_blocks = end_block_idx - start_block_idx;
+
+  // [start_token_idx, end_token_idx) is the range of tokens to process.
+  const int start_token_idx = start_block_idx * BLOCK_SIZE;
+  const int end_token_idx =
+      MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
+  const int num_tokens = end_token_idx - start_token_idx;
+
+  constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  constexpr int NUM_THREAD_GROUPS =
+      NUM_THREADS / THREAD_GROUP_SIZE;  // Note: This assumes THREAD_GROUP_SIZE
+                                        // divides NUM_THREADS
+  assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
+  constexpr int NUM_TOKENS_PER_THREAD_GROUP =
+      DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int thread_idx = threadIdx.x;
+  // const int warp_idx_vec = thread_idx / WARP_SIZE;
+  // int warp_idx =0;
+  // asm volatile("v_readfirstlane_b32 %0,%1"
+  //               : "=s"(warp_idx)
+  //               : "v"(warp_idx_vec)
+  //               :);
+  // // const int warp_idx = thread_idx / WARP_SIZE;
+
+  // const int lane = thread_idx % WARP_SIZE;
+
+    //const int warp_idx = thread_idx / WARP_SIZE;
+  const int lane = thread_idx % WARP_SIZE;
+
+  int warp_id_vec = threadIdx.x / WARP_SIZE; //warp id in a block
+  int warp_idx =0;
+  asm volatile("v_readfirstlane_b32 %0,%1"
+                : "=s"(warp_idx)
+                : "v"(warp_id_vec)
+                :);
+
+  // const int head_idx = blockIdx.x;
+  // const int num_heads = gridDim.x;
+  const int num_queries_per_kv = num_heads / num_kv_heads;
+  // const float alibi_slope =
+  //     alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+
+  // A vector type to store a part of a key or a query.
+  // The vector size is configured in such a way that the threads in a thread
+  // group fetch or compute 16 bytes at a time. For example, if the size of a
+  // thread group is 4 and the data type is half, then the vector size is 16 /
+  // (4 * sizeof(half)) == 2.
+  constexpr int VEC_SIZE = MAX(32 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
+  using K_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
+  using Q_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
+  using Quant_vec = typename Vec<cache_t, VEC_SIZE>::Type;
+
+  constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
+  constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
+
+  const int thread_group_idx = thread_idx / THREAD_GROUP_SIZE;
+  const int thread_group_offset = thread_idx % THREAD_GROUP_SIZE;
+
+  // Load the query to registers.
+  // Each thread in a thread group has a different part of the query.
+  // For example, if the the thread group size is 4, then the first thread in
+  // the group has 0, 4, 8, ... th vectors of the query, and the second thread
+  // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
+  // q is split from a qkv tensor, it may not be contiguous.
+  // const scalar_t* q_ptr = q + seq_idx * q_stride;
+  const scalar_t* q_ptr_offset = q + seq_idx * q_stride;
+
+  __shared__ Q_vec q_vecs[REUSE_KV_TIMES * THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
+// #pragma unroll
+//   for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD;
+//        i += NUM_THREAD_GROUPS) {
+//     const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
+//     q_vecs[thread_group_offset][i] =
+//         *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
+//   }
+//   __syncthreads();  // TODO(naed90): possible speedup if this is replaced with a
+//                     // memory wall right before we use q_vecs
+
+  // Memory planning.
+  extern __shared__ char shared_mem[];
+  // NOTE(woosuk): We use FP32 for the softmax logits for better accuracy.
+  float* logits = reinterpret_cast<float*>(shared_mem);
+  // Workspace for reduction.
+  __shared__ float red_smem[REUSE_KV_TIMES][2 * NUM_WARPS];
+  // float (*red_smem)[2 * NUM_WARPS] = reinterpret_cast<float(*)[2 * NUM_WARPS]>(&shared_mem[10*1024]);
+
+  // __shared__ char shared_mem[12 * 1024];
+  // float* logits = reinterpret_cast<float*>(shared_mem);
+  // __shared__ float red_smem[REUSE_KV_TIMES][2 * NUM_WARPS];
+
+  // x == THREAD_GROUP_SIZE * VEC_SIZE
+  // Each thread group fetches x elements from the key at a time.
+  constexpr int x = 16 / sizeof(cache_t);
+  float qk_max[REUSE_KV_TIMES];
+
+  for(int reuse_kv_idx=0; reuse_kv_idx<REUSE_KV_TIMES; reuse_kv_idx++) {
+      qk_max[reuse_kv_idx] = -FLT_MAX;
+  }
+   
+  const int num_blocks_per_kv = ((num_queries_per_kv + REUSE_KV_TIMES -1) / REUSE_KV_TIMES);
+  const int head_idx_soffset = (blockIdx.x / num_blocks_per_kv) * num_queries_per_kv + (blockIdx.x % num_blocks_per_kv) * REUSE_KV_TIMES;
+  const int kv_head_idx = head_idx_soffset / num_queries_per_kv;
+  const int q_boundary = (kv_head_idx + 1)* num_queries_per_kv;
+
+  #pragma unroll
+  for(int reuse_kv_idx=0; reuse_kv_idx<REUSE_KV_TIMES; reuse_kv_idx++) {
+    const int head_idx = head_idx_soffset + reuse_kv_idx;//blockIdx.x * REUSE_KV_TIMES + reuse_kv_idx;
+    const scalar_t* q_ptr = q_ptr_offset + head_idx * HEAD_SIZE;
+    #pragma unroll
+    for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
+      const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
+      q_vecs[reuse_kv_idx*THREAD_GROUP_SIZE + thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
+    }
+  }
+  __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs
+
+  // Iterate over the key blocks.
+  // Each warp fetches a block of keys for each iteration.
+  // Each thread group in a warp fetches a key from the block, and computes
+  // dot product with the query.
+  const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
+  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
+    // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+    // int64 because int32 can lead to overflow when this variable is multiplied
+    // by large numbers (e.g., kv_block_stride).
+    // For blocksparse attention: skip computation on blocks that are not
+    // attended
+    for(int reuse_kv_idx=0; reuse_kv_idx<REUSE_KV_TIMES; reuse_kv_idx++) {
+    const int head_idx = head_idx_soffset + reuse_kv_idx;//blockIdx.x * REUSE_KV_TIMES + reuse_kv_idx;
+    if(!odd_nheads || head_idx < q_boundary) {
+        // blocksparse specific vars
+    int bs_block_offset;
+    int q_bs_block_id;
+    if constexpr (IS_BLOCK_SPARSE) { 
+      // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len,
+      // blocksparse_block_size);
+      q_bs_block_id = (seq_len - 1) / blocksparse_block_size;
+      if (blocksparse_head_sliding_step >= 0)
+        // sliding on q heads
+        bs_block_offset =
+            (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1;
+      else
+        // sliding on kv heads
+        bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) *
+                              (-blocksparse_head_sliding_step) +
+                          1;
+    }
+    if constexpr (IS_BLOCK_SPARSE) {
+      const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+      const bool is_remote =
+          ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0);
+      const bool is_local =
+          (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks);
+      if (!is_remote && !is_local) {
+        for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+          const int physical_block_offset =
+              (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+          const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+
+          if (thread_group_offset == 0) {
+            // NOTE(linxihui): assign very large number to skipped tokens to
+            // avoid contribution to the sumexp softmax normalizer. This will
+            // not be used at computing sum(softmax*v) as the blocks will be
+            // skipped.
+            logits[token_idx - start_token_idx] = -FLT_MAX;
+          }
+        }
+        continue;
+      }
+    }
+    const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
+
+    // Load a key to registers.
+    // Each thread in a thread group has a different part of the key.
+    // For example, if the the thread group size is 4, then the first thread in
+    // the group has 0, 4, 8, ... th vectors of the key, and the second thread
+    // has 1, 5, 9, ... th vectors of the key, and so on.
+    for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+      const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+      const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+      K_vec k_vecs[NUM_VECS_PER_THREAD];
+      if(reuse_kv_idx == 0) {
+        #pragma unroll
+        for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
+          const cache_t* k_ptr =
+              k_cache + physical_block_number * kv_block_stride +
+              kv_head_idx * kv_head_stride + physical_block_offset * x;
+          const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
+          const int offset1 = (vec_idx * VEC_SIZE) / x;
+          const int offset2 = (vec_idx * VEC_SIZE) % x;
+
+          if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
+            k_vecs[j] = *reinterpret_cast<const K_vec*>(
+                k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+          } else {
+            // Vector conversion from Quant_vec to K_vec.
+            Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(
+                k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+            k_vecs[j] = fp8::scaled_convert<K_vec, Quant_vec, KV_DTYPE>(
+                k_vec_quant, kv_scale);
+          }
+        }
+      }
+      __builtin_amdgcn_sched_barrier(0);
+      // Compute dot product.
+      // This includes a reduction across the threads in the same thread group.
+      float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[reuse_kv_idx*THREAD_GROUP_SIZE + thread_group_offset], k_vecs);
+      // Add the ALiBi bias if slopes are given.
+      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
+      __builtin_amdgcn_sched_barrier(0);
+      if (thread_group_offset == 0) {
+        // Store the partial reductions to shared memory.
+        // NOTE(woosuk): It is required to zero out the masked logits.
+        const bool mask = token_idx >= seq_len;
+        logits[(reuse_kv_idx * partition_size) + (token_idx - start_token_idx)] = mask ? 0.f : qk;
+        // Update the max value.
+        qk_max[reuse_kv_idx] = mask ? qk_max[reuse_kv_idx] : fmaxf(qk_max[reuse_kv_idx], qk);
+      }
+    }
+  }
+  }
+  }
+  // Get the sum of the exp values.
+  float exp_sum[REUSE_KV_TIMES] = {0.f};
+
+  // Perform reduction across the threads in the same warp to get the
+  // max qk value for each "warp" (not across the thread block yet).
+  // The 0-th thread of each thread group already has its max qk value.
+  for(int reuse_kv_idx=0; reuse_kv_idx<REUSE_KV_TIMES; reuse_kv_idx++) {
+    const int head_idx = head_idx_soffset + reuse_kv_idx;
+    if(!odd_nheads || head_idx < q_boundary) {
+      #pragma unroll
+      for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+        qk_max[reuse_kv_idx] = fmaxf(qk_max[reuse_kv_idx], VLLM_SHFL_XOR_SYNC(qk_max[reuse_kv_idx], mask));
+      }
+      if (lane == 0) {
+        red_smem[reuse_kv_idx][warp_idx] = qk_max[reuse_kv_idx];
+      }
+      __syncthreads();
+
+      // TODO(woosuk): Refactor this part.
+      // Get the max qk value for the sequence.
+      qk_max[reuse_kv_idx] = lane < NUM_WARPS ? red_smem[reuse_kv_idx][lane] : -FLT_MAX;
+    #pragma unroll
+      for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+        qk_max[reuse_kv_idx] = fmaxf(qk_max[reuse_kv_idx], VLLM_SHFL_XOR_SYNC(qk_max[reuse_kv_idx], mask));
+      }
+      // Broadcast the max qk value to all threads.
+      qk_max[reuse_kv_idx] = VLLM_SHFL_SYNC(qk_max[reuse_kv_idx], 0);
+
+      for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+        float val = __expf(logits[(reuse_kv_idx * partition_size) + i] - qk_max[reuse_kv_idx]);
+        logits[(reuse_kv_idx * partition_size) + i] = val;
+        exp_sum[reuse_kv_idx] += val;
+      }
+      exp_sum[reuse_kv_idx] = block_sum<NUM_WARPS>(&red_smem[reuse_kv_idx][NUM_WARPS], exp_sum[reuse_kv_idx]);
+
+      // Compute softmax.
+      const float inv_sum = __fdividef(1.f, exp_sum[reuse_kv_idx] + 1e-6f);
+      for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+        logits[(reuse_kv_idx * partition_size) + i] *= inv_sum;
+      }
+      __syncthreads();
+
+      // If partitioning is enabled, store the max logit and exp_sum.
+      if (USE_PARTITIONING && thread_idx == 0) {
+        float* max_logits_ptr = max_logits +
+                                seq_idx * num_heads * max_num_partitions +
+                                head_idx * max_num_partitions + partition_idx;
+        *max_logits_ptr = qk_max[reuse_kv_idx];
+        float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions +
+                              head_idx * max_num_partitions + partition_idx;
+        *exp_sums_ptr = exp_sum[reuse_kv_idx];
+      }
+    }
+  }
+  // Each thread will fetch 16 bytes from the value cache at a time.
+  constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
+  using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
+  using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
+  using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type;
+  using Float_L_vec = typename FloatVec<L_vec>::Type;
+
+  constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
+  constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
+  constexpr int NUM_ROWS_PER_THREAD =
+      DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
+
+  // NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
+  float accs[REUSE_KV_TIMES][NUM_ROWS_PER_THREAD];
+
+  #pragma unroll
+  for(int reuse_kv_idx=0; reuse_kv_idx<REUSE_KV_TIMES; reuse_kv_idx++) {
+    #pragma unroll
+    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        accs[reuse_kv_idx][i] = 0.f;
+    }
+  }
+  scalar_t zero_value;
+  zero(zero_value);
+  for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
+       block_idx += NUM_WARPS) {
+    const int64_t physical_block_number =
+        static_cast<int64_t>(block_table[block_idx]);
+    const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
+    const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+    L_vec logits_vec;
+#pragma unroll
+    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    V_vec v_vec;
+    for(int reuse_kv_idx=0; reuse_kv_idx<REUSE_KV_TIMES; reuse_kv_idx++) {
+      // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+      // int64 because int32 can lead to overflow when this variable is multiplied
+      // by large numbers (e.g., kv_block_stride).
+      // For blocksparse attention: skip computation on blocks that are not
+      // attended
+      // blocksparse specific vars
+      const int head_idx = head_idx_soffset + reuse_kv_idx;
+      int bs_block_offset;
+      int q_bs_block_id;
+      if constexpr (IS_BLOCK_SPARSE) {
+        // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len,
+        // blocksparse_block_size);
+        q_bs_block_id = (seq_len - 1) / blocksparse_block_size;
+        if (blocksparse_head_sliding_step >= 0)
+          // sliding on q heads
+          bs_block_offset =
+              (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1;
+        else
+          // sliding on kv heads
+          bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) *
+                                (-blocksparse_head_sliding_step) +
+                            1;
+      }
+      if constexpr (IS_BLOCK_SPARSE) {
+        int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+        if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) &&
+            !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) {
+          continue;
+        }
+      }
+      if(!odd_nheads || head_idx < q_boundary) {
+
+
+      const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
+                                   + kv_head_idx * kv_head_stride;
+
+     from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + (reuse_kv_idx * partition_size) +  token_idx - start_token_idx));
+      // scalar_t* logits_vec_ptr = reinterpret_cast<scalar_t*>(&logits_vec);
+      // for(int i=0;i<8;++i){
+      //   from_float(*(logits_vec_ptr+i), 1000);
+      // }
+
+      if(reuse_kv_idx==0) {
+      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+      if (row_idx < HEAD_SIZE) {
+        const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
+
+        if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
+          v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
+        } else {
+          V_quant_vec v_quant_vec =
+              *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
+          // Vector conversion from V_quant_vec to V_vec.
+          v_vec = fp8::scaled_convert<V_vec, V_quant_vec, KV_DTYPE>(v_quant_vec,
+                                                                    kv_scale);
+        }
+        if (block_idx == num_seq_blocks - 1) {
+          // NOTE(woosuk): When v_vec contains the tokens that are out of the
+          // context, we should explicitly zero out the values since they may
+          // contain NaNs. See
+          // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
+          scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
+#pragma unroll
+          for (int j = 0; j < V_VEC_SIZE; j++) {
+            v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value;
+          }
+        }
+        // if(threadIdx.x==0){
+        //   scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
+        //   scalar_t* logits_vec_ptr = reinterpret_cast<scalar_t*>(&logits_vec);
+        //   for(int i=0;i<8;++i){
+        //     printf("v_vec[%d] = %f\n",i, half_to_float(v_vec_ptr[i]));
+        //     // from_float(*(v_vec_ptr + i), 1000);
+        //   }
+        //   for(int i=0;i<8;++i){
+        //     printf("logits_vec[%d] = %f\n",i,half_to_float(logits_vec_ptr[i]));
+        //     // from_float(*(logits_vec_ptr + i), 1000);
+        //   }
+        // }
+        // accs[reuse_kv_idx][i] += dot(logits_vec, v_vec);
+      }
+      } 
+        accs[reuse_kv_idx][i] += dot(logits_vec, v_vec);
+      }
+      }
+    }
+  }
+
+  // Perform reduction within each warp.
+  #pragma unroll
+  for(int reuse_kv_idx=0; reuse_kv_idx<REUSE_KV_TIMES; reuse_kv_idx++) {
+    int head_idx = head_idx_soffset + reuse_kv_idx;
+
+    if(!odd_nheads || head_idx < q_boundary) {
+#pragma unroll
+  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    float acc = accs[reuse_kv_idx][i];
+#pragma unroll
+    for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+      acc += VLLM_SHFL_XOR_SYNC(acc, mask);
+    }
+    accs[reuse_kv_idx][i] = acc;
+  }
+
+  // NOTE(woosuk): A barrier is required because the shared memory space for
+  // logits is reused for the output.
+  __syncthreads();
+
+  // Perform reduction across warps.
+  float* out_smem = reinterpret_cast<float*>(shared_mem);
+#pragma unroll
+  for (int i = NUM_WARPS; i > 1; i /= 2) {
+    int mid = i / 2;
+    // Upper warps write to shared memory.
+    if (warp_idx >= mid && warp_idx < i) {
+       float* dst = &out_smem[(reuse_kv_idx * (NUM_WARPS / 2) * HEAD_SIZE) + (warp_idx - mid) * HEAD_SIZE];
+#pragma unroll
+      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+          dst[row_idx] = accs[reuse_kv_idx][i];
+        }
+      }
+    }
+    __syncthreads();
+
+    // Lower warps update the output.
+    if (warp_idx < mid) {
+      const float* src = &out_smem[(reuse_kv_idx * (NUM_WARPS / 2) * HEAD_SIZE) + warp_idx * HEAD_SIZE];
+#pragma unroll
+      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+        if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+          accs[reuse_kv_idx][i] += src[row_idx];
+        }
+      }
+    }
+    __syncthreads();
+  }
+
+  // Write the final output.
+  if (warp_idx == 0) {
+    scalar_t* out_ptr =
+        out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+        head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE;
+#pragma unroll
+        for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+          const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+          if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+            from_float(*(out_ptr + row_idx), accs[reuse_kv_idx][i]);
+          }
+        }
+      }
+    }
+  }
+  }
+}
+
+
+// Grid: (num_heads, num_seqs, 1).
+template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
+          int NUM_THREADS, vllm::Fp8KVCacheDataType KV_DTYPE,
+          int REUSE_KV_TIMES = 1,
+          bool IS_BLOCK_SPARSE,
+          bool odd_nheads = false>
+__global__ __launch_bounds__(256,1) void paged_attention_v1_kernel_opt(
+    scalar_t* __restrict__ out,           // [num_seqs, num_heads, head_size]
+    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size, block_size]
+    const int num_heads,               // [num_heads]    
+    const int num_kv_heads,               // [num_heads]
+    const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ seq_lens,      // [num_seqs]
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride, const int kv_block_stride, const int kv_head_stride,
+    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
+      paged_attention_kernel_opt<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
+                          KV_DTYPE, IS_BLOCK_SPARSE, REUSE_KV_TIMES, odd_nheads>(
+        /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
+        v_cache, num_heads, num_kv_heads, scale, block_tables, seq_lens,
+        max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
+        kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
+        blocksparse_vert_stride, blocksparse_block_size,
+        blocksparse_head_sliding_step);
+}
+
+// Grid: (num_heads, num_seqs, max_num_partitions).
+template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
+          int NUM_THREADS, vllm::Fp8KVCacheDataType KV_DTYPE,
+          bool IS_BLOCK_SPARSE,
+          int REUSE_KV_TIMES,
+          int PARTITION_SIZE,
+          bool odd_nheads = false>
+__global__ __launch_bounds__(256,1) void paged_attention_v2_kernel_opt(
+    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,       // [num_seqs, num_heads,
+                                          // max_num_partitions]
+    scalar_t* __restrict__ tmp_out,       // [num_seqs, num_heads,
+                                          // max_num_partitions, head_size]
+    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size, block_size]
+    const int num_heads,               // [num_heads]                                      
+    const int num_kv_heads,               // [num_kv_heads]
+    const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ seq_lens,      // [num_seqs]
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride, const int kv_block_stride, const int kv_head_stride,
+    const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
+        paged_attention_kernel_opt<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
+                         KV_DTYPE, IS_BLOCK_SPARSE, REUSE_KV_TIMES, odd_nheads, PARTITION_SIZE>(
+          exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_heads, num_kv_heads, scale,
+          block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
+          kv_block_stride, kv_head_stride, kv_scale, tp_rank,
+          blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
+          blocksparse_head_sliding_step);
+}
+
+// Grid: (num_heads, num_seqs).
+template <typename scalar_t, int HEAD_SIZE, int NUM_THREADS,
+          int PARTITION_SIZE>
+__global__ __launch_bounds__(256,1) void paged_attention_v2_reduce_kernel_opt(
+    scalar_t* __restrict__ out,            // [num_seqs, num_heads, head_size]
+    const float* __restrict__ exp_sums,    // [num_seqs, num_heads,
+                                           // max_num_partitions]
+    const float* __restrict__ max_logits,  // [num_seqs, num_heads,
+                                           // max_num_partitions]
+    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
+                                           // max_num_partitions, head_size]
+    const int* __restrict__ seq_lens,      // [num_seqs]
+    const int max_num_partitions) {
+  const int num_heads = gridDim.x;
+  const int head_idx = blockIdx.x;
+  const int seq_idx = blockIdx.y;
+  const int seq_len = seq_lens[seq_idx];
+  const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
+  if (num_partitions == 1) {
+    // No need to reduce. Only copy tmp_out to out.
+    scalar_t* out_ptr =
+        out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+    const scalar_t* tmp_out_ptr =
+        tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+        head_idx * max_num_partitions * HEAD_SIZE;
+    for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
+      out_ptr[i] = tmp_out_ptr[i];
+    }
+    // Terminate the thread block.
+    return;
+  }
+
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int warp_idx = threadIdx.x / WARP_SIZE;
+  const int lane = threadIdx.x % WARP_SIZE;
+
+  // Size: 2 * num_partitions.
+  extern __shared__ char shared_mem[];
+  // Workspace for reduction.
+  __shared__ float red_smem[2 * NUM_WARPS];
+
+  // Load max logits to shared memory.
+  float* shared_max_logits = reinterpret_cast<float*>(shared_mem);
+  const float* max_logits_ptr = max_logits +
+                                seq_idx * num_heads * max_num_partitions +
+                                head_idx * max_num_partitions;
+  float max_logit = -FLT_MAX;
+  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+    const float l = max_logits_ptr[i];
+    shared_max_logits[i] = l;
+    max_logit = fmaxf(max_logit, l);
+  }
+  __syncthreads();
+
+  // Get the global max logit.
+  // Reduce within the warp.
+#pragma unroll
+  for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+    max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask));
+  }
+  if (lane == 0) {
+    red_smem[warp_idx] = max_logit;
+  }
+  __syncthreads();
+  // Reduce across warps.
+  max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
+#pragma unroll
+  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+    max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask));
+  }
+  // Broadcast the max value to all threads.
+  max_logit = VLLM_SHFL_SYNC(max_logit, 0);
+
+  // Load rescaled exp sums to shared memory.
+  float* shared_exp_sums =
+      reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
+  const float* exp_sums_ptr = exp_sums +
+                              seq_idx * num_heads * max_num_partitions +
+                              head_idx * max_num_partitions;
+  float global_exp_sum = 0.0f;
+  for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
+    float l = shared_max_logits[i];
+    float rescaled_exp_sum = exp_sums_ptr[i] * expf(l - max_logit);
+    global_exp_sum += rescaled_exp_sum;
+    shared_exp_sums[i] = rescaled_exp_sum;
+  }
+  __syncthreads();
+  global_exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], global_exp_sum);
+  const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
+
+  // Aggregate tmp_out to out.
+  const scalar_t* tmp_out_ptr =
+      tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+      head_idx * max_num_partitions * HEAD_SIZE;
+  scalar_t* out_ptr =
+      out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+#pragma unroll
+  for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
+    float acc = 0.0f;
+    for (int j = 0; j < num_partitions; ++j) {
+      acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] *
+             inv_global_exp_sum;
+    }
+    from_float(out_ptr[i], acc);
+  }
+}
+
+}  // namespace vllm
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                \
+  VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                     \
+      ((void*)vllm::paged_attention_v1_kernel_opt<T, CACHE_T, HEAD_SIZE,        \
+                                              BLOCK_SIZE, NUM_THREADS,      \
+                                              KV_DTYPE, REUSE_KV_TIMES, IS_BLOCK_SPARSE, odd_nheads>),  \
+      shared_mem_size);                                                     \
+ hipLaunchKernelGGL(( vllm::paged_attention_v1_kernel_opt<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,        \
+                                  NUM_THREADS, KV_DTYPE, REUSE_KV_TIMES, IS_BLOCK_SPARSE, odd_nheads>)   \
+      , dim3(grid), dim3(block), shared_mem_size, stream,                            \
+          out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_heads, num_kv_heads, \
+          scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,    \
+          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,      \
+          kv_scale, tp_rank, blocksparse_local_blocks,                      \
+          blocksparse_vert_stride, blocksparse_block_size,                  \
+          blocksparse_head_sliding_step);
+
+// #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                \
+// vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,        \
+//                                   NUM_THREADS, KV_DTYPE, REUSE_KV_TIMES, IS_BLOCK_SPARSE, odd_nheads>   \
+//       <<<dim3(grid), dim3(block)>>>(                           \
+//           out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_heads, num_kv_heads, \
+//           scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,    \
+//           alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,      \
+//           kv_scale, tp_rank, blocksparse_local_blocks,                      \
+//           blocksparse_vert_stride, blocksparse_block_size,                  \
+//           blocksparse_head_sliding_step);
+
+// TODO(woosuk): Tune NUM_THREADS.
+template <typename T, typename CACHE_T, int BLOCK_SIZE,
+          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE>
+void paged_attention_v1_launcher(
+    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
+    const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
+  int num_seqs = query.size(0);
+  int num_heads = query.size(1);
+  int head_size = query.size(2);
+  int max_num_blocks_per_seq = block_tables.size(1);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);
+  int num_threads = 128;
+  if(num_heads!=num_kv_heads){
+    num_threads =256;
+  }
+  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  assert(head_size % thread_group_size == 0);
+
+  // NOTE: alibi_slopes is optional.
+  const float* alibi_slopes_ptr =
+      alibi_slopes
+          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+          : nullptr;
+
+  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+  int* block_tables_ptr = block_tables.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
+
+  int padded_max_seq_len = DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+  REUSEKV_SWITCH_V1(num_heads * num_seqs , [&] {
+    BOOL_SWITCH((num_heads/num_kv_heads % REUSE_KV_TIMES != 0), odd_nheads, [&] {
+      HEADSIZE_SWITCH(head_size, [&] {
+        NUM_THREADS_SWITCH(num_threads, [&] {
+          OPT_SWITCH(num_heads == num_kv_heads, [&] {
+          constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+          int logits_size =  REUSE_KV_TIMES*padded_max_seq_len * sizeof(float);
+          int outputs_size =  REUSE_KV_TIMES*(NUM_WARPS / 2) * head_size * sizeof(float);
+          // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
+          // Keep that in sync with the logic here!
+          int shared_mem_size = ::max(logits_size, outputs_size);
+          if(num_heads == num_kv_heads) shared_mem_size = ::max(12 * 1024, shared_mem_size);
+          // int shared_mem_size = ::max(31*1024, ::max(logits_size, outputs_size));
+          // std::cout<<"shared_mem_size = "<<shared_mem_size<<std::endl;
+          dim3 grid((num_heads/num_kv_heads + REUSE_KV_TIMES - 1) / REUSE_KV_TIMES*num_kv_heads, 1, num_seqs);
+          dim3 block(NUM_THREADS);
+          const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(query));
+          const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+          LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE);
+          });
+        });
+      });
+    });
+  }); 
+}
+
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE)  \
+  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,              \
+                              IS_BLOCK_SPARSE>(                              \
+      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
+      seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank,                \
+      blocksparse_local_blocks, blocksparse_vert_stride,                     \
+      blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+  switch (is_block_sparse) {                                               \
+    case true:                                                             \
+      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);     \
+      break;                                                               \
+    case false:                                                            \
+      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);    \
+      break;                                                               \
+  }
+
+// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
+// 1, 2, 4, 64, 128, 256.
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
+  switch (block_size) {                                           \
+    case 8:                                                       \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
+      break;                                                      \
+    case 16:                                                      \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
+      break;                                                      \
+    case 32:                                                      \
+      CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
+      break;                                                      \
+    default:                                                      \
+      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+      break;                                                      \
+  }
+
+
+void paged_attention_v1_opt(
+    torch::Tensor& out,    // [num_seqs, num_heads, head_size]
+    torch::Tensor& query,  // [num_seqs, num_heads, head_size]
+    torch::Tensor&
+        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
+    torch::Tensor&
+        value_cache,       // [num_blocks, num_heads, head_size, block_size]
+    int64_t num_kv_heads,  // [num_heads]
+    double scale,
+    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    torch::Tensor& seq_lens,      // [num_seqs]
+    int64_t block_size, int64_t max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
+  const bool is_block_sparse = (blocksparse_vert_stride > 1);
+
+  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+                             CALL_V1_LAUNCHER_BLOCK_SIZE)
+}
+
+#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE)                                   \
+ hipLaunchKernelGGL(( vllm::paged_attention_v2_kernel_opt<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,           \
+                                  NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE,      \
+                                  REUSE_KV_TIMES, PARTITION_SIZE, odd_nheads>)                              \
+      , dim3(grid), dim3(block), shared_mem_size, stream,                               \
+          exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
+          value_cache_ptr, num_heads, num_kv_heads, scale, block_tables_ptr,              \
+          seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride,    \
+          kv_block_stride, kv_head_stride, kv_scale, tp_rank,                  \
+          blocksparse_local_blocks, blocksparse_vert_stride,                   \
+          blocksparse_block_size, blocksparse_head_sliding_step);              \
+ hipLaunchKernelGGL(( vllm::paged_attention_v2_reduce_kernel_opt<T, HEAD_SIZE, NUM_THREADS,            \
+                                         PARTITION_SIZE>)                       \
+      , dim3(reduce_grid), dim3(block), reduce_shared_mem_size, stream,                 \
+          out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr,    \
+          max_num_partitions);
+
+template <typename T, typename CACHE_T, int BLOCK_SIZE,
+          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
+          int NUM_THREADS = 256, int PARTITION_SIZE = 512>
+void paged_attention_v2_launcher(
+    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
+    const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
+  int num_seqs = query.size(0);
+  int num_heads = query.size(1);
+  int head_size = query.size(2);
+  int max_num_blocks_per_seq = block_tables.size(1);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);
+
+  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  assert(head_size % thread_group_size == 0);
+
+  // NOTE: alibi_slopes is optional.
+  const float* alibi_slopes_ptr =
+      alibi_slopes
+          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+          : nullptr;
+
+  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
+  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
+  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
+  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+  int* block_tables_ptr = block_tables.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
+
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
+  REUSEKV_SWITCH(num_heads * max_num_partitions * num_seqs , [&] {
+    BOOL_SWITCH((num_heads/num_kv_heads % REUSE_KV_TIMES != 0), odd_nheads, [&] {
+      HEADSIZE_SWITCH(head_size, [&] {
+        OPT_SWITCH(num_heads == num_kv_heads, [&] {
+        int logits_size = REUSE_KV_TIMES*PARTITION_SIZE * sizeof(float);
+        int outputs_size = REUSE_KV_TIMES*(NUM_WARPS / 2) * head_size * sizeof(float);
+
+        // For paged attention v2 kernel.
+        // dim3 grid(num_heads, max_num_partitions, num_seqs);
+
+        dim3 grid;
+        grid.x = (num_heads/num_kv_heads + REUSE_KV_TIMES -1)/REUSE_KV_TIMES * num_kv_heads;
+        grid.y = max_num_partitions;
+        grid.z = num_seqs;
+        // int shared_mem_size = ::max(1024*32, ::max(logits_size, outputs_size));
+        int shared_mem_size = ::max(logits_size, outputs_size);
+        // For paged attention v2 reduce kernel.
+        dim3 reduce_grid(num_heads, num_seqs);
+        int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float);
+        dim3 block(NUM_THREADS);
+        const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(query));
+        const hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
+        LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE);
+        });
+      });
+    });
+  });
+}
+
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE)   \
+  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,               \
+                              IS_BLOCK_SPARSE>(                               \
+      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,      \
+      num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
+      kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride,   \
+      blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+  switch (is_block_sparse) {                                               \
+    case true:                                                             \
+      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);     \
+      break;                                                               \
+    case false:                                                            \
+      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);    \
+      break;                                                               \
+  }
+
+// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
+// 1, 2, 4, 64, 128, 256.
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE)         \
+  switch (block_size) {                                           \
+    case 8:                                                       \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE);         \
+      break;                                                      \
+    case 16:                                                      \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE);        \
+      break;                                                      \
+    case 32:                                                      \
+      CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE);        \
+      break;                                                      \
+    default:                                                      \
+      TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+      break;                                                      \
+  }
+
+
+void paged_attention_v2_opt(
+    torch::Tensor& out,         // [num_seqs, num_heads, head_size]
+    torch::Tensor& exp_sums,    // [num_seqs, num_heads, max_num_partitions]
+    torch::Tensor& max_logits,  // [num_seqs, num_heads, max_num_partitions]
+    torch::Tensor&
+        tmp_out,  // [num_seqs, num_heads, max_num_partitions, head_size]
+    torch::Tensor& query,  // [num_seqs, num_heads, head_size]
+    torch::Tensor&
+        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
+    torch::Tensor&
+        value_cache,       // [num_blocks, num_heads, head_size, block_size]
+    int64_t num_kv_heads,  // [num_heads]
+    double scale,
+    torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    torch::Tensor& seq_lens,      // [num_seqs]
+    int64_t block_size, int64_t max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
+  const bool is_block_sparse = (blocksparse_vert_stride > 1);
+  DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+                             CALL_V2_LAUNCHER_BLOCK_SIZE)
+}
+
+#undef WARP_SIZE
+#undef MAX
+#undef MIN
+#undef DIVIDE_ROUND_UP
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@@ -26,17 +26,104 @@

 namespace vllm {

-// Q*K^T operation.
-template <int THREAD_GROUP_SIZE, typename Vec, int N>
+inline __device__ void v_dot2_f32_f16(float& a, const uint32_t &  b,const uint32_t &  c) {
+  asm volatile("v_dot2_f32_f16 %0, %1, %2, %0;": "=v"(a): "v"(b), "v"(c), "0"(a));
+}
+
+inline __device__ void v_pk_fma_f16(uint32_t& a, const uint32_t &  b,const uint32_t &  c){
+   asm volatile("v_pk_fma_f16 %0, %1, %2, %3;": "=v"(a) : "v"(b), "v"(c), "v"(a));
+}
+
+inline __device__ void ds_read_b128(uint4& a, uint32_t offset){
+    asm volatile("ds_read_b128 %0 %1;": "=v" (a): "v" (offset));
+}
+
+inline __device__ void ds_read_b128_sync(uint4& a, uint32_t offset){
+    asm volatile("ds_read_b128 %0 %1\ns_waitcnt lgkmcnt(1);": "=v" (a): "v" (offset));
+}
+
+inline __device__ void lgkmcnt0(){
+    asm volatile("s_waitcnt lgkmcnt(0);");
+}
+
+__device__ inline size_t  __nv_cvta_generic_to_shared_impl(const void *__ptr) {
+        return (size_t)(void __attribute__((address_space(3))) *)__ptr;
+}
+
+inline __device__ void v_dot2_f32_f16(float& a,const uint2 &  b,const uint2 &  c) {
+  v_dot2_f32_f16(a, b.x, c.x);
+  v_dot2_f32_f16(a, b.y, c.y);
+}
+
+inline __device__ void v_dot2_f32_f16(float& a,const uint4 &  b,const uint4 &  c) {
+  v_dot2_f32_f16(a, b.x, c.x);
+  v_dot2_f32_f16(a, b.y, c.y);
+  v_dot2_f32_f16(a, b.z, c.z);
+  v_dot2_f32_f16(a, b.w, c.w);
+}
+
+inline __device__ float add_half2(uint32_t a){
+ union {
+    uint32_t u32;
+    half u16[2];
+  } tmp;
+  tmp.u32=a;
+  return static_cast<float>(tmp.u16[0]+tmp.u16[1]);
+}
+
+inline __device__ void v_pk_fma_f16x8(float& a,const uint4 &  b,const uint4 &  c) {
+  uint32_t tmp = mul<uint32_t, uint32_t, uint32_t>(b.x,c.x);
+  v_pk_fma_f16(tmp,b.y,c.y);
+  v_pk_fma_f16(tmp,b.z,c.z);
+  v_pk_fma_f16(tmp,b.w,c.w);
+  a+=add_half2(tmp);
+}
+
+// Q*K^T operation. fp16
+template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<std::is_same<scalar_t, uint16_t>::value, int> = 0>
+// template <int THREAD_GROUP_SIZE, typename Vec, int N>
 inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
+  
+  float qk =0;
+  // uint32_t offset = __nv_cvta_generic_to_shared_impl(q);
+  // const uint4 *k_ptr= reinterpret_cast<const uint4 *>(k);
+  // // Compute the parallel products for Q*K^T (treat vector lanes separately).
+
+  // constexpr int loop=N*sizeof(Vec)/16/2;
+  // uint4 qt[2];
+  // #pragma unroll
+  // for (int ii = 0; ii < loop; ++ii) {
+  //   ds_read_b128(qt[0],offset+16*ii*2);
+  //   ds_read_b128_sync(qt[1],offset+16*(ii*2+1));
+  //   v_dot2_f32_f16(qk,qt[0],k_ptr[ii*2]);
+  //   // v_pk_fma_f16x8(qk,qt[0],k_ptr[ii*2]);
+  //   lgkmcnt0();
+  //   v_dot2_f32_f16(qk,qt[1],k_ptr[ii*2+1]);
+  //   // v_pk_fma_f16x8(qk,qt[1],k_ptr[ii*2+1]);
+  // }
+  #pragma unroll
+  for (int ii = 0; ii < N; ++ii) {
+    v_dot2_f32_f16(qk,q[ii],k[ii]);
+  }
+  // Finalize the reduction across lanes.
+#pragma unroll
+  for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
+    qk += VLLM_SHFL_XOR_SYNC(qk, mask);
+  }
+  return qk;
+}
+
+// Q*K^T operation. //bf16
+template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<!std::is_same<scalar_t, uint16_t>::value, int> = 0>
+// template <int THREAD_GROUP_SIZE, typename Vec, int N>
+inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
+
  using A_vec = typename FloatVec<Vec>::Type;
-  // Compute the parallel products for Q*K^T (treat vector lanes separately).
  A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
-#pragma unroll
+  #pragma unroll
  for (int ii = 1; ii < N; ++ii) {
    qk_vec = fma(q[ii], k[ii], qk_vec);
  }
-
  // Finalize the reduction across lanes.
  float qk = sum(qk_vec);
 #pragma unroll
@@ -46,12 +133,17 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
  return qk;
 }

+
 template <typename T, int THREAD_GROUP_SIZE>
 struct Qk_dot {
  template <typename Vec, int N>
  static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
-    return qk_dot_<THREAD_GROUP_SIZE>(q, k);
+    return qk_dot_<THREAD_GROUP_SIZE,Vec,N,T>(q, k);
  }
+  // template <typename Vec, int N>
+  // static inline __device__ float qk_dot_vpack(const Vec (&q)[N], const Vec (&k)[N]) {
+  //   return qk_dot_vpack_<THREAD_GROUP_SIZE>(q, k);
+  // }
 };

 }  // namespace vllm
--- a/csrc/attention/static_switch.h
+++ b/csrc/attention/static_switch.h
+#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static bool CONST_NAME = true;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static bool CONST_NAME = false; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+#define OPT_SWITCH(COND, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static int opt = 1;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static int opt = 2; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+#define NUM_THREADS_SWITCH(NUM_THREAD, ...)    \
+  [&] {                                         \
+    if (NUM_THREAD == 256) {                   \
+      constexpr static int NUM_THREADS = 256;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static int NUM_THREADS = 128;  \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+  #define HEADSIZE_SWITCH(HEADDIM, ...)   \
+  [&] {                                    \
+    if (HEADDIM == 64) {                   \
+      constexpr static int HEAD_SIZE = 64;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 80) {            \
+      constexpr static int HEAD_SIZE = 80;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 96) {            \
+      constexpr static int HEAD_SIZE = 96;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 112) {           \
+      constexpr static int HEAD_SIZE = 112; \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 128) {           \
+      constexpr static int HEAD_SIZE = 128; \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 256) {           \
+      constexpr static int HEAD_SIZE = 256; \
+      return __VA_ARGS__();                \
+    }                                      \
+    else {                                 \
+      TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
+    }                                      \
+  }()
+
+#define REUSEKV_SWITCH(num_blocks , ...)      \
+[&] {                                                   \
+    if (num_heads % 2 == 0 && num_heads / num_kv_heads >= 4 && num_blocks >= 1200){      \
+        constexpr static int REUSE_KV_TIMES = 4;        \
+        return __VA_ARGS__();                           \
+    } else if (num_heads / num_kv_heads >= 2 && num_blocks >= 1200){\
+        constexpr static int REUSE_KV_TIMES = 2;        \
+        return __VA_ARGS__();                           \
+    } else {                                            \
+        constexpr static int REUSE_KV_TIMES = 1;        \
+        return __VA_ARGS__();                           \
+    }                                                   \
+}()
+
+#define REUSEKV_SWITCH_V1(num_blocks , ...)      \
+[&] {                                                   \
+    if (num_heads > num_kv_heads && num_blocks >= 1200){      \
+        constexpr static int REUSE_KV_TIMES = 2;        \
+        return __VA_ARGS__();                           \
+    }  else {                                           \
+        constexpr static int REUSE_KV_TIMES = 1;        \
+        return __VA_ARGS__();                           \
+    }                                                   \
+}()
+
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -349,4 +349,4 @@ void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
  } else {
    LAUNCH_FUSED_ADD_RMS_NORM(0);
  }
-}
+}
\ No newline at end of file
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe_align_block_size_kernels.cu
@@ -9,6 +9,8 @@

 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

+#define MAX_SHARED_MEM_SIZE 64 * 1024
+
 namespace vllm {

 namespace {
@@ -19,11 +21,12 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
 }
 }  // namespace

-template <typename scalar_t>
+template <typename scalar_t, bool experts_num_exceed_limit>
 __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                                            int32_t* sorted_token_ids,
                                            int32_t* expert_ids,
                                            int32_t* total_tokens_post_pad,
+                                            int32_t* global_tokens_cnts_ptr,
                                            int32_t num_experts,
                                            int32_t block_size, size_t numel) {
  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
@@ -31,11 +34,18 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,

  extern __shared__ int32_t shared_mem[];

-  int32_t* tokens_cnts =
-      shared_mem;  // 2d tensor with shape (num_experts + 1, num_experts)
-  int32_t* cumsum =
-      shared_mem + (num_experts + 1) *
-                       num_experts;  // 1d tensor with shape (num_experts + 1)
+  int32_t* tokens_cnts = nullptr;
+  int32_t* cumsum = nullptr;
+  if (experts_num_exceed_limit) {
+    // 2d tensor with shape (num_experts + 1, num_experts)
+    tokens_cnts = global_tokens_cnts_ptr;
+
+    // 1d tensor with shape (num_experts + 1)
+    cumsum = shared_mem;
+  } else {
+    tokens_cnts = shared_mem;  // 2d tensor with shape (num_experts + 1, num_experts)
+    cumsum = shared_mem + (num_experts + 1) * num_experts;  // 1d tensor with shape (num_experts + 1)
+  }

  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -115,20 +125,40 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-        // tensors
-        const int32_t shared_mem =
-            ((num_experts + 1) * num_experts + (num_experts + 1)) *
-            sizeof(int32_t);
-
-        // set dynamic shared mem
-        auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
-        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+        int32_t shared_mem_normal = ((num_experts + 1) * num_experts + (num_experts + 1)) *
+              sizeof(int32_t);
+
+        const bool experts_num_exceed_limit = shared_mem_normal > MAX_SHARED_MEM_SIZE;
+
+        // calc needed amount of shared mem for `cumsum`
+        const int32_t shared_mem = experts_num_exceed_limit ? (num_experts + 1) * sizeof(int32_t) : shared_mem_normal;
+
+        if (experts_num_exceed_limit) {
+          // set dynamic shared mem
+          auto kernel = vllm::moe_align_block_size_kernel<scalar_t, true>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+            (void*)kernel, shared_mem));
+
+          int32_t tokens_cnts[(num_experts + 1) * num_experts];
+          torch::Tensor key_cache_ptrs_tensor = torch::from_blob(tokens_cnts, {(num_experts + 1) * num_experts}, torch::kInt32)
+              .to(topk_ids.device());
+
+          kernel<<<1, num_experts, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), key_cache_ptrs_tensor.data_ptr<int32_t>(), num_experts,
+              block_size, topk_ids.numel());
+        } else {
+          // set dynamic shared mem
+          auto kernel = vllm::moe_align_block_size_kernel<scalar_t, false>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
            (void*)kernel, shared_mem));
-        kernel<<<1, num_experts, shared_mem, stream>>>(
-            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
-            experts_ids.data_ptr<int32_t>(),
-            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-            topk_ids.numel());
+          kernel<<<1, num_experts, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), nullptr, num_experts, block_size,
+              topk_ids.numel());
+        }
+
      });
 }
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -23,12 +23,39 @@ void paged_attention_v2(
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

+void paged_attention_v1_opt(
+    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step);
+
+void paged_attention_v2_opt(
+    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step);
+
 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
              double epsilon);

 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

+void rms_norm_opt(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
+              double epsilon);
+
+void fused_add_rms_norm_opt(torch::Tensor& input, torch::Tensor& residual,
+                        torch::Tensor& weight, double epsilon);
+
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);
@@ -38,6 +65,13 @@ void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                              torch::Tensor& cos_sin_cache, bool is_neox,
                              int64_t rot_dim,
                              torch::Tensor& cos_sin_cache_offsets);
+void rotary_embedding_tgi(
+  torch::Tensor& query,
+  torch::Tensor& key,
+  int64_t head_size,
+  torch::Tensor& cos_cache,
+  torch::Tensor& sin_cache,
+  bool is_neox);

 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

@@ -49,6 +83,14 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input);

 void gelu_fast(torch::Tensor& out, torch::Tensor& input);

+void silu_and_mul_opt(torch::Tensor& out, torch::Tensor& input);
+
+void gelu_and_mul_opt(torch::Tensor& out, torch::Tensor& input);
+
+void gelu_tanh_and_mul_opt(torch::Tensor& out, torch::Tensor& input);
+
+void trans_w16_gemm(torch::Tensor dst, torch::Tensor src, int64_t row, int64_t col);
+
 #ifndef USE_ROCM
 torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,

--- a/csrc/opt/activation_kernels_opt.cu
+++ b/csrc/opt/activation_kernels_opt.cu
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+
+#include <cmath>
+
+#include "cuda_compat.h"
+#include "../dispatch_utils.h"
+
+namespace vllm {
+
+// Activation and gating kernel template.
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+__global__ void act_and_mul_kernel(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const int d) {
+  const int64_t token_idx = blockIdx.x;
+  for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
+    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
+    out[token_idx * d + idx] = ACT_FN(x) * y;
+  }
+}
+
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), int VEC>
+__global__ void act_and_mul_kernel_opt1(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const int d) {
+  using VecType = at::native::memory::aligned_vector<scalar_t, VEC>;
+  const int64_t token_idx= blockIdx.x;
+  int idx = threadIdx.x * VEC;
+  if (idx < d) {
+    const int64_t x_index = token_idx * 2 * d + idx;
+    const int64_t y_index = token_idx * d + idx;
+    VecType* x1 = (VecType*)(input + x_index);
+    VecType* x2 = (VecType*)(input + x_index + d);
+    VecType* y = (VecType*)(out + y_index);
+    scalar_t r_x1[VEC];
+    scalar_t r_x2[VEC];
+    scalar_t r_y[VEC];
+    *(VecType*)r_x1 = *x1;
+    *(VecType*)r_x2 = *x2;
+#pragma unroll
+    for (int i = 0; i < VEC; i++) {
+      r_y[i] = ACT_FN(r_x1[i]) * r_x2[i];
+    }
+    *y = *(VecType*)r_y;
+  }
+}
+
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&), int VEC>
+__global__ void act_and_mul_kernel_opt2(
+    scalar_t* __restrict__ out,          // [..., d]
+    const scalar_t* __restrict__ input,  // [..., 2, d]
+    const int d) {
+  using VecType = at::native::memory::aligned_vector<scalar_t, VEC>;
+  const int64_t token_idx = blockIdx.x;
+  int idx = threadIdx.x * VEC;
+  for (; idx < d; idx += blockDim.x * VEC) {
+    const int64_t x_index = token_idx * 2 * d + idx;
+    const int64_t y_index = token_idx * d + idx;
+    VecType* x1 = (VecType*)(input + x_index);
+    VecType* x2 = (VecType*)(input + x_index + d);
+    VecType* y = (VecType*)(out + y_index);
+    scalar_t r_x1[VEC];
+    scalar_t r_x2[VEC];
+    scalar_t r_y[VEC];
+    *(VecType*)r_x1 = *x1;
+    *(VecType*)r_x2 = *x2;
+#pragma unroll
+    for (int i = 0; i < VEC; i++) {
+      r_y[i] = ACT_FN(r_x1[i]) * r_x2[i];
+    }
+    *y = *(VecType*)r_y;
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ T silu_kernel(const T& x) {
+  // x * sigmoid(x)
+  return (T)(((float)x) / (1.0f + expf((float)-x)));
+}
+
+template <typename T>
+__device__ __forceinline__ T gelu_kernel(const T& x) {
+  // Equivalent to PyTorch GELU with 'none' approximation.
+  // Refer to:
+  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
+  const float f = (float)x;
+  constexpr float ALPHA = M_SQRT1_2;
+  return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+}
+
+template <typename T>
+__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
+  // Equivalent to PyTorch GELU with 'tanh' approximation.
+  // Refer to:
+  // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
+  const float f = (float)x;
+  constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
+  constexpr float KAPPA = 0.044715;
+  float x_cube = f * f * f;
+  float inner = BETA * (f + KAPPA * x_cube);
+  return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
+}
+
+}  // namespace vllm
+
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL)                                  \
+  int d = input.size(-1) / 2;                                                  \
+  int64_t num_tokens = input.numel() / input.size(-1);                         \
+  dim3 grid(num_tokens);                                                       \
+  dim3 block(std::min(d, 1024));                                               \
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
+  VLLM_DISPATCH_FLOATING_TYPES(                                                \
+      input.scalar_type(), "act_and_mul_kernel", [&] {                         \
+        if (0 == d % 8 && d <= 16384) {                                        \
+          if (d <= 512) {                                                      \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 2> \
+                <<<grid, 256, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else if (d <= 1024) {                                              \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 8> \
+                <<<grid, 128, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else if (d <= 2048) {                                              \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 8> \
+                <<<grid, 256, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else if (d <= 4096) {                                              \
+            vllm::act_and_mul_kernel_opt1<scalar_t, KERNEL<scalar_t>, 8> \
+                <<<grid, 512, 0, stream>>>(out.data_ptr<scalar_t>(),           \
+                                           input.data_ptr<scalar_t>(), d);     \
+          } else {                                                             \
+            vllm::act_and_mul_kernel_opt2<scalar_t, KERNEL<scalar_t>, 8> \
+                <<<grid, 1024, 0, stream>>>(out.data_ptr<scalar_t>(),          \
+                                            input.data_ptr<scalar_t>(), d);    \
+          }                                                                    \
+        } else {                                                               \
+              vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>>             \
+                  <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(),       \
+                                              input.data_ptr<scalar_t>(), d);  \
+        }                                                                      \
+      });
+
+void silu_and_mul_opt(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
+}
+
+void gelu_and_mul_opt(torch::Tensor& out,    // [..., d]
+                  torch::Tensor& input)  // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
+}
+
+void gelu_tanh_and_mul_opt(torch::Tensor& out,    // [..., d]
+                       torch::Tensor& input)  // [..., 2 * d]
+{
+  LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
+}
+
--- a/csrc/opt/layernorm_kernels_opt.cu
+++ b/csrc/opt/layernorm_kernels_opt.cu
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/native/cuda/MemoryAccess.cuh>
+#include <c10/cuda/CUDAMathCompat.h>
+#include <ATen/AccumulateType.h>
+#include <THC/THCDeviceUtils.cuh>
+#include "../dispatch_utils.h"
+#include "../reduction_utils.cuh"
+#ifndef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+#else
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+
+using __nv_bfloat16 = __hip_bfloat16;
+using __nv_bfloat162 = __hip_bfloat162;
+#endif
+
+namespace vllm {
+
+// TODO(woosuk): Further optimize this kernel.
+template <typename scalar_t>
+__global__ void rms_norm_kernel(
+    scalar_t* __restrict__ out,           // [..., hidden_size]
+    const scalar_t* __restrict__ input,   // [..., hidden_size]
+    const scalar_t* __restrict__ weight,  // [hidden_size]
+    const float epsilon, const int num_tokens, const int hidden_size) {
+  __shared__ float s_variance;
+  float variance = 0.0f;
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    const float x = (float)input[blockIdx.x * hidden_size + idx];
+    variance += x * x;
+  }
+  variance = blockReduceSum<float>(variance);
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(variance / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)input[blockIdx.x * hidden_size + idx];
+    out[blockIdx.x * hidden_size + idx] =
+        ((scalar_t)(x * s_variance)) * weight[idx];
+  }
+}
+
+/* Converter structs for the conversion from torch types to HIP/CUDA types,
+   and the associated type conversions within HIP/CUDA. These helpers need
+   to be implemented for now because the relevant type conversion
+   operators/constructors are not consistently implemented by HIP/CUDA, so
+   a generic conversion via type casts cannot be implemented.
+
+   Each struct should have the member static constexpr bool `exists`:
+   If false, the optimized kernel is not used for the corresponding torch type.
+   If true, the struct should be fully defined as shown in the examples below.
+ */
+template <typename torch_type>
+struct _typeConvert {
+  static constexpr bool exists = false;
+};
+
+#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
+// CUDA < 12.0 runs into issues with packed type conversion
+template <>
+struct _typeConvert<c10::Half> {
+  static constexpr bool exists = true;
+  using hip_type = __half;
+  using packed_hip_type = __half2;
+
+  __device__ static inline float convert(hip_type x) { return __half2float(x); }
+  __device__ static inline float2 convert(packed_hip_type x) {
+    return __half22float2(x);
+  }
+  __device__ static inline hip_type convert(float x) {
+    return __float2half_rn(x);
+  }
+  __device__ static inline packed_hip_type convert(float2 x) {
+    return __float22half2_rn(x);
+  }
+};
+
+  #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+// CUDA_ARCH < 800 does not have BF16 support
+// TODO: Add in ROCm support once public headers handle bf16 maturely
+template <>
+struct _typeConvert<c10::BFloat16> {
+  static constexpr bool exists = true;
+  using hip_type = __nv_bfloat16;
+  using packed_hip_type = __nv_bfloat162;
+
+  __device__ static inline float convert(hip_type x) {
+    return __bfloat162float(x);
+  }
+  __device__ static inline float2 convert(packed_hip_type x) {
+    return __bfloat1622float2(x);
+  }
+  __device__ static inline hip_type convert(float x) {
+    return __float2bfloat16(x);
+  }
+  __device__ static inline packed_hip_type convert(float2 x) {
+    return __float22bfloat162_rn(x);
+  }
+};
+  #endif  // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#endif    // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
+          // 12000))
+
+/* Vector POD struct to generate vectorized and packed FP16/BF16 ops
+   for appropriate specializations of fused_add_rms_norm_kernel.
+   Only functions that are necessary in that kernel are implemented.
+   Alignment to 16 bytes is required to use 128-bit global memory ops.
+ */
+template <typename scalar_t, int width>
+struct alignas(16) _f16Vec {
+  /* Not theoretically necessary that width is a power of 2 but should
+     almost always be the case for optimization purposes */
+  static_assert(width > 0 && (width & (width - 1)) == 0,
+                "Width is not a positive power of 2!");
+  using Converter = _typeConvert<scalar_t>;
+  using T1 = typename Converter::hip_type;
+  using T2 = typename Converter::packed_hip_type;
+  T1 data[width];
+
+  __device__ _f16Vec& operator+=(const _f16Vec<scalar_t, width>& other) {
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        T2 temp{data[i], data[i + 1]};
+        temp += T2{other.data[i], other.data[i + 1]};
+        data[i] = temp.x;
+        data[i + 1] = temp.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) data[i] += other.data[i];
+    }
+    return *this;
+  }
+
+  __device__ _f16Vec& operator*=(const _f16Vec<scalar_t, width>& other) {
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        T2 temp{data[i], data[i + 1]};
+        temp *= T2{other.data[i], other.data[i + 1]};
+        data[i] = temp.x;
+        data[i + 1] = temp.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) data[i] *= other.data[i];
+    }
+    return *this;
+  }
+
+  __device__ _f16Vec& operator*=(const float scale) {
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        float2 temp_f = Converter::convert(T2{data[i], data[i + 1]});
+        temp_f.x *= scale;
+        temp_f.y *= scale;
+        T2 temp = Converter::convert(temp_f);
+        data[i] = temp.x;
+        data[i + 1] = temp.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) {
+        float temp = Converter::convert(data[i]) * scale;
+        data[i] = Converter::convert(temp);
+      }
+    }
+    return *this;
+  }
+
+  __device__ float sum_squares() const {
+    float result = 0.0f;
+    if constexpr (width % 2 == 0) {
+#pragma unroll
+      for (int i = 0; i < width; i += 2) {
+        float2 z = Converter::convert(T2{data[i], data[i + 1]});
+        result += z.x * z.x + z.y * z.y;
+      }
+    } else {
+#pragma unroll
+      for (int i = 0; i < width; ++i) {
+        float x = Converter::convert(data[i]);
+        result += x * x;
+      }
+    }
+    return result;
+  }
+};
+
+/* Function specialization in the case of FP16/BF16 tensors.
+   Additional optimizations we can make in this case are
+   packed and vectorized operations, which help with the
+   memory latency bottleneck. */
+template <typename scalar_t, int width>
+__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
+fused_add_rms_norm_kernel(
+    scalar_t* __restrict__ input,         // [..., hidden_size]
+    scalar_t* __restrict__ residual,      // [..., hidden_size]
+    const scalar_t* __restrict__ weight,  // [hidden_size]
+    const float epsilon, const int num_tokens, const int hidden_size) {
+  // Sanity checks on our vector struct and type-punned pointer arithmetic
+  static_assert(std::is_pod_v<_f16Vec<scalar_t, width>>);
+  static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
+
+  const int vec_hidden_size = hidden_size / width;
+  __shared__ float s_variance;
+  float variance = 0.0f;
+  /* These and the argument pointers are all declared `restrict` as they are
+     not aliased in practice. Argument pointers should not be dereferenced
+     in this kernel as that would be undefined behavior */
+  auto* __restrict__ input_v =
+      reinterpret_cast<_f16Vec<scalar_t, width>*>(input);
+  auto* __restrict__ residual_v =
+      reinterpret_cast<_f16Vec<scalar_t, width>*>(residual);
+  auto* __restrict__ weight_v =
+      reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16Vec<scalar_t, width> temp = input_v[id];
+    temp += residual_v[id];
+    variance += temp.sum_squares();
+    residual_v[id] = temp;
+  }
+  /* Keep the following if-else block in sync with the
+     calculation of max_block_size in fused_add_rms_norm */
+  if (num_tokens < 256) {
+    variance = blockReduceSum<float, 1024>(variance);
+  } else
+    variance = blockReduceSum<float, 256>(variance);
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(variance / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
+    int id = blockIdx.x * vec_hidden_size + idx;
+    _f16Vec<scalar_t, width> temp = residual_v[id];
+    temp *= s_variance;
+    temp *= weight_v[idx];
+    input_v[id] = temp;
+  }
+}
+
+/* Generic fused_add_rms_norm_kernel
+   The width field is not used here but necessary for other specializations.
+ */
+template <typename scalar_t, int width>
+__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
+fused_add_rms_norm_kernel(
+    scalar_t* __restrict__ input,         // [..., hidden_size]
+    scalar_t* __restrict__ residual,      // [..., hidden_size]
+    const scalar_t* __restrict__ weight,  // [hidden_size]
+    const float epsilon, const int num_tokens, const int hidden_size) {
+  __shared__ float s_variance;
+  float variance = 0.0f;
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    scalar_t z = input[blockIdx.x * hidden_size + idx];
+    z += residual[blockIdx.x * hidden_size + idx];
+    float x = (float)z;
+    variance += x * x;
+    residual[blockIdx.x * hidden_size + idx] = z;
+  }
+  /* Keep the following if-else block in sync with the
+     calculation of max_block_size in fused_add_rms_norm */
+  if (num_tokens < 256) {
+    variance = blockReduceSum<float, 1024>(variance);
+  } else
+    variance = blockReduceSum<float, 256>(variance);
+  if (threadIdx.x == 0) {
+    s_variance = rsqrtf(variance / hidden_size + epsilon);
+  }
+  __syncthreads();
+
+  for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
+    float x = (float)residual[blockIdx.x * hidden_size + idx];
+    input[blockIdx.x * hidden_size + idx] =
+        ((scalar_t)(x * s_variance)) * weight[idx];
+  }
+}
+
+}  // namespace vllm
+
+template <typename T,int reducesize=C10_WARP_SIZE>
+__inline__ __device__ T WarpReduceSum_NEW(T val) {
+#pragma unroll
+  for (int offset = reducesize/2; offset > 0; offset >>= 1) {
+    val += WARP_SHFL_DOWN(val, offset);
+  }
+  return val;
+}
+
+template <typename T,int block_size=512>
+__inline__ __device__ T BlockReduceSum_NEW(T val, T* shared) {
+  constexpr int share_size=block_size/C10_WARP_SIZE;
+  val = WarpReduceSum_NEW<T>(val);
+  if constexpr(block_size==C10_WARP_SIZE)
+  {
+    return val;
+  }
+  else{
+    const int lid = threadIdx.x % C10_WARP_SIZE;
+    const int wid = threadIdx.x / C10_WARP_SIZE;
+    if (lid == 0&&wid<share_size) {
+      shared[wid] = val;
+    }
+    __syncthreads();
+    if (wid == 0&&lid<share_size) {
+      val = WarpReduceSum_NEW<T,share_size>(shared[lid]);
+    }
+    return val;
+  }
+}
+
+template <typename scalar_t,typename T_ACC,int Vec=4,int block_size=512>
+__global__ void fused_add_rms_kernel_opt(scalar_t* input,scalar_t* residual,scalar_t* gamma,int cols,T_ACC eps)
+{
+  constexpr int share_size=block_size/C10_WARP_SIZE;
+  __shared__ T_ACC val_shared[share_size];
+  __shared__ T_ACC s_rstd;
+  T_ACC val=0;
+  int i=blockIdx.x;
+  int j=threadIdx.x;
+  int tcol=cols/Vec;
+  using LoadT = at::native::memory::aligned_vector<scalar_t, Vec>;
+  scalar_t intput_vec[Vec];
+  scalar_t residual_vec[Vec];
+  T_ACC trstd;
+  int idx = i * tcol + j;
+  idx*=Vec;
+  if (j < tcol) {
+    *(LoadT*)intput_vec = *(LoadT*)(input+idx);
+    *(LoadT*)residual_vec = *(LoadT*)(residual+idx);
+    #pragma unroll
+    for (int ii = 0; ii < Vec; ii++) {
+      residual_vec[ii]+=intput_vec[ii];
+      val += static_cast<T_ACC>(residual_vec[ii])*static_cast<T_ACC>(residual_vec[ii]);
+    }
+  }
+  val = BlockReduceSum_NEW<T_ACC,block_size>(val,val_shared);
+  if (j == 0) s_rstd=c10::cuda::compat::rsqrt(val/cols + eps);
+  __syncthreads();
+  trstd=s_rstd;
+  if (j < tcol) {
+    #pragma unroll
+    for(int ii=0;ii<Vec;ii++){
+      int jj=j*Vec+ii;
+      intput_vec[ii] = static_cast<T_ACC>(residual_vec[ii]) *trstd* static_cast<T_ACC>(gamma[jj]);
+    }
+    *(LoadT*)(residual+idx)=*(LoadT*)residual_vec;
+    *(LoadT*)(input+idx)=*(LoadT*)intput_vec;
+  }
+}
+
+template <typename scalar_t,typename T_ACC,int Vec=4,int block_size=512>
+__global__ void fused_rms_kernel_opt(scalar_t* input,scalar_t* output,scalar_t* gamma,int cols,T_ACC eps)
+{
+  constexpr int share_size=block_size/C10_WARP_SIZE;
+  __shared__ T_ACC val_shared[share_size];
+  __shared__ T_ACC s_rstd;
+  T_ACC val=0;
+  int i=blockIdx.x;
+  int j=threadIdx.x;
+  int tcol=cols/Vec;
+  using LoadT = at::native::memory::aligned_vector<scalar_t, Vec>;
+  scalar_t intput_vec[Vec];
+  T_ACC trstd;
+  int idx = i * tcol + j;
+  idx*=Vec;
+  if (j < tcol) {
+    *(LoadT*)intput_vec = *(LoadT*)(input+idx);
+    #pragma unroll
+    for (int ii = 0; ii < Vec; ii++) {
+      val += static_cast<T_ACC>(intput_vec[ii])*static_cast<T_ACC>(intput_vec[ii]);
+    }
+  }
+  val = BlockReduceSum_NEW<T_ACC,block_size>(val,val_shared);
+  if (j == 0) s_rstd=c10::cuda::compat::rsqrt(val/cols + eps);
+  __syncthreads();
+  trstd=s_rstd;
+  if (j < tcol) {
+    #pragma unroll
+    for(int ii=0;ii<Vec;ii++){
+      int jj=j*Vec+ii;
+      intput_vec[ii] = static_cast<T_ACC>(intput_vec[ii]) *trstd* static_cast<T_ACC>(gamma[jj]);
+    }
+    *(LoadT*)(output+idx)=*(LoadT*)intput_vec;
+  }
+}
+
+void rms_norm_opt(torch::Tensor& out,     // [..., hidden_size]
+              torch::Tensor& input,   // [..., hidden_size]
+              torch::Tensor& weight,  // [hidden_size]
+              double epsilon) {
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
+  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
+  bool ptrs_are_aligned =inp_ptr % 16 == 0  && wt_ptr % 16 == 0;
+  if(hidden_size%16==0&&hidden_size<=16384&&ptrs_are_aligned){
+  AT_DISPATCH_FLOATING_TYPES_AND2(
+    at::ScalarType::Half,
+    at::ScalarType::BFloat16,
+    input.scalar_type(),
+    "fused_add_rms_norm_kernel",
+    [&] {
+      using T_ACC = at::acc_type<scalar_t, true>;
+      T_ACC eps = epsilon;
+      scalar_t* self_data = input.data_ptr<scalar_t>();
+      scalar_t* out_data =out.data_ptr<scalar_t>();
+      scalar_t* weight_data=weight.data_ptr<scalar_t>();
+      if (hidden_size<=1024){
+          fused_rms_kernel_opt<scalar_t,T_ACC,8,128><<<num_tokens,  128, 0, stream>>>(self_data,out_data,weight_data,hidden_size,eps);
+      }
+      else if(hidden_size<=2048){
+          fused_rms_kernel_opt<scalar_t,T_ACC,8,256><<<num_tokens,  256, 0, stream>>>(self_data,out_data,weight_data,hidden_size,eps);
+      }
+      else if(hidden_size<=4096){
+          if(num_tokens>1200){
+            fused_rms_kernel_opt<scalar_t,T_ACC,8,512><<<num_tokens,  512, 0, stream>>>(self_data,out_data,weight_data,hidden_size,eps);
+          }
+          else{
+            fused_rms_kernel_opt<scalar_t,T_ACC,4,1024><<<num_tokens,  1024, 0, stream>>>(self_data,out_data,weight_data,hidden_size,eps);
+          }
+      }
+      else if(hidden_size<=8192){
+           fused_rms_kernel_opt<scalar_t,T_ACC,8,1024><<<num_tokens,  1024, 0, stream>>>(self_data,out_data,weight_data,hidden_size,eps);
+      }
+      else{
+          fused_rms_kernel_opt<scalar_t,T_ACC,16,1024><<<num_tokens,  1024, 0, stream>>>(self_data,out_data,weight_data,hidden_size,eps);
+      } 
+    });
+  }
+  else{
+    dim3 grid(num_tokens);
+    dim3 block(std::min(hidden_size, 1024));
+  
+    VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
+      vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
+          out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+          weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
+    });
+  }
+}
+
+#define LAUNCH_FUSED_ADD_RMS_NORM(width)                                       \
+  VLLM_DISPATCH_FLOATING_TYPES(                                                \
+      input.scalar_type(), "fused_add_rms_norm_kernel", [&] {                  \
+        vllm::fused_add_rms_norm_kernel<scalar_t, width>                       \
+            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),           \
+                                         residual.data_ptr<scalar_t>(),        \
+                                         weight.data_ptr<scalar_t>(), epsilon, \
+                                         num_tokens, hidden_size);             \
+      });
+
+
+
+void fused_add_rms_norm_opt(torch::Tensor& input,     // [..., hidden_size]
+                        torch::Tensor& residual,  // [..., hidden_size]
+                        torch::Tensor& weight,    // [hidden_size]
+                        double epsilon) {
+  int hidden_size = input.size(-1);
+  int num_tokens = input.numel() / hidden_size;
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
+  auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
+  auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
+  bool ptrs_are_aligned =inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
+  if(hidden_size%16==0&&hidden_size>=2048&&hidden_size<=8192&&ptrs_are_aligned){
+    AT_DISPATCH_FLOATING_TYPES_AND2(
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      input.scalar_type(),
+      "fused_add_rms_norm_kernel",
+      [&] {
+        using T_ACC = at::acc_type<scalar_t, true>;
+        T_ACC eps = epsilon;
+        scalar_t* self_data = input.data_ptr<scalar_t>();
+        scalar_t* other_data =residual.data_ptr<scalar_t>();
+        scalar_t* weight_data=weight.data_ptr<scalar_t>();
+        if (hidden_size<=1024){
+            fused_add_rms_kernel_opt<scalar_t,T_ACC,8,128><<<num_tokens,  128, 0, stream>>>(self_data,other_data,weight_data,hidden_size,eps);
+        }
+        else if(hidden_size<=2048){
+            fused_add_rms_kernel_opt<scalar_t,T_ACC,8,256><<<num_tokens,  256, 0, stream>>>(self_data,other_data,weight_data,hidden_size,eps);
+        }
+        else if(hidden_size<=4096){
+            if(num_tokens>1200){
+              fused_add_rms_kernel_opt<scalar_t,T_ACC,8,512><<<num_tokens,  512, 0, stream>>>(self_data,other_data,weight_data,hidden_size,eps);
+            }
+            else{
+              fused_add_rms_kernel_opt<scalar_t,T_ACC,4,1024><<<num_tokens,  1024, 0, stream>>>(self_data,other_data,weight_data,hidden_size,eps);
+            }
+        }
+        else if(hidden_size<=8192){
+            fused_add_rms_kernel_opt<scalar_t,T_ACC,8,1024><<<num_tokens,  1024, 0, stream>>>(self_data,other_data,weight_data,hidden_size,eps);
+        }
+        else{
+            fused_add_rms_kernel_opt<scalar_t,T_ACC,16,1024><<<num_tokens,  1024, 0, stream>>>(self_data,other_data,weight_data,hidden_size,eps);
+        } 
+      });
+  }
+  else{
+    dim3 grid(num_tokens);
+    /* This kernel is memory-latency bound in many scenarios.
+      When num_tokens is large, a smaller block size allows
+      for increased block occupancy on CUs and better latency
+      hiding on global mem ops. */
+    const int max_block_size = (num_tokens < 256) ? 1024 : 256;
+    dim3 block(std::min(hidden_size, max_block_size));
+    /*If the tensor types are FP16/BF16, try to use the optimized kernel
+      with packed + vectorized ops.
+      Max optimization is achieved with a width-8 vector of FP16/BF16s
+      since we can load at most 128 bits at once in a global memory op.
+      However, this requires each tensor's data to be aligned to 16
+      bytes.
+    */
+
+    if (ptrs_are_aligned && hidden_size % 8 == 0) {
+      LAUNCH_FUSED_ADD_RMS_NORM(8);
+    } else {
+      LAUNCH_FUSED_ADD_RMS_NORM(0);
+    }
+  }
+}
--- a/csrc/opt/transpose_kernels.cu
+++ b/csrc/opt/transpose_kernels.cu
+#include <torch/all.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+
+namespace vllm {
+template <typename T>
+__global__ void trans_w16_gemm_cudakernel(int64_t num_kernels,T* dst,const T* src,int64_t row,int64_t col)
+{
+    int64_t id = blockIdx.x * blockDim.x + threadIdx.x;
+    if(id >= num_kernels) return;
+
+    int64_t j=id%row; 
+    int64_t i=id/row;
+
+    dst[i*row+j]=src[j*col+i];
+}
+
+
+void trans_w16_gemm_cuda(half* dst,const half* src,int64_t row,int64_t col){
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  int64_t num_kernels=row*col;
+  int block_size=256;
+  trans_w16_gemm_cudakernel<<<(num_kernels+block_size-1)/block_size,block_size, 0, stream>>>(num_kernels,dst,src,row,col);
+}
+}   // namespace vllm
+
+void trans_w16_gemm(torch::Tensor dst,torch::Tensor src,int64_t row,int64_t col){
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(src));
+  vllm::trans_w16_gemm_cuda(
+              (half*)dst.data_ptr(),
+              (const half*)src.data_ptr(),
+              row,
+              col
+            );
+}
\ No newline at end of file
--- a/csrc/pos_encoding_tgi_kernels.cu
+++ b/csrc/pos_encoding_tgi_kernels.cu
+#include <torch/all.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+
+namespace vllm {
+
+template<typename scalar_t, bool IS_NEOX>
+inline __device__ void apply_token_rotary_embedding_tgi(
+  scalar_t* __restrict__ arr,
+  const float* __restrict__ cos_ptr,
+  const float* __restrict__ sin_ptr,
+  int rot_offset,
+  int embed_dim)
+{
+  int x_index, y_index;
+  float cos, sin;
+  if (IS_NEOX) {
+    // GPT-NeoX style rotary embedding.
+    x_index = rot_offset;
+    y_index = embed_dim + rot_offset;
+    cos = VLLM_LDG(cos_ptr + x_index);
+    sin = VLLM_LDG(sin_ptr + x_index);
+  } else {
+    // GPT-J style rotary embedding.
+    x_index = 2 * rot_offset;
+    y_index = 2 * rot_offset + 1;
+    cos = VLLM_LDG(cos_ptr + x_index / 2);
+    sin = VLLM_LDG(sin_ptr + x_index / 2);
+  }
+
+  const scalar_t x = arr[x_index];
+  const scalar_t y = arr[y_index];
+  arr[x_index] = x * cos - y * sin;
+  arr[y_index] = y * cos + x * sin;
+}
+
+template<typename scalar_t, bool IS_NEOX>
+inline __device__ void apply_rotary_embedding_tgi(
+  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
+  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
+  const float* __restrict__ cos_ptr,   // [max_position, 1, rot_dim]
+  const float* __restrict__ sin_ptr,   // [max_position, 1, rot_dim]
+  const int head_size,
+  const int num_heads,
+  const int num_kv_heads,
+  const int rot_dim,
+  const int token_idx,
+  const int64_t query_stride,
+  const int64_t key_stride)
+{
+  const int nq = num_heads * rot_dim;
+  for (int i = threadIdx.x; i < nq; i += blockDim.x) {
+    const int head_idx = i / rot_dim;
+    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
+    const int rot_offset = i % rot_dim;
+    apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(query + token_head, cos_ptr,
+                                              sin_ptr, rot_offset, rot_dim);
+  }
+
+  const int nk = num_kv_heads * rot_dim;
+  for (int i = threadIdx.x; i < nk; i += blockDim.x) {
+    const int head_idx = i / rot_dim;
+    const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+    const int rot_offset = i % rot_dim;
+    apply_token_rotary_embedding_tgi<scalar_t, IS_NEOX>(key + token_head, cos_ptr,
+                                              sin_ptr, rot_offset, rot_dim);
+  }
+}
+
+template<typename scalar_t, bool IS_NEOX>
+__global__ void rotary_embedding_tgi_kernel(
+  scalar_t* __restrict__ query,                 // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size]
+  scalar_t* __restrict__ key,                   // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size]
+  const float* __restrict__ cos_cache,   // [max_position, 1, rot_dim]
+  const float* __restrict__ sin_cache,   // [max_position, 1, rot_dim]
+  const int rot_dim,
+  const int64_t query_stride,
+  const int64_t key_stride,
+  const int num_heads,
+  const int num_kv_heads,
+  const int head_size) {
+  // Each thread block is responsible for one token.
+  const int token_idx = blockIdx.x;
+
+  const float* cos_ptr = cos_cache + token_idx * rot_dim;
+  const float* sin_ptr = sin_cache + token_idx * rot_dim;
+
+  apply_rotary_embedding_tgi<scalar_t, IS_NEOX>(query, key, cos_ptr, sin_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride);
+}
+
+} // namespace vllm
+
+void rotary_embedding_tgi(
+  torch::Tensor& query,             // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size]
+  torch::Tensor& key,               // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size]
+  int64_t head_size,
+  torch::Tensor& cos_cache,
+  torch::Tensor& sin_cache,
+  bool is_neox) {
+  int num_tokens = query.size(0);
+  int rot_dim = cos_cache.size(2);
+  int num_heads = query.size(1);
+  int num_kv_heads = key.size(1);
+  int query_stride = query.stride(0);
+  int key_stride = key.stride(0);
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(num_heads * rot_dim / 2, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+    query.scalar_type(),
+    "rotary_embedding_tgi",
+    [&] {
+      if (is_neox) {
+        vllm::rotary_embedding_tgi_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
+          query.data_ptr<scalar_t>(),
+          key.data_ptr<scalar_t>(),
+          cos_cache.data_ptr<float>(),
+          sin_cache.data_ptr<float>(),
+          rot_dim,
+          query_stride,
+          key_stride,
+          num_heads,
+          num_kv_heads,
+          head_size);
+      } else {
+        vllm::rotary_embedding_tgi_kernel<scalar_t, false><<<grid, block, 0, stream>>>(
+          query.data_ptr<scalar_t>(),
+          key.data_ptr<scalar_t>(),
+          cos_cache.data_ptr<float>(),
+          sin_cache.data_ptr<float>(),
+          rot_dim,
+          query_stride,
+          key_stride,
+          num_heads,
+          num_kv_heads,
+          head_size);
+      }
+    });
+}
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -1542,6 +1542,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
  }
 }

+
 __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
                                    const int size_k, const int size_n) {
  int n = blockIdx.x * THREADS_X + threadIdx.x;
@@ -1847,6 +1848,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
  return c;
 }

+
 void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
  vllm::gptq::shuffle_exllama_weight(

--- a/csrc/quantization/gptq/setup.py
+++ b/csrc/quantization/gptq/setup.py
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+import torch
+
+# Compiler flags.
+CXX_FLAGS = ["-g", "-O3", "-std=c++17"]
+NVCC_FLAGS = ["-O3", "-std=c++17","-DUSE_ROCM","-U__HIP_NO_HALF_CONVERSIONS__","-U__HIP_NO_HALF_OPERATORS__"]
+#--gpu-max-threads-per-block=1024编译会导致GPTQ多batch性能下降。
+# NVCC_FLAGS = ["-O3", "-std=c++17","-DUSE_ROCM","--gpu-max-threads-per-block=1024","-U__HIP_NO_HALF_CONVERSIONS__","-U__HIP_NO_HALF_OPERATORS__"]
+
+
+ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
+CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
+NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
+
+extra_compile_args={
+    "cxx": CXX_FLAGS,
+    "nvcc": NVCC_FLAGS,
+}
+
+setup(
+    name="gptq_kernels",
+    ext_modules=[
+        CUDAExtension(
+            name="gptq_kernels",
+            sources=[
+                "csrc/quantization/gptq/torch_bindings.cpp",
+                "csrc/quantization/gptq/q_gemm.cu",
+            ],
+            extra_compile_args=extra_compile_args,
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
--- a/csrc/quantization/gptq/torch_bindings.cpp
+++ b/csrc/quantization/gptq/torch_bindings.cpp
+#include <torch/extension.h>
+
+torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
+                        torch::Tensor b_gptq_qzeros,
+                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
+                        bool use_exllama, int64_t bit);
+
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
+
+// Bindings
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("gptq_gemm", &gptq_gemm, "make_q_matrix");
+    m.def("gptq_shuffle", &gptq_shuffle, "gemm_half_q_half");
+}