Merge branch 'v0.5.0-dtk24.04.1' into v0.5.3.post1-dtk24.04.1

39ae4102 · zhuwenwen · 75011627 · 880b2e41 · 39ae4102 · 39ae4102
Commit 39ae4102 authored Aug 12, 2024 by zhuwenwen
20 changed files
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 vLLM是一个快速且易于使用的LLM推理和服务库，使用PageAttention高效管理kv内存，Continuous batching传入请求，支持很多Hugging Face模型，如LLaMA & LLaMA-2、Qwen、Chatglm2 & Chatglm3等。

 ## 暂不支持的官方功能
- **量化推理**：目前支持fp16的推理和gptq推理，awq-int4和mralin的权重量化、kv-cache fp8推理方案暂不支持
+- **量化推理**：目前支持fp16的推理和gptq、awq-int4推理，marlin的权重量化、kv-cache fp8推理方案暂不支持
 - **模块支持**：目前不支持Sliding window attention、 moe kernel模块


@@ -62,14 +62,11 @@ pip install -r requirements-rocm.txt
 ```
 1. 编译whl包并安装
 VLLM_INSTALL_PUNICA_KERNELS=1 python setup.py bdist_wheel 
-python csrc/quantization/gptq/setup.py bdist_wheel
 cd dist
 pip install vllm*
-pip install gptq_kernel

 2. 源码编译安装
 VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install 
-python csrc/quantization/gptq/setup.py install 
 ```

 #### 运行基础环境准备
@@ -79,7 +76,7 @@ python csrc/quantization/gptq/setup.py install
 - triton:[https://cancon.hpccube.com:65024/4/main/triton](https://cancon.hpccube.com:65024/4/main/triton/)
 - xformers:[https://cancon.hpccube.com:65024/4/main/xformers](https://cancon.hpccube.com:65024/4/main/xformers)
 - flash_attn: [https://cancon.hpccube.com:65024/4/main/flash_attn](https://cancon.hpccube.com:65024/4/main/flash_attn)
-
+- lmslim: [https://cancon.hpccube.com:65024/4/main/lmslim](https://cancon.hpccube.com:65024/4/main/lmslim)

 #### 注意事项
 + 若使用 pip install 下载安装过慢，可添加源：-i https://pypi.tuna.tsinghua.edu.cn/simple/

--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -64,6 +64,7 @@ def sample_requests(


 def run_vllm(
+    warmup_requests: List[Tuple[str, int, int]],
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
@@ -126,21 +127,39 @@ def run_vllm(
            ))

    # warmup
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.num_prompts,
-                                                     args.input_len))
-    dummy_inputs: List[PromptInputs] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
-    
-    def run_to_completion():
-        llm.generate(dummy_inputs,
-                        sampling_params=sampling_params,
-                        use_tqdm=False)
+    warmup_prompts = []
+    warmup_sampling_params = []
+    for prompt, _, output_len in warmup_requests:
+        warmup_prompts.append(prompt)
+        warmup_sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
        
    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
-        run_to_completion()
+        llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
+    
+    # dummy_prompt_token_ids = np.random.randint(10000,
+    #                                            size=(args.num_prompts,
+    #                                                  args.input_len))
+    # dummy_inputs: List[PromptInputs] = [{
+    #     "prompt_token_ids": batch
+    # } for batch in dummy_prompt_token_ids.tolist()]
+
+    # def run_to_completion():
+    #     llm.generate(dummy_inputs,
+    #                     sampling_params=sampling_params,
+    #                     use_tqdm=False)
+
+    # print("Warming up...")
+    # for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+    #     run_to_completion()
    
    start = time.perf_counter()
    llm.generate(prompts, sampling_params, use_tqdm=True)
@@ -235,6 +254,10 @@ def main(args: argparse.Namespace):
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    if args.dataset is None:
        # Synthesize a prompt with the given input length.
+        warmup_prompt = "hi" * 10
+        warmup_requests = [(warmup_prompt, 10, 10)
+                    for _ in range(1)]
+        
        prompt = "hi" * (args.input_len - 1)
        requests = [(prompt, args.input_len, args.output_len)
                    for _ in range(args.num_prompts)]
@@ -244,7 +267,7 @@ def main(args: argparse.Namespace):

    if args.backend == "vllm":
        elapsed_time = run_vllm(
-            requests, args.model, args.tokenizer, args.quantization,
+            warmup_requests, requests, args.model, args.tokenizer, args.quantization,
            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
            args.trust_remote_code, args.dtype, args.max_model_len,
            args.enforce_eager, args.kv_cache_dtype,

--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -160,3 +160,4 @@ void gelu_quick(torch::Tensor& out,    // [..., d]
 {
  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel);
 }
+
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -71,38 +71,8 @@ template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
          bool IS_BLOCK_SPARSE, 
          int REUSE_KV_TIMES = 1, 
          bool odd_nheads = false,
-          int PARTITION_SIZE = 0,std::enable_if_t<!std::is_same<scalar_t, uint16_t>::value, int> = 0>  // Zero means no partitioning.
-__device__ void paged_attention_kernel(
-    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
-    float* __restrict__ max_logits,  // [num_seqs, num_heads,
-                                     // max_num_partitions]
-    scalar_t* __restrict__ out,  // [num_seqs, num_heads, max_num_partitions,
-                                 // head_size]
-    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
-    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
-                                          // head_size/x, block_size, x]
-    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
-                                          // head_size, block_size]
-    const int num_heads,                   // [num_heads]
-    const int num_kv_heads,               // [num_kv_heads]
-    const float scale,
-    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
-    const int* __restrict__ seq_lens,      // [num_seqs]
-    const int max_num_blocks_per_seq,
-    const float* __restrict__ alibi_slopes,  // [num_heads]
-    const int q_stride, const int kv_block_stride, const int kv_head_stride,
-    const float k_scale, const float v_scale, const int tp_rank, const int blocksparse_local_blocks,
-    const int blocksparse_vert_stride, const int blocksparse_block_size,
-    const int blocksparse_head_sliding_step) {}
+          int PARTITION_SIZE = 0>  // Zero means no partitioning.

-// TODO(woosuk): Merge the last two dimensions of the grid.
-// Grid: (num_heads, num_seqs, max_num_partitions).
-template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE,
-          int NUM_THREADS, vllm::Fp8KVCacheDataType KV_DTYPE,
-          bool IS_BLOCK_SPARSE,
-          int REUSE_KV_TIMES = 1,
-          bool odd_nheads = false,
-          int PARTITION_SIZE = 0,std::enable_if_t<std::is_same<scalar_t, uint16_t>::value, int> = 0>  // Zero means no partitioning.
 __device__ void paged_attention_kernel(
    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
    float* __restrict__ max_logits,  // [num_seqs, num_heads,
@@ -134,6 +104,7 @@ __device__ void paged_attention_kernel(
    // No work to do. Terminate the thread block.
    return;
  }
+  if constexpr (sizeof(scalar_t)==2){

  const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
  const int num_blocks_per_partition =
@@ -612,6 +583,7 @@ __device__ void paged_attention_kernel(
      }
    }
  }
+  }
 }



--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@@ -80,8 +80,8 @@ inline __device__ void v_pk_fma_f16x8(float& a,const uint4 &  b,const uint4 &  c
 }

 // Q*K^T operation. fp16
-// template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<std::is_same<scalar_t, uint16_t>::value, int> = 0>
-template <int THREAD_GROUP_SIZE, typename Vec, int N>
+template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<std::is_same<scalar_t, uint16_t>::value, int> = 0>
+// template <int THREAD_GROUP_SIZE, typename Vec, int N>
 inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
  
  float qk =0;
@@ -114,9 +114,9 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
 }

 // Q*K^T operation. //bf16
-// template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<!std::is_same<scalar_t, uint16_t>::value, int> = 0>
-template <int THREAD_GROUP_SIZE, typename Vec, int N>
-inline __device__ float qk_dot_vpack_(const Vec (&q)[N], const Vec (&k)[N]) {
+template <int THREAD_GROUP_SIZE, typename Vec, int N, typename scalar_t, std::enable_if_t<!std::is_same<scalar_t, uint16_t>::value, int> = 0>
+// template <int THREAD_GROUP_SIZE, typename Vec, int N>
+inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {

  using A_vec = typename FloatVec<Vec>::Type;
  A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
@@ -138,7 +138,7 @@ template <typename T, int THREAD_GROUP_SIZE>
 struct Qk_dot {
  template <typename Vec, int N>
  static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
-    return qk_dot_<THREAD_GROUP_SIZE>(q, k);
+    return qk_dot_<THREAD_GROUP_SIZE,Vec,N,T>(q, k);
  }
  // template <typename Vec, int N>
  // static inline __device__ float qk_dot_vpack(const Vec (&q)[N], const Vec (&k)[N]) {

--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe_align_block_size_kernels.cu
@@ -9,6 +9,8 @@

 #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

+#define MAX_SHARED_MEM_SIZE 64 * 1024
+
 namespace vllm {

 namespace {
@@ -19,11 +21,12 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
 }
 }  // namespace

-template <typename scalar_t>
+template <typename scalar_t, bool experts_num_exceed_limit>
 __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                                            int32_t* sorted_token_ids,
                                            int32_t* expert_ids,
                                            int32_t* total_tokens_post_pad,
+                                            int32_t* global_tokens_cnts_ptr,
                                            int32_t num_experts,
                                            int32_t block_size, size_t numel) {
  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
@@ -31,11 +34,18 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,

  extern __shared__ int32_t shared_mem[];

-  int32_t* tokens_cnts =
-      shared_mem;  // 2d tensor with shape (num_experts + 1, num_experts)
-  int32_t* cumsum =
-      shared_mem + (num_experts + 1) *
-                       num_experts;  // 1d tensor with shape (num_experts + 1)
+  int32_t* tokens_cnts = nullptr;
+  int32_t* cumsum = nullptr;
+  if (experts_num_exceed_limit) {
+    // 2d tensor with shape (num_experts + 1, num_experts)
+    tokens_cnts = global_tokens_cnts_ptr;
+
+    // 1d tensor with shape (num_experts + 1)
+    cumsum = shared_mem;
+  } else {
+    tokens_cnts = shared_mem;  // 2d tensor with shape (num_experts + 1, num_experts)
+    cumsum = shared_mem + (num_experts + 1) * num_experts;  // 1d tensor with shape (num_experts + 1)
+  }

  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -115,20 +125,40 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-        // tensors
-        const int32_t shared_mem =
-            ((num_experts + 1) * num_experts + (num_experts + 1)) *
+        int32_t shared_mem_normal = ((num_experts + 1) * num_experts + (num_experts + 1)) *
              sizeof(int32_t);

+        const bool experts_num_exceed_limit = shared_mem_normal > MAX_SHARED_MEM_SIZE;
+
+        // calc needed amount of shared mem for `cumsum`
+        const int32_t shared_mem = experts_num_exceed_limit ? (num_experts + 1) * sizeof(int32_t) : shared_mem_normal;
+
+        if (experts_num_exceed_limit) {
+          // set dynamic shared mem
+          auto kernel = vllm::moe_align_block_size_kernel<scalar_t, true>;
+          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
+            (void*)kernel, shared_mem));
+
+          int32_t tokens_cnts[(num_experts + 1) * num_experts];
+          torch::Tensor key_cache_ptrs_tensor = torch::from_blob(tokens_cnts, {(num_experts + 1) * num_experts}, torch::kInt32)
+              .to(topk_ids.device());
+
+          kernel<<<1, num_experts, shared_mem, stream>>>(
+              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), key_cache_ptrs_tensor.data_ptr<int32_t>(), num_experts,
+              block_size, topk_ids.numel());
+        } else {
          // set dynamic shared mem
-        auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
+          auto kernel = vllm::moe_align_block_size_kernel<scalar_t, false>;
          AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
            (void*)kernel, shared_mem));
          kernel<<<1, num_experts, shared_mem, stream>>>(
              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
-            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+              num_tokens_post_pad.data_ptr<int32_t>(), nullptr, num_experts, block_size,
              topk_ids.numel());
+        }
+
      });
 }
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -9,10 +9,10 @@ if __name__ == '__main__':
        "The future of AI is",
    ]
    # Create a sampling params object.
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)

    # Create an LLM.
-    llm = LLM(model="facebook/opt-125m",trust_remote_code=True, dtype="float16", enforce_eager=True)
+    llm = LLM(model="facebook/opt-125m",tensor_parallel_size=1, distributed_executor_backend="ray", dtype="float16",trust_remote_code=True, enforce_eager=True)
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

--- a/setup.py
+++ b/setup.py
@@ -19,6 +19,9 @@ from typing import Optional, Union
 import subprocess
 from pathlib import Path

+add_git_version = False
+if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1:
+    add_git_version = True

 def load_module_from_path(module_name, path):
    spec = importlib.util.spec_from_file_location(module_name, path)
@@ -354,33 +357,23 @@ def find_version(filepath: str) -> str:
        raise RuntimeError("Unable to find version string.")


-def get_abi():
-    try:
-        command = "echo '#include <string>' | gcc -x c++ -E -dM - | fgrep _GLIBCXX_USE_CXX11_ABI" 
-        result = subprocess.run(command, shell=True, capture_output=True, text=True) 
-        output = result.stdout.strip() 
-        abi = "abi" + output.split(" ")[-1]
-        return abi
-    except Exception:
-        return 'abiUnknown'
-
-
 def get_sha(root: Union[str, Path]) -> str:
    try:
        return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=root).decode('ascii').strip()
    except Exception:
        return 'Unknown'

+
 def get_version_add(sha: Optional[str] = None) -> str:
    vllm_root = os.path.dirname(os.path.abspath(__file__))
    add_version_path = os.path.join(os.path.join(vllm_root, "vllm"), "version.py")
+    if add_git_version:
        if sha != 'Unknown':
            if sha is None:
                sha = get_sha(vllm_root)
-        version = 'das1.2.git' + sha[:7]
-
-    # abi version
-    version += "." + get_abi()
+            version = 'das.opt1.' + sha[:7]
+    else:
+        version = 'das.opt1'

    # dtk version
    if os.getenv("ROCM_PATH"):

--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -69,7 +69,8 @@ def test_chunked_prefill_recompute(


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 def test_preemption(
    caplog_vllm,
@@ -120,7 +121,8 @@ def test_preemption(


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 @pytest.mark.parametrize("beam_width", [4])
 def test_swap(
@@ -178,7 +180,8 @@ def test_swap(


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 @pytest.mark.parametrize("beam_width", [4])
 def test_swap_infeasible(
@@ -222,7 +225,8 @@ def test_swap_infeasible(


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+# @pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [96])
 def test_preemption_infeasible(
    vllm_runner,

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
 import contextlib
 import functools
 from typing import List, Optional, Tuple, Type
-
 import torch
+
 try:
-    import gptq_kernels
-except ImportError as e:
-    raise RuntimeError("Failed to import gptq_kernel with, Please install gptq_kernels from csrc/quantization/gptq ")
+    from lmslim import quant_ops 
+except Exception:
+    print("INFO: Please install lmslim if you want to infer gptq or awq model.\n") 

 from vllm.logger import init_logger

@@ -191,17 +191,47 @@ def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
                                       thx, thy)


-def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
-             scales: torch.Tensor, split_k_iters: int) -> torch.Tensor:
-    return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
-
+# def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
+#              scales: torch.Tensor, split_k_iters: int) -> torch.Tensor:
+#     return quant_ops.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
+
+def awq_gemm(input: torch.Tensor, weight: torch.Tensor,
+             zeros_and_scales:torch.Tensor,
+             m:int,n:int,k:int,
+             group_size:int,padding_group:int,splikspace:torch.Tensor,
+            splikspacesize:int) -> torch.Tensor:
+    return quant_ops.awq_gemm(input,
+                              weight,
+                              zeros_and_scales,
+                              m,
+                              n,
+                              k,
+                              group_size,
+                              padding_group,
+                              splikspace,
+                              splikspacesize)
+
+def convert_s4(qw: torch.Tensor, qz: torch.Tensor, s: torch.Tensor,
+               group_size: int):
+    return quant_ops.convert_s4(qw,qz,s,group_size)
+
+def sz_permute(sz:torch.Tensor)-> torch.Tensor:
+    return quant_ops.sz_permute(sz)
+
+def dequant_w4_gemm_colmajor(qweight:torch.Tensor,
+                                zeros_and_scale:torch.Tensor,
+                                k:int,
+                                n:int,
+                                group_size:int
+                             )->torch.Tensor:
+    return quant_ops.dequant_w4_gemm_colmajor(qweight,zeros_and_scale,k,n,group_size)

 # gptq
 def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
              b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
              b_g_idx: torch.Tensor, use_exllama: bool,
              bit: int) -> torch.Tensor:
-    return gptq_kernels.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
+    return quant_ops.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
                                  b_g_idx, use_exllama, bit)
    # return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
    #                               b_g_idx, use_exllama, bit)
@@ -209,7 +239,7 @@ def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,

 def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
                 bit: int) -> None:
-    gptq_kernels.gptq_shuffle(q_weight, q_perm, bit)
+    quant_ops.gptq_shuffle(q_weight, q_perm, bit)
    # torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)

 # trans_w16

--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -270,13 +270,23 @@ class ROCmFlashAttentionImpl(AttentionImpl):
        self.use_naive_attn = False
        # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
        self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
+        # NOTE: Allow automatic switching between Triton and CK. Defaulting to triton when seqlen > 8192
+        self.use_flash_attn_auto = envs.VLLM_USE_FLASH_ATTN_AUTO
        if self.use_triton_flash_attn:
+            if self.use_flash_attn_auto:
+                from vllm.attention.ops.flash_attn_triton_mqa_gqa import ( 
+                flash_attn_varlen_func)
+                self.attn_func_triton = flash_attn_varlen_func
+                
+                from flash_attn import flash_attn_varlen_func  # noqa: F401
+                self.attn_func_ck = flash_attn_varlen_func
+                logger.debug("When SEQ_LEN > 8192, Use Triton FA in ROCmBackend, otherwise Use CK FA")
+            else:
                # from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                #     triton_attention)
-            # self.attn_func = triton_attention
                from vllm.attention.ops.flash_attn_triton_mqa_gqa import ( 
                    flash_attn_varlen_func)
-            self.attn_func = flash_attn_varlen_func  
+                self.attn_func = flash_attn_varlen_func # triton_attention
                logger.debug("Using Triton FA in ROCmBackend")
                if self.sliding_window != (-1, -1):
                    logger.warning("ROCm Triton FA does not currently support "
@@ -284,6 +294,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                                "precision, please try using the ROCm CK "
                                "FA backend instead by setting the env var "
                                "`VLLM_USE_TRITON_FLASH_ATTN=0`")
+        
        else:
            # if not using triton, navi3x/navi21/navi10 do not use flash-attn
            # either
@@ -392,6 +403,32 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                            query.dtype,
                            attn_metadata.seq_lens,
                            make_attn_mask=False)  # type: ignore
+                    if self.use_flash_attn_auto:
+                        if prefill_meta.max_prefill_seq_len > 8192:
+                            out = self.attn_func_triton(
+                                q=query,
+                                k=key,
+                                v=value,
+                                cu_seqlens_q=prefill_meta.seq_start_loc,
+                                cu_seqlens_k=prefill_meta.seq_start_loc,
+                                max_seqlens_q=prefill_meta.max_prefill_seq_len,
+                                max_seqlens_k=prefill_meta.max_prefill_seq_len,
+                                softmax_scale=self.scale,
+                                causal=True,
+                            )
+                        else:
+                            out = self.attn_func_ck(
+                                q=query,
+                                k=key,
+                                v=value,
+                                cu_seqlens_q=prefill_meta.seq_start_loc,
+                                cu_seqlens_k=prefill_meta.seq_start_loc,
+                                max_seqlen_q=prefill_meta.max_prefill_seq_len,
+                                max_seqlen_k=prefill_meta.max_prefill_seq_len,
+                                softmax_scale=self.scale,
+                                causal=True,
+                            )
+                    else:
                    # out = self.attn_func(
                    #     query,
                    #     key,

--- a/vllm/benchmark_throughput.py
+++ b/vllm/benchmark_throughput.py
+"""Benchmark offline inference throughput."""
+import argparse
+import json
+import random
+import time
+from typing import List, Optional, Tuple
+
+import numpy as np
+import torch
+from tqdm import tqdm
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          PreTrainedTokenizerBase)
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptInputs
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.utils import FlexibleArgumentParser
+
+
+def sample_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int],
+) -> List[Tuple[str, int, int]]:
+    """Sample up to ``num_requests`` benchmark requests from a JSON dataset.
+
+    The file at ``dataset_path`` is parsed as JSON and iterated as a list of
+    records, each holding a ``"conversations"`` list whose items carry a
+    ``"value"`` string. Only the first two turns of each record are used:
+    turn 0 as the prompt and turn 1 as the reference completion.
+
+    Args:
+        dataset_path: Path to the JSON dataset file.
+        num_requests: Maximum number of requests to return; fewer are
+            returned if the dataset runs out of usable entries.
+        tokenizer: Callable whose result exposes ``input_ids``; used to
+            measure prompt and completion lengths in tokens.
+        fixed_output_len: If not ``None``, used as the output length of
+            every request instead of the tokenized completion length.
+
+    Returns:
+        A list of ``(prompt, prompt_len, output_len)`` tuples.
+
+    Raises:
+        ValueError: If ``fixed_output_len`` is given and smaller than 4.
+    """
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
+
+    # Shuffle the dataset.
+    # Uses the module-level ``random`` state, so the order is reproducible
+    # only if the caller seeds ``random`` beforehand.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for i in range(len(dataset)):
+        # Stop as soon as enough requests have been collected.
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        # The completion is tokenized even when ``fixed_output_len`` is set;
+        # its length is simply ignored in that case.
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
+        if prompt_len < 4 or output_len < 4:
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+        filtered_dataset.append((prompt, prompt_len, output_len))
+
+    return filtered_dataset
+
+
+def run_vllm(
+    warmup_requests: List[Tuple[str, int, int]],
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: str,
+    quantization: Optional[str],
+    tensor_parallel_size: int,
+    seed: int,
+    n: int,
+    use_beam_search: bool,
+    trust_remote_code: bool,
+    dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
+    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
+    load_format: str = EngineArgs.load_format,
+    num_iters_warmup: Optional[int] = None,
+) -> float:
+    """Run the vLLM backend on ``requests`` and return the elapsed seconds.
+
+    An ``LLM`` engine is built from the given configuration, warmed up by
+    generating ``warmup_requests`` ``num_iters_warmup`` times, and then
+    timed over a single ``generate`` call on all of ``requests``. Only that
+    final call is inside the timed region.
+
+    Args:
+        warmup_requests: ``(prompt, prompt_len, output_len)`` tuples
+            generated during warm-up; ``prompt_len`` is not used.
+        requests: ``(prompt, prompt_len, output_len)`` tuples to benchmark;
+            ``prompt_len`` is not used.
+        n: Number of output sequences requested per prompt.
+        use_beam_search: Enables beam search; temperature is 0.0 when set
+            and 1.0 otherwise.
+        num_iters_warmup: Number of warm-up iterations. When ``None``, the
+            value is read from a module-level ``args.num_iters_warmup``
+            (presumably set by the script's CLI entry point, which is not
+            visible here), matching the previous behaviour.
+        Remaining arguments are forwarded unchanged to ``LLM``.
+
+    Returns:
+        Wall-clock duration in seconds of the timed ``generate`` call.
+
+    Raises:
+        NameError: If ``num_iters_warmup`` is ``None`` and no module-level
+            ``args`` exists (e.g. when this function is imported and called
+            directly rather than run through the script entry point).
+    """
+    from vllm import LLM, SamplingParams
+
+    # Resolve the warm-up count before the (expensive) model load so that a
+    # missing global fails fast. ``args`` is neither a parameter nor a local
+    # of this function; the fallback keeps the legacy global lookup for
+    # existing callers while letting new callers pass the value explicitly.
+    if num_iters_warmup is None:
+        num_iters_warmup = args.num_iters_warmup
+
+    llm = LLM(
+        model=model,
+        tokenizer=tokenizer,
+        quantization=quantization,
+        tensor_parallel_size=tensor_parallel_size,
+        seed=seed,
+        trust_remote_code=trust_remote_code,
+        dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
+        load_format=load_format,
+    )
+
+    def build_inputs(reqs):
+        """Split request tuples into prompts and per-prompt SamplingParams."""
+        texts = [prompt for prompt, _, _ in reqs]
+        params = [
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                # Force every request to produce exactly ``output_len``
+                # tokens so runs are comparable.
+                ignore_eos=True,
+                max_tokens=output_len,
+            ) for _, _, output_len in reqs
+        ]
+        return texts, params
+
+    prompts, sampling_params = build_inputs(requests)
+    warmup_prompts, warmup_sampling_params = build_inputs(warmup_requests)
+
+    # Warm-up: run outside the timed region.
+    print("Warming up...")
+    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
+        llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)
+
+    start = time.perf_counter()
+    llm.generate(prompts, sampling_params, use_tqdm=True)
+    end = time.perf_counter()
+    return end - start
+
+
+def run_hf(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tokenizer: PreTrainedTokenizerBase,
+    n: int,
+    use_beam_search: bool,
+    max_batch_size: int,
+    trust_remote_code: bool,
+) -> float:
+    """Run the HuggingFace backend on ``requests`` and return elapsed seconds.
+
+    The model is loaded in float16 and moved to CUDA before the timer
+    starts. Requests are then processed in order, grouped greedily into
+    batches, and each batch is tokenized, generated and decoded inside the
+    timed region.
+
+    Args:
+        requests: ``(prompt, prompt_len, output_len)`` tuples.
+        model: Model name or path passed to ``from_pretrained``.
+        tokenizer: Tokenizer used to encode batches and decode outputs; its
+            ``pad_token`` is overwritten for llama models.
+        n: Number of sequences returned per prompt.
+        use_beam_search: Must be ``False``; beam search is not supported by
+            this backend.
+        max_batch_size: Upper bound on the number of prompts per batch.
+        trust_remote_code: Forwarded to ``from_pretrained``.
+
+    Returns:
+        Wall-clock duration in seconds of the batched generation loop.
+    """
+    assert not use_beam_search
+    llm = AutoModelForCausalLM.from_pretrained(
+        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
+    llm = llm.cuda()
+
+    pbar = tqdm(total=len(requests))
+    start = time.perf_counter()
+    batch: List[str] = []
+    # Running maxima over the prompts currently held in ``batch``.
+    max_prompt_len = 0
+    max_output_len = 0
+    for i in range(len(requests)):
+        prompt, prompt_len, output_len = requests[i]
+        # Add the prompt to the batch.
+        batch.append(prompt)
+        max_prompt_len = max(max_prompt_len, prompt_len)
+        max_output_len = max(max_output_len, output_len)
+        if len(batch) < max_batch_size and i != len(requests) - 1:
+            # Check if we can add more requests to the batch.
+            # The batch keeps growing only while the longest prompt plus the
+            # longest output (including the next request) stays within 2048.
+            _, next_prompt_len, next_output_len = requests[i + 1]
+            if (max(max_prompt_len, next_prompt_len) +
+                    max(max_output_len, next_output_len)) <= 2048:
+                # We can add more requests to the batch.
+                continue
+
+        # Generate the sequences.
+        input_ids = tokenizer(batch, return_tensors="pt",
+                              padding=True).input_ids
+        # Every prompt in the batch generates up to the batch-wide maximum
+        # output length.
+        llm_outputs = llm.generate(
+            input_ids=input_ids.cuda(),
+            do_sample=not use_beam_search,
+            num_return_sequences=n,
+            temperature=1.0,
+            top_p=1.0,
+            use_cache=True,
+            max_new_tokens=max_output_len,
+        )
+        # Include the decoding time.
+        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+        pbar.update(len(batch))
+
+        # Clear the batch.
+        batch = []
+        max_prompt_len = 0
+        max_output_len = 0
+    end = time.perf_counter()
+    return end - start
+
+
+def run_mii(
+    requests: List[Tuple[str, int, int]],
+    model: str,
+    tensor_parallel_size: int,
+    output_len: int,
+) -> float:
+    """Benchmark the MII backend and return the generation time in seconds.
+
+    Starts an MII server for ``model``, sends every prompt in a single
+    ``generate`` call capped at ``output_len`` new tokens, then terminates
+    the server. Only the ``generate`` call is timed.
+    """
+    from mii import client, serve
+    engine = serve(model, tensor_parallel=tensor_parallel_size)
+    prompt_texts = [text for text, _prompt_len, _output_len in requests]
+
+    t_begin = time.perf_counter()
+    engine.generate(prompt_texts, max_new_tokens=output_len)
+    elapsed = time.perf_counter() - t_begin
+    # Shut the server down outside of the timed region.
+    client(model).terminate_server()
+    return elapsed
+
+
+def main(args: argparse.Namespace):
+    """Run the throughput benchmark described by ``args`` and report results.
+
+    Builds the request list (synthetic when ``args.dataset`` is None,
+    otherwise sampled from the dataset), dispatches to the selected backend,
+    prints latency and throughput figures, and optionally writes a JSON
+    summary to ``args.output_json``.
+
+    Raises:
+        ValueError: If ``args.backend`` is not one of the known backends.
+    """
+    print(args)
+    random.seed(args.seed)
+
+    # Sample the requests.
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer, trust_remote_code=args.trust_remote_code)
+
+    # A single short synthetic request used to warm up the vLLM backend.
+    # It is built before the dataset branch so that it is defined for
+    # dataset-driven runs as well; previously those runs failed with an
+    # unbound ``warmup_requests`` when the vLLM backend was selected.
+    warmup_prompt = "hi" * 10
+    warmup_requests = [(warmup_prompt, 10, 10)]
+
+    if args.dataset is None:
+        # Synthesize a prompt with the given input length.
+        prompt = "hi" * (args.input_len - 1)
+        requests = [(prompt, args.input_len, args.output_len)
+                    for _ in range(args.num_prompts)]
+    else:
+        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
+                                   args.output_len)
+
+    if args.backend == "vllm":
+        elapsed_time = run_vllm(
+            warmup_requests, requests, args.model, args.tokenizer,
+            args.quantization, args.tensor_parallel_size, args.seed, args.n,
+            args.use_beam_search, args.trust_remote_code, args.dtype,
+            args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.download_dir, args.load_format)
+    elif args.backend == "hf":
+        assert args.tensor_parallel_size == 1
+        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
+                              args.use_beam_search, args.hf_max_batch_size,
+                              args.trust_remote_code)
+    elif args.backend == "mii":
+        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
+                               args.output_len)
+    else:
+        raise ValueError(f"Unknown backend: {args.backend}")
+
+    total_num_tokens = sum(prompt_len + output_len
+                           for _, prompt_len, output_len in requests)
+    # For synthetic requests every entry carries args.output_len, so this sum
+    # equals args.output_len * args.num_prompts; for dataset requests it is
+    # the sum of the sampled output lengths.
+    total_out_tokens = sum(output_len for _, _, output_len in requests)
+    print(f"Latency: {elapsed_time:.2f} s")
+    print(f"All Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
+          f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+    print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: declare the benchmark options, validate the
+    # combinations that each backend supports, then run the benchmark.
+    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
+    parser.add_argument("--backend",
+                        type=str,
+                        choices=["vllm", "hf", "mii"],
+                        default="vllm")
+    parser.add_argument("--dataset",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument("--input-len",
+                        type=int,
+                        default=None,
+                        help="Input prompt length for each request")
+    parser.add_argument("--output-len",
+                        type=int,
+                        default=None,
+                        help="Output length for each request. Overrides the "
+                        "output length from the dataset.")
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument('--quantization',
+                        '-q',
+                        choices=[*QUANTIZATION_METHODS, None],
+                        default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--n",
+                        type=int,
+                        default=1,
+                        help="Number of generated sequences per prompt.")
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument("--num-prompts",
+                        type=int,
+                        default=1000,
+                        help="Number of prompts to process.")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--hf-max-batch-size",
+                        type=int,
+                        default=None,
+                        help="Maximum batch size for HF backend.")
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
+    parser.add_argument(
+        '--dtype',
+        type=str,
+        default='auto',
+        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
+        help='data type for model weights and activations. '
+        'The "auto" option will use FP16 precision '
+        'for FP32 and FP16 models, and BF16 precision '
+        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1. '
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument("--enforce-eager",
+                        action="store_true",
+                        help="enforce eager execution")
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="auto",
+        choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
+        help='device type for vLLM execution, supporting CUDA, OpenVINO and '
+        'CPU.')
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
+    parser.add_argument(
+        '--load-format',
+        type=str,
+        default=EngineArgs.load_format,
+        choices=[
+            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+            'bitsandbytes'
+        ],
+        help='The format of the model weights to load.\n\n'
+        '* "auto" will try to load the weights in the safetensors format '
+        'and fall back to the pytorch bin format if safetensors format '
+        'is not available.\n'
+        '* "pt" will load the weights in the pytorch bin format.\n'
+        '* "safetensors" will load the weights in the safetensors format.\n'
+        '* "npcache" will load the weights in pytorch format and store '
+        'a numpy cache to speed up the loading.\n'
+        '* "dummy" will initialize the weights with random values, '
+        'which is mainly for profiling.\n'
+        '* "tensorizer" will load the weights using tensorizer from '
+        'CoreWeave. See the Tensorize vLLM Model script in the Examples '
+        'section for more information.\n'
+        '* "bitsandbytes" will load the weights using bitsandbytes '
+        'quantization.\n')
+    args = parser.parse_args()
+    if args.tokenizer is None:
+        args.tokenizer = args.model
+
+    # Input-length options are validated with explicit errors (rather than
+    # assert, which is removed under ``python -O``), matching the
+    # backend-specific checks below.
+    if args.dataset is None:
+        if args.input_len is None:
+            raise ValueError(
+                "--input-len is required when --dataset is not given.")
+        if args.output_len is None:
+            raise ValueError(
+                "--output-len is required when --dataset is not given.")
+    elif args.input_len is not None:
+        raise ValueError(
+            "--input-len must not be set when --dataset is given.")
+
+    if args.backend == "vllm":
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+    elif args.backend == "hf":
+        if args.hf_max_batch_size is None:
+            raise ValueError("HF max batch size is required for HF backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+    elif args.backend == "mii":
+        if args.dtype != "auto":
+            raise ValueError("dtype must be auto for MII backend.")
+        if args.n != 1:
+            raise ValueError("n must be 1 for MII backend.")
+        if args.use_beam_search:
+            raise ValueError("Beam search is not supported for MII backend.")
+        if args.quantization is not None:
+            raise ValueError("Quantization is only for vLLM backend.")
+        if args.hf_max_batch_size is not None:
+            raise ValueError("HF max batch size is only for HF backend.")
+        if args.tokenizer != args.model:
+            raise ValueError("Tokenizer must be the same as the model for MII "
+                             "backend.")
+    main(args)
\ No newline at end of file
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -201,7 +201,7 @@ class ModelConfig:

    def _verify_quantization(self) -> None:
        supported_quantization = [*QUANTIZATION_METHODS]
-        rocm_supported_quantization = ["gptq", "squeezellm"]
+        rocm_supported_quantization = ["gptq", "squeezellm","awq"]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()


--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -261,6 +261,8 @@ class LLMEngine:
            prompt_adapter_config=prompt_adapter_config,
        )

+        init_success = False
+        try:
            if not self.model_config.embedding_mode:
                self._initialize_kv_caches()

@@ -339,6 +341,11 @@ class LLMEngine:
                    "vllm.llm_engine",
                    self.observability_config.otlp_traces_endpoint)
                
+            def get_tokenizer_for_seq(self,
+                                    sequence: Sequence) -> "PreTrainedTokenizer":
+                return self.get_tokenizer_group().get_lora_tokenizer(
+                    sequence.lora_request)
+
            # Create sequence output processor, e.g. for beam search or
            # speculative decoding.
            self.output_processor = (
@@ -353,6 +360,13 @@ class LLMEngine:
                        self.get_tokenizer_for_seq,
                    ),
                ))
+            init_success = True
+        finally:
+            if not init_success:
+                # Ensure that model_executor is shut down if LLMEngine init
+                # failed
+                self.model_executor.shutdown()
+        

    def _initialize_kv_caches(self) -> None:
        """Initialize the KV cache in the worker(s).
@@ -476,10 +490,10 @@ class LLMEngine:
    ) -> "PreTrainedTokenizer":
        return self.get_tokenizer_group().get_lora_tokenizer(lora_request)

-    def get_tokenizer_for_seq(self,
-                              sequence: Sequence) -> "PreTrainedTokenizer":
-        return self.get_tokenizer_group().get_lora_tokenizer(
-            sequence.lora_request)
+    # def get_tokenizer_for_seq(self,
+    #                           sequence: Sequence) -> "PreTrainedTokenizer":
+    #     return self.get_tokenizer_group().get_lora_tokenizer(
+    #         sequence.lora_request)

    def _init_tokenizer(self, **tokenizer_init_kwargs) -> BaseTokenizerGroup:
        init_kwargs = dict(

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -176,6 +176,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "False").lower() in
             ("true", "1")),
    
+    # flag to control vllm to automatically switch between Triton FA and CK FA
+    "VLLM_USE_FLASH_ATTN_AUTO":
+    lambda: (os.environ.get("VLLM_USE_FLASH_ATTN_AUTO", "False").lower() in
+             ("true", "1")),
+
    # local rank of the process in the distributed setting, used to determine
    # the GPU device id
    "LOCAL_RANK":
@@ -187,7 +192,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {

    # timeout for each iteration in the engine
    "VLLM_ENGINE_ITERATION_TIMEOUT_S":
-    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "60")),
+    lambda: int(os.environ.get("VLLM_ENGINE_ITERATION_TIMEOUT_S", "120")),

    # API key for VLLM API server
    "VLLM_API_KEY":

--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -76,7 +76,8 @@ class ResultHandler(threading.Thread):
    """Handle results from all workers (in background thread)"""

    def __init__(self) -> None:
-        super().__init__(daemon=True)
+        super().__init__(daemon=False)
+        # super().__init__(daemon=True)
        self.result_queue = mp.Queue()
        self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}

@@ -100,7 +101,8 @@ class WorkerMonitor(threading.Thread):

    def __init__(self, workers: List['ProcessWorkerWrapper'],
                 result_handler: ResultHandler):
-        super().__init__(daemon=True)
+        super().__init__(daemon=False)
+        # super().__init__(daemon=True)
        self.workers = workers
        self.result_handler = result_handler
        self._close = False
@@ -112,15 +114,31 @@ class WorkerMonitor(threading.Thread):
            self._close = True

            # Kill / cleanup all workers
+            # for worker in self.workers:
+            #     process = worker.process
+            #     if process.sentinel in dead_sentinels:
+            #         process.join(JOIN_TIMEOUT_S)
+            #     if process.exitcode is not None and process.exitcode != 0:
+            #         logger.error("Worker %s pid %s died, exit code: %s",
+            #                      process.name, process.pid, process.exitcode)
+            if not sys.is_finalizing():
+                # Kill / cleanup all workers
+                died_count = 0
                for worker in self.workers:
                    process = worker.process
                    if process.sentinel in dead_sentinels:
                        process.join(JOIN_TIMEOUT_S)
                    if process.exitcode is not None and process.exitcode != 0:
+                        died_count += 1
                        logger.error("Worker %s pid %s died, exit code: %s",
-                                 process.name, process.pid, process.exitcode)
+                                     process.name, process.pid,
+                                     process.exitcode)
+                if died_count < len(self.workers):
+                    logger.info(
+                        "Killing remaining local vLLM worker processes")
+                    
            # Cleanup any remaining workers
-            logger.info("Killing local vLLM worker processes")
+            # logger.info("Killing local vLLM worker processes")
            for worker in self.workers:
                worker.kill_worker()
            # Must be done after worker task queues are all closed

--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -10,6 +10,11 @@ from vllm.model_executor.layers.quantization.base_config import (
 from vllm.model_executor.utils import set_weight_attrs


+class AWQShareWorkSpace():
+    """Class-level scratch buffer passed to ``ops.awq_gemm``.
+
+    Both attributes are class attributes, so every caller that reads them
+    through ``AWQShareWorkSpace`` sees the same tensor. The class body runs
+    when this module is imported, which means the buffer is allocated via
+    ``.cuda()`` at import time.
+    """
+    # Workspace size handed to the kernel: 2 << 29 == 2**30.
+    # NOTE(review): presumably a byte count, since the buffer below holds
+    # roughly half as many 2-byte float16 elements - confirm with the kernel.
+    awqworkshapcesize=2<<29  # == 2**30
+    # Zero-filled float16 tensor with (size // 2 + 1) elements, about 1 GiB.
+    awqworkshapce=torch.zeros(awqworkshapcesize//2+1,dtype=torch.float16).cuda()
+
+
 class AWQConfig(QuantizationConfig):
    """Config class for AWQ.

@@ -144,33 +149,66 @@ class AWQLinearMethod(LinearMethodBase):
            "output_dim": 1,
        })
        
+        zeros_and_scales=Parameter(
+            torch.empty(
+                (input_size_per_partition // self.quant_config.group_size),
+                output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            requires_grad=False,
+        )    
+        set_weight_attrs(zeros_and_scales, {
+            "input_dim": 0,
+            "output_dim": 1,
+        })
+
        layer.register_parameter("qweight", qweight)
        set_weight_attrs(qweight, extra_weight_attrs)
        layer.register_parameter("qzeros", qzeros)
        set_weight_attrs(qzeros, extra_weight_attrs)
        layer.register_parameter("scales", scales)
        set_weight_attrs(scales, extra_weight_attrs)
+        layer.register_parameter("zeros_and_scales", zeros_and_scales)
+        set_weight_attrs(zeros_and_scales, extra_weight_attrs)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        qweight = layer.qweight
-        scales = layer.scales
-        qzeros = layer.qzeros
-        pack_factor = self.quant_config.pack_factor
-        out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
+        zeros_and_scales = layer.zeros_and_scales
+        
+        out_shape = (x.shape[:-1] + (qweight.shape[0] * 1, ))
        reshaped_x = x.reshape(-1, x.shape[-1])
        
-        # num_tokens >= threshold
-        FP16_MATMUL_HEURISTIC_CONDITION = x.shape[:-1].numel() >= 256
+        m = reshaped_x.shape[0]
+        k = reshaped_x.shape[-1]
+        n = qweight.shape[0]
        
-        if FP16_MATMUL_HEURISTIC_CONDITION:
-            out = ops.awq_dequantize(qweight, scales, qzeros, 0, 0, 0)
-            out = torch.matmul(reshaped_x, out)
+        if k % 4096==0:
+            padding_group=2
        else:
-            out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros,
-                               pack_factor)
+            padding_group=0
+        
+        out = ops.awq_gemm(reshaped_x,
+                            qweight,
+                            zeros_and_scales,
+                            m,
+                            n,
+                            k,
+                            self.quant_config.group_size,
+                            padding_group,
+                            AWQShareWorkSpace.awqworkshapce,
+                            AWQShareWorkSpace.awqworkshapcesize)
+        # Alternative approach using rocBLAS (dequantize, then F.linear):
+        # deqweight=ops.dequant_w4_gemm_colmajor(    #shape[n,k/8]--->[n,k]
+        #                   qweight, 
+        #                   zeros_and_scales,
+        #                   k,
+        #                   n,
+        #                   self.quant_config.group_size)
+        # output=F.linear(reshaped_x, deqweight)    
+        
        if bias is not None:
            out.add_(bias)
        return out.reshape(out_shape)
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -293,7 +293,7 @@ class DefaultModelLoader(BaseModelLoader):

            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
-                if quant_method is not None:
+                if quant_method is not None and quant_method!="awq" and quant_method!="gptq":
                    quant_method.process_weights_after_loading(module)
        return model.eval()


--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -417,6 +417,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
            self.sampler = Sampler()
        else:
            self.lm_head = PPMissingLayer()
+        self.quant_method =  None
+     
+        if quant_config is not None:
+            self.quant_method=quant_config.get_name()
+            self.quant_config=quant_config
              
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
@@ -552,6 +557,47 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                    
                    weight.data=weight.data.reshape(ori_shape[1], -1)
     
+        if self.quant_method == "awq":
+            lay_key_words = [
+                "self_attn.qkv_proj.qweight",
+                "self_attn.o_proj.qweight",
+                "mlp.gate_up_proj.qweight",
+                "mlp.down_proj.qweight"
+            ]
+            combined_words = "|".join(lay_key_words)
+            
+            for layername, weight in params_dict.items():
+                
+                matches = re.findall(combined_words, layername)
+                if matches:
+                    qweight =params_dict[layername]
+                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
+                    scales=params_dict[layername.replace("qweight", "scales")]
+                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
+                    
+                    group_size= self.quant_config.group_size 
+                   
+                    dim_n = scales.data.shape[1]
+                    dim_k = qweight.data.shape[0]
+                    pad_group=2              
+                    
+                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
+                    
+                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
+                    
+                    zeros_and_scalse.data.copy_(sz)
+                    qweight.data.copy_(_qw)
+                    
+                    #reshape
+                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
+                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
+                
+                    if dim_k % 4096==0:
+                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
+                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
+                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
+                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()
+                         
    # If this function is called, it should always initialize KV cache scale
    # factors (or else raise an exception). Thus, handled exceptions should
    # make sure to leave KV cache scale factors in a known good (dummy) state

--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -248,6 +248,12 @@ class QWenLMHeadModel(nn.Module):
                                      quant_config=quant_config)
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = Sampler()
+        
+        self.quant_method =  None
+        if quant_config is not None:
+            self.quant_method=quant_config.get_name()
+            self.quant_config=quant_config
+              
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
        self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
        self.use_fa_pad = os.environ.get('FA_PAD') == '1'
@@ -353,4 +359,44 @@ class QWenLMHeadModel(nn.Module):
                    
                    weight.data=weight.data.reshape(ori_shape[1],-1)
                    
+        if self.quant_method == "awq":
+            lay_key_words = [
+                "attn.c_attn.qweight",
+                "attn.c_proj.qweight",
+                "mlp.gate_up_proj.qweight",
+                "mlp.c_proj.qweight"
+            ]
+            combined_words = "|".join(lay_key_words)
+            
+            for layername, weight in params_dict.items():
+                
+                matches = re.findall(combined_words, layername)
+                if matches:
+                    qweight =params_dict[layername]
+                    qzeros=params_dict[layername.replace("qweight", "qzeros")]
+                    scales=params_dict[layername.replace("qweight", "scales")]
+                    zeros_and_scalse =params_dict[layername.replace("qweight", "zeros_and_scales")]
+                    
+                    group_size= self.quant_config.group_size 
+                   
+                    dim_n = scales.data.shape[1]
+                    dim_k = qweight.data.shape[0]
+                    pad_group=2              
+                    
+                    _qw, _sz=ops.convert_s4(qweight,qzeros,scales,int(group_size)) 
+                    
+                    sz = ops.sz_permute(_sz).reshape(-1,dim_n)       
+                    
+                    zeros_and_scalse.data.copy_(sz)
+                    qweight.data.copy_(_qw)
+                    
+                    #reshape
+                    zeros_and_scalse.data=zeros_and_scalse.reshape(dim_n,-1)    #[k/greop_size,n]------>[n,k/group_size]
+                    qweight.data=qweight.data.reshape(dim_n,-1)                      #[k,n/8]---->[n,k/8]  
+                
+                    if dim_k % 4096==0:
+                        zeros_and_scalse_pad= torch.zeros(dim_n,pad_group,dtype=torch.int32).cuda()
+                        zeros_and_scalse.data=torch.cat((zeros_and_scalse.data,zeros_and_scalse_pad),dim=1).contiguous()
+                        qweight_pad= torch.zeros(dim_n,int(group_size//4),dtype=torch.int32).cuda()
+                        qweight.data=torch.cat((qweight.data,qweight_pad),dim=1).contiguous()