Merge branch 'v0.5.0-dtk24.04.1' into v0.5.3.post1-dtk24.04.1

b0b9d2d9 · zhuwenwen · c9305344 · ffbef65c · b0b9d2d9 · b0b9d2d9
Commit b0b9d2d9 authored Aug 01, 2024 by zhuwenwen
10 changed files
--- a/README.md
+++ b/README.md
@@ -54,7 +54,10 @@ pip install setuptools wheel
 ```shell
 git clone http://developer.hpccube.com/codes/OpenDAS/vllm.git # 根据需要的分支进行切换
 ```
-
+安装依赖：
+```shell
+pip install -r requirements-rocm.txt
+```
 - 提供2种源码编译方式（进入vllm目录）：
 ```
 1. 编译whl包并安装

--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -5,12 +5,14 @@ import random
 import time
 from typing import List, Optional, Tuple

+import numpy as np
 import torch
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

 from vllm.engine.arg_utils import EngineArgs
+from vllm.inputs import PromptInputs
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.utils import FlexibleArgumentParser

@@ -123,6 +125,23 @@ def run_vllm(
                max_tokens=output_len,
            ))

+    # warmup
+    dummy_prompt_token_ids = np.random.randint(10000,
+                                               size=(args.batch_size,
+                                                     args.input_len))
+    dummy_inputs: List[PromptInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
+    
+    def run_to_completion():
+        # One silent generate() pass over the synthetic warmup batch.
+        llm.generate(dummy_inputs,
+                     sampling_params=sampling_params, use_tqdm=False)
+    
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion()
+
    start = time.perf_counter()
    llm.generate(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
@@ -299,6 +318,10 @@ if __name__ == "__main__":
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=1,
+                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        default=1000,

--- a/csrc/attention/static_switch.h
+++ b/csrc/attention/static_switch.h
@@ -31,38 +31,28 @@
    }                                           \
  }()

-  // #define HEADSIZE_SWITCH(HEADDIM, ...)   \
-  // [&] {                                    \
-  //   if (HEADDIM == 64) {                   \
-  //     constexpr static int HEAD_SIZE = 64;  \
-  //     return __VA_ARGS__();                \
-  //   } else if (HEADDIM == 80) {            \
-  //     constexpr static int HEAD_SIZE = 80;  \
-  //     return __VA_ARGS__();                \
-  //   } else if (HEADDIM == 96) {            \
-  //     constexpr static int HEAD_SIZE = 96;  \
-  //     return __VA_ARGS__();                \
-  //   } else if (HEADDIM == 112) {           \
-  //     constexpr static int HEAD_SIZE = 112; \
-  //     return __VA_ARGS__();                \
-  //   } else if (HEADDIM == 128) {           \
-  //     constexpr static int HEAD_SIZE = 128; \
-  //     return __VA_ARGS__();                \
-  //   } else if (HEADDIM == 256) {           \
-  //     constexpr static int HEAD_SIZE = 256; \
-  //     return __VA_ARGS__();                \
-  //   }                                      \
-  //   else {                                 \
-  //     TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
-  //   }                                      \
-  // }()
-
  #define HEADSIZE_SWITCH(HEADDIM, ...)   \
  [&] {                                    \
-    if (HEADDIM == 128) {           \
+    if (HEADDIM == 64) {                   \
+      constexpr static int HEAD_SIZE = 64;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 80) {            \
+      constexpr static int HEAD_SIZE = 80;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 96) {            \
+      constexpr static int HEAD_SIZE = 96;  \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 112) {           \
+      constexpr static int HEAD_SIZE = 112; \
+      return __VA_ARGS__();                \
+    } else if (HEADDIM == 128) {           \
      constexpr static int HEAD_SIZE = 128; \
      return __VA_ARGS__();                \
-    } else {                                 \
+    } else if (HEADDIM == 256) {           \
+      constexpr static int HEAD_SIZE = 256; \
+      return __VA_ARGS__();                \
+    }                                      \
+    else {                                 \
      TORCH_CHECK(false, "Unsupported head size: ", HEADDIM);\
    }                                      \
  }()

--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -12,7 +12,7 @@ if __name__ == '__main__':
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM.
-    llm = LLM(model="facebook/opt-125m",trust_remote_code=True, dtype="float16", enforce_eager=False)
+    llm = LLM(model="facebook/opt-125m",trust_remote_code=True, dtype="float16", enforce_eager=True)
    # Generate texts from the prompts. The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params)

--- a/setup.py
+++ b/setup.py
@@ -377,7 +377,7 @@ def get_version_add(sha: Optional[str] = None) -> str:
    if sha != 'Unknown':
        if sha is None:
            sha = get_sha(vllm_root)
-        version = 'das1.1.git' + sha[:7]
+        version = 'das1.2.git' + sha[:7]

    # abi version
    version += "." + get_abi()

--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -186,7 +186,7 @@ class BaiChuanAttention(nn.Module):
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.W_pack(hidden_states)
-        if os.environ.get('FA_PAD') == '1' and qkv.shape[-1] == 12320:
+        if os.environ.get('FA_PAD') == '1':
            qkv = qkv[...,:-32]
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        if self.postion_embedding != "ALIBI":
@@ -423,14 +423,18 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA):
            ]
            combined_words = "|".join(lay_key_words)
            
+            lay_qkv_words = ["self_attn.W_pack.weight"]   
+            qkv_words = "|".join(lay_qkv_words)  
+            
            for layername, weight in params_dict.items():
                matches = re.findall(combined_words, layername)
                if matches:      
                    if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
                        weight.data = pad_weight(weight.data, 32)  
                        
-                    if self.use_fa_pad and weight.data.shape[0] == 12288:
-                        weight.data = pad_weight(weight.data, 32)
+                    if self.use_fa_pad and (re.findall(qkv_words, layername)):
+                        if not gemm_bank_conf(weight.data.shape[0]):
+                            weight.data = pad_weight(weight.data, 32)
                                    
                    _weight = torch.zeros_like(weight.data)
                    ori_shape =_weight.shape

--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -108,7 +108,7 @@ class GLMAttention(nn.Module):
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.query_key_value(hidden_states)
-        if os.environ.get('FA_PAD') == '1' and qkv.shape[-1] == 12320:
+        if os.environ.get('FA_PAD') == '1':
            qkv = qkv[...,:-32]
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(position_ids, q, k)
@@ -421,14 +421,24 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA):
            ]
            combined_words = "|".join(lay_key_words)
            
+            lay_qkv_words = ["self_attention.query_key_value.weight"]   
+            qkv_words = "|".join(lay_qkv_words)  
+            
+            lay_qkv_bias_words = ["self_attention.query_key_value.bias"]   
+            qkv_bias_words = "|".join(lay_qkv_bias_words)
+            
            for layername, weight in params_dict.items():
+                if self.use_fa_pad and (re.findall(qkv_bias_words, layername)):
+                    weight.data = pad_weight(weight.data, 32)
+                    
                matches = re.findall(combined_words, layername)
                if matches:  
                    if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
                        weight.data = pad_weight(weight.data, 32)  
                        
-                    if self.use_fa_pad and weight.data.shape[0] == 12288:
-                        weight.data = pad_weight(weight.data, 32)
+                    if self.use_fa_pad and (re.findall(qkv_words, layername)):
+                        if not gemm_bank_conf(weight.data.shape[0]):
+                            weight.data = pad_weight(weight.data, 32)
                                        
                    _weight = torch.zeros_like(weight.data)
                    ori_shape =_weight.shape

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -175,7 +175,7 @@ class LlamaAttention(nn.Module):
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
-        if os.environ.get('FA_PAD') == '1' and qkv.shape[-1] == 12320:
+        if os.environ.get('FA_PAD') == '1':
            qkv = qkv[...,:-32]
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
@@ -531,14 +531,18 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
            ]
            combined_words = "|".join(lay_key_words)
            
+            lay_qkv_words = ["self_attn.qkv_proj.weight"]   
+            qkv_words = "|".join(lay_qkv_words)          
+            
            for layername, weight in params_dict.items():
                matches = re.findall(combined_words, layername)
                if matches:         
                    if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
                        weight.data = pad_weight(weight.data, 32)  
                        
-                    if self.use_fa_pad and weight.data.shape[0] == 12288:
-                        weight.data = pad_weight(weight.data, 32)
+                    if self.use_fa_pad and (re.findall(qkv_words, layername)):
+                        if not gemm_bank_conf(weight.data.shape[0]):
+                            weight.data = pad_weight(weight.data, 32)
                                 
                    _weight = torch.zeros_like(weight.data)
                    ori_shape =_weight.shape

--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -124,7 +124,7 @@ class QWenAttention(nn.Module):
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.c_attn(hidden_states)
-        if os.environ.get('FA_PAD') == '1' and qkv.shape[-1] == 12320:
+        if os.environ.get('FA_PAD') == '1':
            qkv = qkv[...,:-32]
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        q, k = self.rotary_emb(positions, q, k)
@@ -326,14 +326,24 @@ class QWenLMHeadModel(nn.Module):
            ]
            combined_words = "|".join(lay_key_words)
            
+            lay_qkv_words = ["attn.c_attn.weight"]   
+            qkv_words = "|".join(lay_qkv_words)  
+            
+            lay_qkv_bias_words = ["attn.c_attn.bias"]   
+            qkv_bias_words = "|".join(lay_qkv_bias_words) 
+                      
            for layername, weight in params_dict.items():
+                if self.use_fa_pad and (re.findall(qkv_bias_words, layername)):
+                    weight.data = pad_weight(weight.data, 32)
+                
                matches = re.findall(combined_words, layername)
                if matches:         
                    if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
                        weight.data = pad_weight(weight.data, 32)  
                        
-                    if self.use_fa_pad and weight.data.shape[0] == 12288:
-                        weight.data = pad_weight(weight.data, 32)
+                    if self.use_fa_pad and (re.findall(qkv_words, layername)):
+                        if not gemm_bank_conf(weight.data.shape[0]):
+                            weight.data = pad_weight(weight.data, 32)
                        
                    _weight = torch.zeros_like(weight.data)
                    ori_shape =_weight.shape

--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -156,7 +156,7 @@ class Qwen2Attention(nn.Module):
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
-        if os.environ.get('FA_PAD') == '1' and qkv.shape[-1] == 12320:
+        if os.environ.get('FA_PAD') == '1':
            qkv = qkv[...,:-32]
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)
@@ -411,14 +411,24 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
            ]
            combined_words = "|".join(lay_key_words)
            
+            lay_qkv_words = ["self_attn.qkv_proj.weight"]   
+            qkv_words = "|".join(lay_qkv_words)  
+            
+            lay_qkv_bias_words = ["self_attn.qkv_proj.bias"]   
+            qkv_bias_words = "|".join(lay_qkv_bias_words) 
+            
            for layername, weight in params_dict.items():
+                if self.use_fa_pad and (re.findall(qkv_bias_words, layername)):
+                    weight.data = pad_weight(weight.data, 32)
+                    
                matches = re.findall(combined_words, layername)
                if matches:   
                    if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
                        weight.data = pad_weight(weight.data, 32)  
-                        
-                    if self.use_fa_pad and weight.data.shape[0] == 12288:
-                        weight.data = pad_weight(weight.data, 32)
+                    
+                    if self.use_fa_pad and (re.findall(qkv_words, layername)):
+                        if not gemm_bank_conf(weight.data.shape[0]):
+                            weight.data = pad_weight(weight.data, 32)
                        
                    _weight = torch.zeros_like(weight.data)
                    ori_shape =_weight.shape