Merge branch 'v0.6.2-eval' into v0.6.2-dev

# Conflicts:
#	csrc/attention/static_switch_tc.h
#	vllm/model_executor/layers/vocab_parallel_embedding.py
#	vllm/model_executor/model_loader/utils.py
#	vllm/model_executor/models/llama.py

Merge branch 'v0.6.2-eval' into v0.6.2-dev
# Conflicts:
#	csrc/attention/static_switch_tc.h
#	vllm/model_executor/layers/vocab_parallel_embedding.py
#	vllm/model_executor/model_loader/utils.py
#	vllm/model_executor/models/llama.py
3f42b83d · zhuwenwen · cbdc3a13 · 510401e2 · 3f42b83d · 3f42b83d
Commit 3f42b83d authored Oct 30, 2024 by zhuwenwen
6 changed files
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -35,6 +35,8 @@ from transformers import PreTrainedTokenizerBase
 from vllm import LLM, SamplingParams
 from vllm.utils import FlexibleArgumentParser

+from triton.common.backend import compute_core_version_key
+
 try:
    from vllm.transformers_utils.tokenizer import get_tokenizer
 except ImportError:
@@ -44,6 +46,7 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in mar


 def test_prefix(llm=None, sampling_params=None, prompts=None):
+    version_key = compute_core_version_key()
    start_time = time.time()

    llm.generate(prompts, sampling_params=sampling_params)

--- a/csrc/attention/static_switch_tc.h
+++ b/csrc/attention/static_switch_tc.h
@@ -43,6 +43,9 @@
    } else if (HEADDIM == 160) {           \
      constexpr static int HEAD_SIZE = 160; \
      return __VA_ARGS__();                \
+    } else if (HEADDIM == 192) {           \
+      constexpr static int HEAD_SIZE = 192; \
+      return __VA_ARGS__();                \
    } else if (HEADDIM == 256) {           \
      constexpr static int HEAD_SIZE = 256; \
      return __VA_ARGS__();                \

--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -22,7 +22,6 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
    
    def __init__(self):
        self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
-        self.use_lm_nn = os.environ.get('LM_NN') == '1'

    def create_weights(self, layer: torch.nn.Module,
                       input_size_per_partition: int,
@@ -42,7 +41,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        if self.use_llama_nn and self.use_lm_nn:
+        if self.use_llama_nn and os.environ['LM_NN'] == '1':
            if bias is not None:
                if len(x.shape) == 2: 
                    return torch.addmm(bias, x, layer.weight)

--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -219,4 +219,4 @@ class ModelRegistry:

 __all__ = [
    "ModelRegistry",
-]
+]
\ No newline at end of file
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -726,4 +726,4 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
                    ops.trans_w16_gemm(_weight, weight.data, _weight.shape[0], _weight.shape[1])
                    weight.data.copy_(_weight)
                    
-                    weight.data=weight.data.reshape(ori_shape[1], -1)
+                    weight.data=weight.data.reshape(ori_shape[1], -1)
\ No newline at end of file
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -574,18 +574,20 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                "self_attn.qkv_proj.weight",
                "self_attn.o_proj.weight",
                "mlp.gate_up_proj.weight",
-                "mlp.down_proj.weight"
+                "mlp.down_proj.weight",
+                "lm_head.weight"
            ]
            
-            if self.use_lm_nn:
-                lay_key_words.append("lm_head.weight")
-                
            combined_words = "|".join(lay_key_words)
            
            lay_qkv_words = ["self_attn.qkv_proj.weight"]   
            qkv_words = "|".join(lay_qkv_words)          
            
            for layername, weight in params_dict.items():
+                if "lm_head.weight" in layername:
+                    os.environ['LM_NN'] = '1'  
+                else:
+                    os.environ['LM_NN'] = '0' 
                matches = re.findall(combined_words, layername)
                if matches:         
                    if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):