Commit 3f42b83d authored by zhuwenwen
Browse files

Merge branch 'v0.6.2-eval' into v0.6.2-dev

# Conflicts:
#	csrc/attention/static_switch_tc.h
#	vllm/model_executor/layers/vocab_parallel_embedding.py
#	vllm/model_executor/model_loader/utils.py
#	vllm/model_executor/models/llama.py
parents cbdc3a13 510401e2
...@@ -35,6 +35,8 @@ from transformers import PreTrainedTokenizerBase ...@@ -35,6 +35,8 @@ from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from triton.common.backend import compute_core_version_key
try: try:
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError: except ImportError:
...@@ -44,6 +46,7 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in mar ...@@ -44,6 +46,7 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in mar
def test_prefix(llm=None, sampling_params=None, prompts=None): def test_prefix(llm=None, sampling_params=None, prompts=None):
version_key = compute_core_version_key()
start_time = time.time() start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params) llm.generate(prompts, sampling_params=sampling_params)
......
...@@ -43,6 +43,9 @@ ...@@ -43,6 +43,9 @@
} else if (HEADDIM == 160) { \ } else if (HEADDIM == 160) { \
constexpr static int HEAD_SIZE = 160; \ constexpr static int HEAD_SIZE = 160; \
return __VA_ARGS__(); \ return __VA_ARGS__(); \
} else if (HEADDIM == 192) { \
constexpr static int HEAD_SIZE = 192; \
return __VA_ARGS__(); \
} else if (HEADDIM == 256) { \ } else if (HEADDIM == 256) { \
constexpr static int HEAD_SIZE = 256; \ constexpr static int HEAD_SIZE = 256; \
return __VA_ARGS__(); \ return __VA_ARGS__(); \
......
...@@ -22,7 +22,6 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): ...@@ -22,7 +22,6 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
def __init__(self): def __init__(self):
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1' self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
self.use_lm_nn = os.environ.get('LM_NN') == '1'
def create_weights(self, layer: torch.nn.Module, def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int, input_size_per_partition: int,
...@@ -42,7 +41,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase): ...@@ -42,7 +41,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
layer: torch.nn.Module, layer: torch.nn.Module,
x: torch.Tensor, x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor: bias: Optional[torch.Tensor] = None) -> torch.Tensor:
if self.use_llama_nn and self.use_lm_nn: if self.use_llama_nn and os.environ['LM_NN'] == '1':
if bias is not None: if bias is not None:
if len(x.shape) == 2: if len(x.shape) == 2:
return torch.addmm(bias, x, layer.weight) return torch.addmm(bias, x, layer.weight)
......
...@@ -219,4 +219,4 @@ class ModelRegistry: ...@@ -219,4 +219,4 @@ class ModelRegistry:
__all__ = [ __all__ = [
"ModelRegistry", "ModelRegistry",
] ]
\ No newline at end of file
...@@ -726,4 +726,4 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal): ...@@ -726,4 +726,4 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
ops.trans_w16_gemm(_weight, weight.data, _weight.shape[0], _weight.shape[1]) ops.trans_w16_gemm(_weight, weight.data, _weight.shape[0], _weight.shape[1])
weight.data.copy_(_weight) weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1], -1) weight.data=weight.data.reshape(ori_shape[1], -1)
\ No newline at end of file
...@@ -574,18 +574,20 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA): ...@@ -574,18 +574,20 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
"self_attn.qkv_proj.weight", "self_attn.qkv_proj.weight",
"self_attn.o_proj.weight", "self_attn.o_proj.weight",
"mlp.gate_up_proj.weight", "mlp.gate_up_proj.weight",
"mlp.down_proj.weight" "mlp.down_proj.weight",
"lm_head.weight"
] ]
if self.use_lm_nn:
lay_key_words.append("lm_head.weight")
combined_words = "|".join(lay_key_words) combined_words = "|".join(lay_key_words)
lay_qkv_words = ["self_attn.qkv_proj.weight"] lay_qkv_words = ["self_attn.qkv_proj.weight"]
qkv_words = "|".join(lay_qkv_words) qkv_words = "|".join(lay_qkv_words)
for layername, weight in params_dict.items(): for layername, weight in params_dict.items():
if "lm_head.weight" in layername:
os.environ['LM_NN'] = '1'
else:
os.environ['LM_NN'] = '0'
matches = re.findall(combined_words, layername) matches = re.findall(combined_words, layername)
if matches: if matches:
if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]): if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment