Commit 3f42b83d authored by zhuwenwen
Browse files

Merge branch 'v0.6.2-eval' into v0.6.2-dev

# Conflicts:
#	csrc/attention/static_switch_tc.h
#	vllm/model_executor/layers/vocab_parallel_embedding.py
#	vllm/model_executor/model_loader/utils.py
#	vllm/model_executor/models/llama.py
parents cbdc3a13 510401e2
......@@ -35,6 +35,8 @@ from transformers import PreTrainedTokenizerBase
from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
from triton.common.backend import compute_core_version_key
try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
......@@ -44,6 +46,7 @@ PROMPT = "You are a helpful assistant in recognizes the content of tables in mar
def test_prefix(llm=None, sampling_params=None, prompts=None):
version_key = compute_core_version_key()
start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params)
......
......@@ -43,6 +43,9 @@
} else if (HEADDIM == 160) { \
constexpr static int HEAD_SIZE = 160; \
return __VA_ARGS__(); \
} else if (HEADDIM == 192) { \
constexpr static int HEAD_SIZE = 192; \
return __VA_ARGS__(); \
} else if (HEADDIM == 256) { \
constexpr static int HEAD_SIZE = 256; \
return __VA_ARGS__(); \
......
......@@ -22,7 +22,6 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
def __init__(self):
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
self.use_lm_nn = os.environ.get('LM_NN') == '1'
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
......@@ -42,7 +41,7 @@ class UnquantizedEmbeddingMethod(QuantizeMethodBase):
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
if self.use_llama_nn and self.use_lm_nn:
if self.use_llama_nn and os.environ['LM_NN'] == '1':
if bias is not None:
if len(x.shape) == 2:
return torch.addmm(bias, x, layer.weight)
......
......@@ -219,4 +219,4 @@ class ModelRegistry:
__all__ = [
"ModelRegistry",
]
]
\ No newline at end of file
......@@ -726,4 +726,4 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsMultiModal):
ops.trans_w16_gemm(_weight, weight.data, _weight.shape[0], _weight.shape[1])
weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1], -1)
weight.data=weight.data.reshape(ori_shape[1], -1)
\ No newline at end of file
......@@ -574,18 +574,20 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
"self_attn.qkv_proj.weight",
"self_attn.o_proj.weight",
"mlp.gate_up_proj.weight",
"mlp.down_proj.weight"
"mlp.down_proj.weight",
"lm_head.weight"
]
if self.use_lm_nn:
lay_key_words.append("lm_head.weight")
combined_words = "|".join(lay_key_words)
lay_qkv_words = ["self_attn.qkv_proj.weight"]
qkv_words = "|".join(lay_qkv_words)
for layername, weight in params_dict.items():
if "lm_head.weight" in layername:
os.environ['LM_NN'] = '1'
else:
os.environ['LM_NN'] = '0'
matches = re.findall(combined_words, layername)
if matches:
if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment