Commit b72de6bd authored by zhuwenwen
Browse files

Merge branch 'v0.7.2-dev' into v0.7.3-dev

parents ec5e299c 437e6aef
......@@ -6,7 +6,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.48.2 # Required for Bamba model and Transformers backend.
transformers >= 4.49.0 # Required for Bamba model and Transformers backend.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.107.0, < 0.113.0; python_version < '3.9'
......
......@@ -324,7 +324,8 @@ class ModelConfig:
# Set enforce_eager to False if the value is unset.
if self.enforce_eager is None:
self.enforce_eager = False
# self.enforce_eager = False
self.enforce_eager = True
sliding_window = getattr(self.hf_text_config, "sliding_window", None)
has_interleaved_attention = (sliding_window is not None) and (
......
......@@ -597,7 +597,8 @@ class EngineArgs:
'This should be a JSON string that will be '
'parsed into a dictionary.')
parser.add_argument('--enforce-eager',
action='store_true',
# action='store_true',
default=True,
help='Always use eager-mode PyTorch. If False, '
'will use eager mode and CUDA graph in hybrid '
'for maximal performance and flexibility.')
......
......@@ -88,7 +88,7 @@ def get_model_architecture(
model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
architectures = getattr(model_config.hf_config, "architectures", [])
visions = getattr(model_config.hf_config, "visual", []) or getattr(model_config.hf_config, "vision_config", [])
support_nn_architectures = ['LlamaForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2MoeForCausalLM', 'ChatGLMModel', 'ChatGLMForConditionalGeneration',
support_nn_architectures = ['LlamaForCausalLM', 'QWenLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration', 'Qwen2MoeForCausalLM', 'ChatGLMModel', 'ChatGLMForConditionalGeneration',
'BaichuanForCausalLM', 'BloomForCausalLM', 'MedusaModel', 'MixtralForCausalLM', 'MLPSpeculatorPreTrainedModel', 'FalconForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM']
if any(arch in architectures for arch in support_nn_architectures):
if os.getenv('LLAMA_NN') != '0':
......
......@@ -69,6 +69,11 @@ from .utils import (AutoWeightsLoader, WeightsMapper,
merge_multimodal_embeddings)
from .vision import get_vit_attn_backend
import os
import re
from vllm import _custom_ops as ops
from vllm.model_executor.utils import pad_weight, gemm_bank_conf
logger = init_logger(__name__)
# === Vision Inputs === #
......@@ -528,6 +533,16 @@ class Qwen2_5_VisionTransformer(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.merger",
)
self.quant_method = None
if quant_config is not None:
self.quant_method=quant_config.get_name()
self.quant_config=quant_config
self.use_llama_nn = os.environ.get('LLAMA_NN') == '1'
self.use_gemm_pad = os.environ.get('GEMM_PAD') == '1'
self.use_fa_pad = os.environ.get('FA_PAD') == '1'
self.use_awq_pad = os.environ.get('AWQ_PAD') == '1'
@property
def dtype(self) -> torch.dtype:
......@@ -675,6 +690,49 @@ class Qwen2_5_VisionTransformer(nn.Module):
default_weight_loader)
weight_loader(param, loaded_weight)
loaded_params.add(name)
if self.use_llama_nn and self.quant_method is None:
lay_key_words = [
"attn.qkv.weight",
"attn.proj.weight",
"mlp.0.weight",
"mlp.2.weight",
"self_attn.qkv_proj.weight",
"self_attn.o_proj.weight",
"mlp.gate_up_proj.weight",
"mlp.down_proj.weight",
"lm_head.weight",
]
combined_words = "|".join(lay_key_words)
# lay_qkv_words = ["attn.qkv.weight"]
# qkv_words = "|".join(lay_qkv_words)
# lay_qkv_bias_words = ["attn.qkv.bias"]
# qkv_bias_words = "|".join(lay_qkv_bias_words)
for layername in loaded_params:
weight = params_dict[layername]
# if self.use_fa_pad and (re.findall(qkv_bias_words, layername)):
# weight.data = pad_weight(weight.data, 32)
matches = re.findall(combined_words, layername)
if matches:
# if self.use_gemm_pad and gemm_bank_conf(weight.data.shape[0]):
# weight.data = pad_weight(weight.data, 32)
# if self.use_fa_pad and (re.findall(qkv_words, layername)):
# if not gemm_bank_conf(weight.data.shape[0]):
# weight.data = pad_weight(weight.data, 32)
_weight = torch.zeros_like(weight.data)
ori_shape =_weight.shape
ops.trans_w16_gemm(_weight, weight.data, _weight.shape[0], _weight.shape[1])
weight.data.copy_(_weight)
weight.data=weight.data.reshape(ori_shape[1],-1)
return loaded_params
......@@ -1103,4 +1161,4 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
return MultiModelKeys.from_string_field(
language_model="language_model",
connector="visual.",
tower_model="visual.merger.")
tower_model="visual.merger.")
\ No newline at end of file
......@@ -1468,4 +1468,4 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
return MultiModelKeys.from_string_field(
language_model="language_model",
connector="visual.",
tower_model="visual.merger.")
tower_model="visual.merger.")
\ No newline at end of file
......@@ -425,4 +425,4 @@ class MultiModalDataParser:
mm_items[k] = subparsers[k](v)
return mm_items
return mm_items
\ No newline at end of file
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment