"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "cd9e5b8340b4c9dc093b7bd6f960fc2fdddf2b98"
Unverified Commit 82e7e19a authored by Kyle Sayers, committed by GitHub
Browse files

[SupportsQuant] Chameleon, Chatglm, Commandr (#15952)


Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
parent 421c4629
...@@ -38,7 +38,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor, ...@@ -38,7 +38,8 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP from .interfaces import (MultiModalEmbeddings, SupportsMultiModal, SupportsPP,
SupportsQuant)
from .utils import (flatten_bn, is_pp_missing_parameter, from .utils import (flatten_bn, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix, merge_multimodal_embeddings) maybe_prefix, merge_multimodal_embeddings)
...@@ -927,7 +928,11 @@ class ChameleonModel(nn.Module): ...@@ -927,7 +928,11 @@ class ChameleonModel(nn.Module):
info=ChameleonProcessingInfo, info=ChameleonProcessingInfo,
dummy_inputs=ChameleonDummyInputsBuilder) dummy_inputs=ChameleonDummyInputsBuilder)
class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
SupportsPP): SupportsPP, SupportsQuant):
packed_modules_mapping = {
"qkv_proj": ["q_proj", "k_proj", "v_proj"],
"gate_up_proj": ["gate_proj", "up_proj"]
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
......
...@@ -29,7 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -29,7 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import ChatGLMConfig from vllm.transformers_utils.configs import ChatGLMConfig
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix)
...@@ -295,7 +295,11 @@ class GLMTransformer(nn.Module): ...@@ -295,7 +295,11 @@ class GLMTransformer(nn.Module):
@support_torch_compile @support_torch_compile
class ChatGLMModel(nn.Module): class ChatGLMModel(nn.Module, SupportsQuant):
packed_modules_mapping = {
"linear_proj.merged_proj":
["linear_proj.gate_proj", "linear_proj.dense_h_to_4h"]
}
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__() super().__init__()
...@@ -395,7 +399,6 @@ class ChatGLMModel(nn.Module): ...@@ -395,7 +399,6 @@ class ChatGLMModel(nn.Module):
class ChatGLMBaseModel(nn.Module): class ChatGLMBaseModel(nn.Module):
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_substr={".word_embeddings": ""}, ) orig_to_new_substr={".word_embeddings": ""}, )
...@@ -452,7 +455,8 @@ class ChatGLMBaseModel(nn.Module): ...@@ -452,7 +455,8 @@ class ChatGLMBaseModel(nn.Module):
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP): class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
SupportsQuant):
packed_modules_mapping = { packed_modules_mapping = {
"query_key_value": ["query_key_value"], "query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"] "dense_h_to_4h": ["dense_h_to_4h"]
......
...@@ -49,7 +49,7 @@ from vllm.model_executor.utils import set_weight_attrs ...@@ -49,7 +49,7 @@ from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant
from .utils import (extract_layer_index, is_pp_missing_parameter, from .utils import (extract_layer_index, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix)
...@@ -332,7 +332,7 @@ class CohereModel(nn.Module): ...@@ -332,7 +332,7 @@ class CohereModel(nn.Module):
return hidden_states return hidden_states
class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP): class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP, SupportsQuant):
packed_modules_mapping = { packed_modules_mapping = {
"qkv_proj": [ "qkv_proj": [
"q_proj", "q_proj",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment