Unverified commit cf73f0c9, authored by Cyrus Leung and committed by GitHub
Browse files

[Model] Enable optional prefix when loading embedding models (#10639)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent b1d92053
...@@ -14,18 +14,17 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ...@@ -14,18 +14,17 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
RowParallelLinear) RowParallelLinear)
from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler,
PoolingType) PoolingType)
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization import QuantizationConfig
QuantizationConfig)
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding) VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.interfaces import SupportsCrossEncoding
from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.sequence import IntermediateTensors, PoolerOutput
from vllm.transformers_utils.config import ( from vllm.transformers_utils.config import (
get_cross_encoder_activation_function) get_cross_encoder_activation_function)
from .utils import maybe_prefix from .interfaces import SupportsCrossEncoding
from .utils import WeightsMapper, maybe_prefix
class BertEmbedding(nn.Module): class BertEmbedding(nn.Module):
...@@ -442,6 +441,8 @@ class BertEmbeddingModel(nn.Module): ...@@ -442,6 +441,8 @@ class BertEmbeddingModel(nn.Module):
return self._pooler(hidden_states, pooling_metadata) return self._pooler(hidden_states, pooling_metadata)
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
weights = hf_to_vllm_mapper.apply(weights)
self.model.load_weights(weights) self.model.load_weights(weights)
def _build_model(self, def _build_model(self,
......
...@@ -42,7 +42,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -42,7 +42,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.sequence import IntermediateTensors, PoolerOutput
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, extract_layer_index, from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index,
is_pp_missing_parameter, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix)
...@@ -511,4 +511,6 @@ class Gemma2EmbeddingModel(nn.Module, SupportsPP): ...@@ -511,4 +511,6 @@ class Gemma2EmbeddingModel(nn.Module, SupportsPP):
return self._pooler(hidden_states, pooling_metadata) return self._pooler(hidden_states, pooling_metadata)
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
weights = hf_to_vllm_mapper.apply(weights)
self.model.load_weights(weights) self.model.load_weights(weights)
...@@ -53,7 +53,8 @@ from vllm.platforms import current_platform ...@@ -53,7 +53,8 @@ from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.sequence import IntermediateTensors, PoolerOutput
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix)
...@@ -689,6 +690,8 @@ class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): ...@@ -689,6 +690,8 @@ class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
return self._pooler(hidden_states, pooling_metadata) return self._pooler(hidden_states, pooling_metadata)
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
weights = hf_to_vllm_mapper.apply(weights)
self.model.load_weights(weights) self.model.load_weights(weights)
def load_kv_cache_scales(self, quantization_param_path: str) -> None: def load_kv_cache_scales(self, quantization_param_path: str) -> None:
......
...@@ -50,7 +50,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata ...@@ -50,7 +50,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.sequence import IntermediateTensors, PoolerOutput
from .interfaces import SupportsLoRA, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix) maybe_prefix)
...@@ -585,8 +586,7 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): ...@@ -585,8 +586,7 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
) -> Optional[PoolerOutput]: ) -> Optional[PoolerOutput]:
return self._pooler(hidden_states, pooling_metadata) return self._pooler(hidden_states, pooling_metadata)
def load_weights(self, weights: Iterable[Tuple[str, def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
torch.Tensor]]) -> Set[str]: hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""})
loader = AutoWeightsLoader(self, weights = hf_to_vllm_mapper.apply(weights)
ignore_unexpected_prefixes=["lm_head."]) self.model.load_weights(weights)
return loader.load_weights(weights)
...@@ -11,13 +11,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -11,13 +11,14 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding) VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel
from vllm.model_executor.models.interfaces import SupportsCrossEncoding
from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.models.utils import maybe_prefix
from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.sequence import IntermediateTensors, PoolerOutput
from vllm.transformers_utils.config import ( from vllm.transformers_utils.config import (
get_cross_encoder_activation_function) get_cross_encoder_activation_function)
from .interfaces import SupportsCrossEncoding
class RobertaEmbedding(nn.Module): class RobertaEmbedding(nn.Module):
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment