Commit 705f6a35 authored by zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
......@@ -3,8 +3,8 @@ from typing import Optional
from torch import nn
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, ParallelConfig, SchedulerConfig,
VisionLanguageConfig)
ModelConfig, MultiModalConfig, ParallelConfig,
SchedulerConfig)
from vllm.model_executor.model_loader.loader import (BaseModelLoader,
get_model_loader)
from vllm.model_executor.model_loader.utils import (
......@@ -15,13 +15,13 @@ def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
device_config: DeviceConfig, parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig) -> nn.Module:
loader = get_model_loader(load_config)
return loader.load_model(model_config=model_config,
device_config=device_config,
lora_config=lora_config,
vision_language_config=vision_language_config,
multimodal_config=multimodal_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
cache_config=cache_config)
......
......@@ -16,15 +16,15 @@ from huggingface_hub import HfApi, hf_hub_download
from torch import nn
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat,
LoRAConfig, ModelConfig, ParallelConfig,
SchedulerConfig, VisionLanguageConfig)
LoRAConfig, ModelConfig, MultiModalConfig,
ParallelConfig, SchedulerConfig)
from vllm.envs import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader.tensorizer import (
TensorizerConfig, is_vllm_tensorized, load_with_tensorizer,
tensorizer_weights_iterator)
serialize_vllm_model, tensorizer_weights_iterator)
from vllm.model_executor.model_loader.utils import (get_model_architecture,
set_default_torch_dtype)
from vllm.model_executor.model_loader.weight_utils import (
......@@ -32,8 +32,11 @@ from vllm.model_executor.model_loader.weight_utils import (
filter_duplicate_safetensors_files, filter_files_not_needed_for_inference,
get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
pt_weights_iterator, safetensors_weights_iterator)
from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
from vllm.model_executor.models.interfaces import (supports_lora,
supports_vision)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
from vllm.utils import is_tpu
logger = init_logger(__name__)
......@@ -44,7 +47,7 @@ def _get_quantization_config(
"""Get the quantization config."""
if model_config.quantization is not None:
quant_config = get_quant_config(model_config, load_config)
capability = torch.cuda.get_device_capability()
capability = current_platform.get_device_capability()
capability = capability[0] * 10 + capability[1]
if capability < quant_config.get_min_capability():
raise ValueError(
......@@ -66,12 +69,15 @@ def _get_quantization_config(
def _get_model_initialization_kwargs(
model_class: Type[nn.Module], lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig]
model_class: Type[nn.Module],
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
) -> Dict[str, Any]:
"""Get extra kwargs for model initialization."""
extra_kwargs = {}
if hasattr(model_class, "supported_lora_modules"):
extra_kwargs: Dict[str, Any] = {}
if supports_lora(model_class):
# lora_config=None is used to disable LoRA
extra_kwargs["lora_config"] = lora_config
elif lora_config:
raise ValueError(
......@@ -79,19 +85,20 @@ def _get_model_initialization_kwargs(
"but LoRA is enabled. Support for this model may "
"be added in the future. If this is important to you, "
"please open an issue on github.")
elif issubclass(model_class, VisionLanguageModelBase):
if vision_language_config is None:
raise ValueError("Provide `image_input_type` and other vision "
"related configurations through LLM entrypoint "
"or engine arguments.")
extra_kwargs["vision_language_config"] = vision_language_config
if supports_vision(model_class):
if multimodal_config is None:
raise ValueError("Provide vision related configurations "
"through LLM entrypoint or engine arguments.")
extra_kwargs["multimodal_config"] = multimodal_config
return extra_kwargs
def _initialize_model(model_config: ModelConfig, load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig) -> nn.Module:
"""Initialize a model with the given configurations."""
model_class = get_model_architecture(model_config)[0]
......@@ -101,7 +108,7 @@ def _initialize_model(model_config: ModelConfig, load_config: LoadConfig,
cache_config=cache_config,
quant_config=quant_config,
**_get_model_initialization_kwargs(
model_class, lora_config, vision_language_config))
model_class, lora_config, multimodal_config))
class BaseModelLoader(ABC):
......@@ -114,7 +121,7 @@ class BaseModelLoader(ABC):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
......@@ -230,24 +237,38 @@ class DefaultModelLoader(BaseModelLoader):
if self.load_config.load_format == LoadFormat.NPCACHE:
# Currently np_cache only support *.bin checkpoints
assert use_safetensors is False
return np_cache_weights_iterator(model_name_or_path,
self.load_config.download_dir,
hf_folder, hf_weights_files)
if use_safetensors:
return safetensors_weights_iterator(hf_weights_files)
return pt_weights_iterator(hf_weights_files)
weights_iterator = np_cache_weights_iterator(
model_name_or_path, self.load_config.download_dir, hf_folder,
hf_weights_files)
elif use_safetensors:
weights_iterator = safetensors_weights_iterator(hf_weights_files)
else:
weights_iterator = pt_weights_iterator(hf_weights_files)
if is_tpu():
# In PyTorch XLA, we should call `xm.mark_step` frequently so that
# not too many ops are accumulated in the XLA program.
import torch_xla.core.xla_model as xm
def _xla_weights_iterator(iterator: Generator):
for weights in iterator:
yield weights
xm.mark_step()
weights_iterator = _xla_weights_iterator(weights_iterator)
return weights_iterator
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, vision_language_config,
lora_config, multimodal_config,
cache_config)
model.load_weights(
self._get_weights_iterator(model_config.model,
......@@ -280,14 +301,14 @@ class DummyModelLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, vision_language_config,
lora_config, multimodal_config,
cache_config)
# NOTE(woosuk): For accurate performance evaluation, we assign
# random values to the weights.
......@@ -321,7 +342,7 @@ class TensorizerLoader(BaseModelLoader):
model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig,
) -> nn.Module:
"""Load a serialized model with tensorizer to the CPU.
......@@ -334,7 +355,7 @@ class TensorizerLoader(BaseModelLoader):
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, vision_language_config,
lora_config, multimodal_config,
cache_config)
model.load_weights(self._get_weights_iterator())
......@@ -345,7 +366,7 @@ class TensorizerLoader(BaseModelLoader):
model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig,
) -> nn.Module:
"""Load a serialized model with tensorizer.
......@@ -359,7 +380,7 @@ class TensorizerLoader(BaseModelLoader):
quant_config = _get_quantization_config(
model_config, self.load_config)
extra_kwargs = _get_model_initialization_kwargs(
model_class, lora_config, vision_language_config)
model_class, lora_config, multimodal_config)
extra_kwargs["quant_config"] = quant_config
extra_kwargs["cache_config"] = cache_config
......@@ -374,22 +395,36 @@ class TensorizerLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
self._verify_config(model_config, parallel_config)
if parallel_config.tensor_parallel_size > 1:
from vllm.distributed import get_tensor_model_parallel_rank
self.tensorizer_config.tensorizer_uri = \
self.tensorizer_config.tensorizer_uri \
% get_tensor_model_parallel_rank()
if is_vllm_tensorized(self.tensorizer_config):
return self._load_model_serialized(model_config, device_config,
lora_config,
vision_language_config,
lora_config, multimodal_config,
cache_config)
return self._load_model_serialized_cpu(model_config, device_config,
lora_config,
vision_language_config,
lora_config, multimodal_config,
cache_config)
@staticmethod
def save_model(
    model: torch.nn.Module,
    tensorizer_config: TensorizerConfig,
) -> None:
    """Serialize ``model`` according to ``tensorizer_config``.

    Thin wrapper that forwards both arguments, by keyword, to
    ``serialize_vllm_model``; nothing is returned.
    """
    serialize_vllm_model(
        model=model,
        tensorizer_config=tensorizer_config,
    )
class ShardedStateLoader(BaseModelLoader):
"""
......@@ -418,7 +453,8 @@ class ShardedStateLoader(BaseModelLoader):
Filter out all tensors that share the same memory or a subset of the
memory of another tensor.
"""
same_storage_groups = collections.defaultdict(list)
same_storage_groups: Dict[Any, List[Tuple[
str, torch.Tensor]]] = collections.defaultdict(list)
for key, tensor in tensors.items():
if tensor.numel():
ptr = tensor.untyped_storage().data_ptr()
......@@ -427,7 +463,7 @@ class ShardedStateLoader(BaseModelLoader):
def get_end_ptr(tensor: torch.Tensor) -> int:
return tensor.view(-1)[-1].data_ptr() + tensor.element_size()
result = {}
result: Dict[str, torch.Tensor] = {}
for group in same_storage_groups.values():
for k, t in group:
a, b = t.data_ptr(), get_end_ptr(t)
......@@ -459,7 +495,7 @@ class ShardedStateLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
......@@ -473,7 +509,7 @@ class ShardedStateLoader(BaseModelLoader):
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, vision_language_config,
lora_config, multimodal_config,
cache_config)
rank = get_tensor_model_parallel_rank()
pattern = os.path.join(
......@@ -769,14 +805,14 @@ class BitsAndBytesModelLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, vision_language_config,
lora_config, multimodal_config,
cache_config)
self._load_weights(model_config, model)
......
# ruff: noqa: SIM117
from pathlib import Path
from typing import List, Optional, Tuple
import openvino as ov
import torch
from huggingface_hub import HfApi
from openvino._offline_transformations import paged_attention_transformation
from optimum.intel import OVModelForCausalLM
from torch import nn
import vllm.envs as envs
from vllm.attention.backends.openvino import OpenVINOAttentionMetadata
from vllm.config import DeviceConfig, ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.logits_processor import (LogitsProcessor,
_prune_hidden_states)
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import SamplerOutput
logger = init_logger(__name__)
def _flattenize_inputs(inputs):
"""
Helper function for making nested inputs flattens
"""
flatten_inputs = []
for input_data in inputs:
if input_data is None:
continue
if isinstance(input_data, (list, tuple)):
flatten_inputs.extend(_flattenize_inputs(input_data))
elif isinstance(input_data, dict):
flatten_inputs.extend(_flattenize_inputs(list(
input_data.values())))
else:
flatten_inputs.append(input_data)
return flatten_inputs
def _modify_cache_parameters(model: ov.Model, kv_cache_dtype: ov.Type,
                             is_cpu: bool):
    """Rewrite the KV-cache parameters of ``model`` in place.

    Every model parameter whose output tensor carries exactly one name
    starting with ``"key_cache."`` or ``"value_cache."`` gets a new partial
    shape (the CPU layout when ``is_cpu`` is true, the GPU layout otherwise)
    and the element type ``kv_cache_dtype``. All other parameters are left
    untouched. The model is re-validated once after all parameters have
    been processed.
    """
    # Apply hardware dependent modifications to KV tensors
    for param in model.get_parameters():
        tensor_names = param.get_output_tensor(0).get_names()
        if len(tensor_names) != 1:
            continue

        tensor_name = next(iter(tensor_names))
        shape = param.get_partial_shape()

        # use real block size if available, just a placeholder
        # to provide the expected rank
        x_size = 1
        num_blocks = ov.Dimension()
        block_size = ov.Dimension()
        head_size = ov.Dimension()

        # TODO: Negotiate required layout with plugins (CPU is ~OK, GPU is TBD),
        # pass more parameters to this function to set more static dimensions
        if tensor_name.startswith("key_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            if shape[2].is_static:
                packed_dim = shape[2].get_length() // x_size
            else:
                packed_dim = ov.Dimension()
            gpu_shape = [num_blocks, shape[1], packed_dim, block_size, x_size]
        elif tensor_name.startswith("value_cache."):
            cpu_shape = [num_blocks, shape[1], block_size, head_size]
            gpu_shape = [num_blocks, shape[1], shape[2], block_size]
        else:
            # Not a KV-cache tensor.
            continue

        if is_cpu:
            target_shape = cpu_shape
        else:
            target_shape = gpu_shape
        param.set_partial_shape(ov.PartialShape(target_shape))
        param.set_element_type(kv_cache_dtype)

    model.validate_nodes_and_infer_types()
def _require_model_export(model_id, revision=None, subfolder=None):
model_dir = Path(model_id)
if subfolder is not None:
model_dir = model_dir / subfolder
if model_dir.is_dir():
return (not (model_dir / "openvino_model.xml").exists()
or not (model_dir / "openvino_model.bin").exists())
hf_api = HfApi()
try:
model_info = hf_api.model_info(model_id, revision=revision or "main")
normalized_subfolder = (None if subfolder is None else
Path(subfolder).as_posix())
model_files = [
file.rfilename for file in model_info.siblings
if normalized_subfolder is None
or file.rfilename.startswith(normalized_subfolder)
]
ov_model_path = ("openvino_model.xml" if normalized_subfolder is None
else f"{normalized_subfolder}/openvino_model.xml")
return (ov_model_path not in model_files
or ov_model_path.replace(".xml", ".bin") not in model_files)
except Exception:
return True
class OpenVINOCasualLM(nn.Module):
    """Causal language model executed through an OpenVINO infer request.

    The model is loaded with ``OVModelForCausalLM`` (and exported to IR when
    no OpenVINO IR is found), rewritten by
    ``paged_attention_transformation``, has its KV-cache parameters adjusted
    by ``_modify_cache_parameters`` and is then compiled.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        device_config: DeviceConfig,
        kv_cache_dtype: ov.Type,
    ) -> None:
        """Load, transform and compile the model named by ``model_config``.

        Args:
            model_config: supplies the model id, the HF config (for the
                vocabulary size) and ``trust_remote_code``.
            device_config: its ``device.type`` selects the CPU or GPU
                KV-cache layout.
            kv_cache_dtype: element type applied to the KV-cache parameters.
        """
        super().__init__()

        vocab_size = model_config.hf_config.vocab_size
        self.logits_processor = LogitsProcessor(vocab_size,
                                                logits_as_input=True)
        self.sampler = Sampler()

        needs_export = _require_model_export(model_config.model)
        if not needs_export:
            logger.warning(
                "OpenVINO IR is available for provided model id "  # noqa: G004
                f"{model_config.model}. This IR will be used for inference "
                "as-is, all possible options that may affect model conversion "
                "are ignored.")
        else:
            logger.warning(
                f"Provided model id {model_config.model} does not "  # noqa: G004
                "contain OpenVINO IR, the model will be converted to IR with "
                "default options. If you need to use specific options for "
                "model conversion, use optimum-cli export openvino with "
                "desired options.")

        pt_model = OVModelForCausalLM.from_pretrained(
            model_config.model,
            export=needs_export,
            compile=False,
            load_in_8bit=envs.VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS,
            trust_remote_code=model_config.trust_remote_code,
        )

        ov_model = pt_model.model
        paged_attention_transformation(ov_model)
        is_cpu = device_config.device.type == "cpu"
        _modify_cache_parameters(ov_model, kv_cache_dtype, is_cpu)

        # NOTE(review): the compile target is the literal "CPU" regardless of
        # device_config -- confirm this is intended.
        compiled_model = ov.Core().compile_model(ov_model, "CPU")
        self.ov_request = compiled_model.create_infer_request()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[Tuple[ov.Tensor, ov.Tensor]],
        attn_metadata: OpenVINOAttentionMetadata,
    ) -> torch.Tensor:
        """Run one inference request and return the logits as a 2-D tensor.

        The inputs are passed positionally: token ids, positions, the
        flattened KV caches, then the attention metadata fields. The request
        is started asynchronously and waited on before the ``"logits"``
        tensor is read.
        """
        request_inputs = [input_ids, positions]
        request_inputs.extend(_flattenize_inputs(kv_caches))
        request_inputs.extend([
            attn_metadata.past_lens,
            attn_metadata.subsequence_begins,
            attn_metadata.block_indices,
            attn_metadata.block_indices_begins,
            attn_metadata.max_context_len,
        ])

        self.ov_request.start_async(request_inputs, share_inputs=True)
        self.ov_request.wait()

        raw_logits = self.ov_request.get_tensor("logits").data
        logits = torch.from_numpy(raw_logits)

        # TODO: remove 'view' once OpenVINO PA will drop 'seq_len' dimension
        return logits.view(-1, logits.shape[-1])

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Prune ``hidden_states`` and run them through the logits processor.

        ``None`` is passed as the processor's first argument; presumably no
        LM head is needed because the processor was built with
        ``logits_as_input=True`` -- TODO confirm.
        """
        pruned = _prune_hidden_states(hidden_states, sampling_metadata)
        return self.logits_processor(None, pruned, sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Return the result of applying the sampler to ``logits``."""
        return self.sampler(logits, sampling_metadata)
def get_model(
    model_config: ModelConfig,
    device_config: DeviceConfig,
    kv_cache_dtype: ov.Type,
    **kwargs,
) -> torch.nn.Module:
    """Build an ``OpenVINOCasualLM`` for the given configuration.

    Only ``lora_config`` is read from ``kwargs``; any other keyword
    arguments are ignored.

    Raises:
        ValueError: if a truthy ``lora_config`` is passed, since LoRA is not
            supported here.
    """
    if kwargs.get("lora_config"):
        raise ValueError(
            "OpenVINO modeling does not support LoRA, "
            "but LoRA is enabled. Support for this model may "
            "be added in the future. If this is important to you, "
            "please open an issue on github.")

    return OpenVINOCasualLM(model_config, device_config, kv_cache_dtype)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment