"csrc/vscode:/vscode.git/clone" did not exist on "230b131b54e8ad4ee9086a15c69b29b387ddb3b0"
Commit 0da696a7 authored by 王敏
Browse files

Merge remote-tracking branch 'origin/v0.11.0-dev' into v0.11.0-dev

parents 82c0bf76 6fa116fb
...@@ -210,8 +210,9 @@ class ModelConfig: ...@@ -210,8 +210,9 @@ class ModelConfig:
output will contain token ids.""" output will contain token ids."""
enable_prompt_embeds: bool = False enable_prompt_embeds: bool = False
"""If `True`, enables passing text embeddings as inputs via the """If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key. Note that enabling this will double the time required `prompt_embeds` key.
for graph compilation.""" WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
served_model_name: Optional[Union[str, list[str]]] = None served_model_name: Optional[Union[str, list[str]]] = None
"""The model name(s) used in the API. If multiple names are provided, the """The model name(s) used in the API. If multiple names are provided, the
server will respond to any of the provided names. The model name in the server will respond to any of the provided names. The model name in the
...@@ -284,6 +285,7 @@ class ModelConfig: ...@@ -284,6 +285,7 @@ class ModelConfig:
"""Configuration for multimodal model. If `None`, this will be inferred """Configuration for multimodal model. If `None`, this will be inferred
from the architecture of `self.model`.""" from the architecture of `self.model`."""
limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None limit_mm_per_prompt: InitVar[Optional[dict[str, int]]] = None
enable_mm_embeds: InitVar[bool | None] = None
media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None media_io_kwargs: InitVar[Optional[dict[str, dict[str, Any]]]] = None
mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None mm_processor_kwargs: InitVar[Optional[dict[str, Any]]] = None
mm_processor_cache_gb: InitVar[Optional[float]] = None mm_processor_cache_gb: InitVar[Optional[float]] = None
...@@ -353,6 +355,7 @@ class ModelConfig: ...@@ -353,6 +355,7 @@ class ModelConfig:
self, self,
# Multimodal config init vars # Multimodal config init vars
limit_mm_per_prompt: Optional[dict[str, int]], limit_mm_per_prompt: Optional[dict[str, int]],
enable_mm_embeds: bool | None,
media_io_kwargs: Optional[dict[str, dict[str, Any]]], media_io_kwargs: Optional[dict[str, dict[str, Any]]],
mm_processor_kwargs: Optional[dict[str, Any]], mm_processor_kwargs: Optional[dict[str, Any]],
mm_processor_cache_gb: Optional[float], mm_processor_cache_gb: Optional[float],
...@@ -618,6 +621,7 @@ class ModelConfig: ...@@ -618,6 +621,7 @@ class ModelConfig:
mm_config_kwargs = dict( mm_config_kwargs = dict(
limit_per_prompt=limit_mm_per_prompt, limit_per_prompt=limit_mm_per_prompt,
enable_mm_embeds=enable_mm_embeds,
media_io_kwargs=media_io_kwargs, media_io_kwargs=media_io_kwargs,
mm_processor_kwargs=mm_processor_kwargs, mm_processor_kwargs=mm_processor_kwargs,
mm_processor_cache_gb=mm_processor_cache_gb, mm_processor_cache_gb=mm_processor_cache_gb,
......
...@@ -26,6 +26,13 @@ class MultiModalConfig: ...@@ -26,6 +26,13 @@ class MultiModalConfig:
For example, to allow up to 16 images and 2 videos per prompt: For example, to allow up to 16 images and 2 videos per prompt:
`{"image": 16, "video": 2}`""" `{"image": 16, "video": 2}`"""
enable_mm_embeds: bool = False
"""If `True`, enables passing multimodal embeddings:
for `LLM` class, this refers to tensor inputs under `multi_modal_data`;
for the OpenAI-compatible server, this refers to chat messages with content
`"type": "*_embeds"`.
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities. """Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set For example, to set num_frames for video, set
......
...@@ -379,6 +379,7 @@ class EngineArgs: ...@@ -379,6 +379,7 @@ class EngineArgs:
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
limit_mm_per_prompt: dict[str, int] = \ limit_mm_per_prompt: dict[str, int] = \
get_field(MultiModalConfig, "limit_per_prompt") get_field(MultiModalConfig, "limit_per_prompt")
enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
media_io_kwargs: dict[str, dict[str, media_io_kwargs: dict[str, dict[str,
Any]] = get_field(MultiModalConfig, Any]] = get_field(MultiModalConfig,
...@@ -796,6 +797,9 @@ class EngineArgs: ...@@ -796,6 +797,9 @@ class EngineArgs:
) )
multimodal_group.add_argument("--limit-mm-per-prompt", multimodal_group.add_argument("--limit-mm-per-prompt",
**multimodal_kwargs["limit_per_prompt"]) **multimodal_kwargs["limit_per_prompt"])
multimodal_group.add_argument(
"--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
)
multimodal_group.add_argument("--media-io-kwargs", multimodal_group.add_argument("--media-io-kwargs",
**multimodal_kwargs["media_io_kwargs"]) **multimodal_kwargs["media_io_kwargs"])
multimodal_group.add_argument( multimodal_group.add_argument(
...@@ -1034,6 +1038,7 @@ class EngineArgs: ...@@ -1034,6 +1038,7 @@ class EngineArgs:
enable_prompt_embeds=self.enable_prompt_embeds, enable_prompt_embeds=self.enable_prompt_embeds,
served_model_name=self.served_model_name, served_model_name=self.served_model_name,
limit_mm_per_prompt=self.limit_mm_per_prompt, limit_mm_per_prompt=self.limit_mm_per_prompt,
enable_mm_embeds=self.enable_mm_embeds,
interleave_mm_strings=self.interleave_mm_strings, interleave_mm_strings=self.interleave_mm_strings,
media_io_kwargs=self.media_io_kwargs, media_io_kwargs=self.media_io_kwargs,
skip_mm_profiling=self.skip_mm_profiling, skip_mm_profiling=self.skip_mm_profiling,
......
...@@ -845,6 +845,10 @@ class MultiModalContentParser(BaseMultiModalContentParser): ...@@ -845,6 +845,10 @@ class MultiModalContentParser(BaseMultiModalContentParser):
allowed_media_domains=tracker.allowed_media_domains, allowed_media_domains=tracker.allowed_media_domains,
) )
# Read-only accessor: exposes the model configuration held by this parser's
# tracker (`self._tracker`) directly on the parser.
@property
def model_config(self) -> ModelConfig:
return self._tracker.model_config
def parse_image( def parse_image(
self, image_url: Optional[str], uuid: Optional[str] = None self, image_url: Optional[str], uuid: Optional[str] = None
) -> None: ) -> None:
...@@ -858,6 +862,12 @@ class MultiModalContentParser(BaseMultiModalContentParser): ...@@ -858,6 +862,12 @@ class MultiModalContentParser(BaseMultiModalContentParser):
image_embeds: Union[str, dict[str, str], None], image_embeds: Union[str, dict[str, str], None],
uuid: Optional[str] = None, uuid: Optional[str] = None,
) -> None: ) -> None:
mm_config = self.model_config.get_multimodal_config()
if not mm_config.enable_mm_embeds:
raise ValueError(
"You must set `--enable-mm-embeds` to input `image_embeds`"
)
if isinstance(image_embeds, dict): if isinstance(image_embeds, dict):
embeds = { embeds = {
k: self._connector.fetch_image_embedding(v) k: self._connector.fetch_image_embedding(v)
...@@ -930,6 +940,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): ...@@ -930,6 +940,10 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
allowed_media_domains=tracker.allowed_media_domains, allowed_media_domains=tracker.allowed_media_domains,
) )
# Read-only accessor: exposes the model configuration held by this parser's
# tracker (`self._tracker`) directly on the parser.
@property
def model_config(self) -> ModelConfig:
return self._tracker.model_config
def parse_image( def parse_image(
self, image_url: Optional[str], uuid: Optional[str] = None self, image_url: Optional[str], uuid: Optional[str] = None
) -> None: ) -> None:
...@@ -945,6 +959,12 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): ...@@ -945,6 +959,12 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
image_embeds: Union[str, dict[str, str], None], image_embeds: Union[str, dict[str, str], None],
uuid: Optional[str] = None, uuid: Optional[str] = None,
) -> None: ) -> None:
mm_config = self.model_config.get_multimodal_config()
if not mm_config.enable_mm_embeds:
raise ValueError(
"You must set `--enable-mm-embeds` to input `image_embeds`"
)
future: asyncio.Future[Union[str, dict[str, str], None]] = ( future: asyncio.Future[Union[str, dict[str, str], None]] = (
asyncio.Future() asyncio.Future()
) )
......
...@@ -135,14 +135,17 @@ class BaseRenderer(ABC): ...@@ -135,14 +135,17 @@ class BaseRenderer(ABC):
""" """
raise NotImplementedError raise NotImplementedError
@classmethod
def load_prompt_embeds( def load_prompt_embeds(
cls, self,
prompt_embeds: Union[bytes, list[bytes]], prompt_embeds: Union[bytes, list[bytes]],
truncate_prompt_tokens: Optional[Annotated[int, Field(ge=0)]] = None, truncate_prompt_tokens: Optional[Annotated[int, Field(ge=0)]] = None,
cache_salt: Optional[str] = None, cache_salt: Optional[str] = None,
) -> list[EngineEmbedsPrompt]: ) -> list[EngineEmbedsPrompt]:
"""Load and validate base64-encoded embeddings into prompt objects.""" """Load and validate base64-encoded embeddings into prompt objects."""
if not self.model_config.enable_prompt_embeds:
raise ValueError(
"You must set `--enable-prompt-embeds` to input `prompt_embeds`."
)
def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt: def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt:
tensor = torch.load( tensor = torch.load(
......
{
"triton_version": "3.1.0",
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 2,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"48": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 32,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 5,
"num_ldmatrixes": 1
},
"1536": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 3,
"num_ldmatrixes": 1
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
}
}
{
"triton_version": "3.1.0",
"1": {
"BLOCK_SIZE_M": 16,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"2": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 32,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 64,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"8": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"16": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"24": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"32": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"48": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"64": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"96": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"128": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"256": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3,
"num_ldmatrixes": 1
},
"512": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"1024": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"1536": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 3,
"num_ldmatrixes": 1
},
"2048": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
},
"3072": {
"BLOCK_SIZE_M": 64,
"BLOCK_SIZE_N": 256,
"BLOCK_SIZE_K": 64,
"GROUP_SIZE_M": 1,
"num_warps": 4,
"num_stages": 2,
"num_ldmatrixes": 1
},
"4096": {
"BLOCK_SIZE_M": 32,
"BLOCK_SIZE_N": 128,
"BLOCK_SIZE_K": 128,
"GROUP_SIZE_M": 1,
"num_warps": 8,
"num_stages": 2,
"num_ldmatrixes": 1
}
}
...@@ -1363,14 +1363,14 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor, ...@@ -1363,14 +1363,14 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,
token_expert_indices: torch.Tensor, token_expert_indices: torch.Tensor,
gating_output: torch.Tensor, gating_output: torch.Tensor,
renormalize: bool) -> tuple[torch.Tensor, ...]: renormalize: bool) -> tuple[torch.Tensor, ...]:
if envs.VLLM_USE_TOPK_RENORM: if envs.VLLM_USE_TOPK_RENORM and renormalize is True:
from lightop import op as op from lightop import op as op
op.topk_softmax( op.topk_softmax(
topk_weights, topk_weights,
topk_indices, topk_indices,
token_expert_indices, token_expert_indices,
gating_output, gating_output,
True, renormalize,
) )
else: else:
ops.topk_softmax( ops.topk_softmax(
......
...@@ -1296,6 +1296,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]): ...@@ -1296,6 +1296,16 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
[`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data]. [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
""" """
mm_items = self.data_parser.parse_mm_data(mm_data) mm_items = self.data_parser.parse_mm_data(mm_data)
mm_config = self.info.ctx.model_config.get_multimodal_config()
if not mm_config.enable_mm_embeds:
for modality, items in mm_items.items():
if isinstance(items, (EmbeddingItems, DictEmbeddingItems)):
raise ValueError(
f"You must set `--enable-mm-embeds` to input "
f"`{modality}_embeds`"
)
for modality, items in mm_items.items(): for modality, items in mm_items.items():
self.validate_num_items(modality, len(items)) self.validate_num_items(modality, len(items))
......
...@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless ...@@ -16,12 +16,6 @@ from vllm.utils import cuda_device_count_stateless
from .interface import DeviceCapability, Platform, PlatformEnum, _Backend from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
from vllm.utils import SUPPORT_MOE_MARLIN_W16A16
if SUPPORT_MOE_MARLIN_W16A16:
os.environ['VLLM_USE_MARLIN_W16A16_MOE'] = '1'
os.environ['MOE_NN'] = '0'
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig from vllm.config import ModelConfig, VllmConfig
......
...@@ -86,10 +86,6 @@ if TYPE_CHECKING: ...@@ -86,10 +86,6 @@ if TYPE_CHECKING:
logger = init_logger(__name__) logger = init_logger(__name__)
GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
SUPPORT_MOE_MARLIN_W16A16 = any(arch in GPU_ARCH for arch in ["gfx936"])
# This value is chosen to have a balance between ITL and TTFT. Note it is # This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput. # not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment