Unverified Commit 1bc3b5e7 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[VLM] Separate text-only and vision variants of the same model architecture (#13157)

parent 02ed8a1f
...@@ -699,10 +699,10 @@ See [this page](#generative-models) for more information on how to use generativ ...@@ -699,10 +699,10 @@ See [this page](#generative-models) for more information on how to use generativ
* *
* ✅︎ * ✅︎
* ✅︎ * ✅︎
- * `DeepseekVLV2ForCausalLM` - * `DeepseekVLV2ForCausalLM`<sup>^</sup>
* DeepSeek-VL2 * DeepSeek-VL2
* T + I<sup>+</sup> * T + I<sup>+</sup>
* `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.
* *
* ✅︎ * ✅︎
* ✅︎ * ✅︎
...@@ -713,10 +713,10 @@ See [this page](#generative-models) for more information on how to use generativ ...@@ -713,10 +713,10 @@ See [this page](#generative-models) for more information on how to use generativ
* *
* ✅︎ * ✅︎
* ✅︎ * ✅︎
- * `ChatGLMModel` - * `GLM4VForCausalLM`<sup>^</sup>
* GLM-4V * GLM-4V
* T + I * T + I
* `THUDM/glm-4v-9b` etc. * `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc.
* ✅︎ * ✅︎
* ✅︎ * ✅︎
* ✅︎ * ✅︎
...@@ -825,7 +825,7 @@ See [this page](#generative-models) for more information on how to use generativ ...@@ -825,7 +825,7 @@ See [this page](#generative-models) for more information on how to use generativ
* *
* ✅︎ * ✅︎
* ✅︎ * ✅︎
- * `QWenLMHeadModel` - * `QwenVLForConditionalGeneration`<sup>^</sup>
* Qwen-VL * Qwen-VL
* T + I<sup>E+</sup> * T + I<sup>E+</sup>
* `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
...@@ -862,13 +862,12 @@ See [this page](#generative-models) for more information on how to use generativ ...@@ -862,13 +862,12 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎ * ✅︎
::: :::
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
<sup>E</sup> Pre-computed embeddings can be inputted for this modality. <sup>E</sup> Pre-computed embeddings can be inputted for this modality.
<sup>+</sup> Multiple items can be inputted per text prompt for this modality. <sup>+</sup> Multiple items can be inputted per text prompt for this modality.
:::{note}
To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
:::
:::{note} :::{note}
H2O-VL series models will be available in V1 once we support backends other than FlashAttention. H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
::: :::
......
...@@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str): ...@@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str):
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True, enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
{question}<|assistant|>" {question}<|assistant|>"
...@@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str): ...@@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
max_num_seqs=2, max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
......
...@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]): ...@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
) )
def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-2b" model_name = "h2oai/h2ovl-mississippi-2b"
llm = LLM( llm = LLM(
...@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str, ...@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str,
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
max_num_seqs=2, max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
placeholders = "".join(f"Picture {i}: <img></img>\n" placeholders = "".join(f"Picture {i}: <img></img>\n"
...@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
model_example_map = { model_example_map = {
"aria": load_aria, "aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2, "deepseek_vl_v2": load_deepseek_vl2,
"h2ovl_chat": load_h2onvl, "h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3, "idefics3": load_idefics3,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"mllama": load_mllama, "mllama": load_mllama,
......
...@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node ...@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
all workers in a node other than the head node, which can cause the test all workers in a node other than the head node, which can cause the test
to fail. to fail.
""" """
import json
import os import os
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Literal, NamedTuple, Optional from typing import List, Literal, NamedTuple, Optional
...@@ -15,6 +16,7 @@ import pytest ...@@ -15,6 +16,7 @@ import pytest
from vllm.config import TaskOption from vllm.config import TaskOption
from vllm.logger import init_logger from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_pipeline_parallel") logger = init_logger("test_pipeline_parallel")
...@@ -31,10 +33,7 @@ class ParallelSetup(NamedTuple): ...@@ -31,10 +33,7 @@ class ParallelSetup(NamedTuple):
class PPTestOptions(NamedTuple): class PPTestOptions(NamedTuple):
multi_node_only: bool multi_node_only: bool
trust_remote_code: bool
tokenizer_mode: Optional[str]
load_format: Optional[str] = None load_format: Optional[str] = None
hf_overrides: Optional[str] = None
@dataclass @dataclass
...@@ -64,10 +63,7 @@ class PPTestSettings: ...@@ -64,10 +63,7 @@ class PPTestSettings:
pp_base: int = 2, pp_base: int = 2,
multi_node_only: bool = False, multi_node_only: bool = False,
task: TaskOption = "auto", task: TaskOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None, load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
): ):
return PPTestSettings( return PPTestSettings(
parallel_setups=[ parallel_setups=[
...@@ -97,10 +93,7 @@ class PPTestSettings: ...@@ -97,10 +93,7 @@ class PPTestSettings:
vllm_major_versions=["0", "0", "1"], vllm_major_versions=["0", "0", "1"],
task=task, task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only, test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code, load_format=load_format),
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
) )
@staticmethod @staticmethod
...@@ -110,10 +103,7 @@ class PPTestSettings: ...@@ -110,10 +103,7 @@ class PPTestSettings:
pp_base: int = 2, pp_base: int = 2,
task: TaskOption = "auto", task: TaskOption = "auto",
multi_node_only: bool = False, multi_node_only: bool = False,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None, load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
): ):
return PPTestSettings( return PPTestSettings(
parallel_setups=[ parallel_setups=[
...@@ -126,19 +116,16 @@ class PPTestSettings: ...@@ -126,19 +116,16 @@ class PPTestSettings:
vllm_major_versions=["0"], vllm_major_versions=["0"],
task=task, task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only, test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code, load_format=load_format),
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
) )
def iter_params(self, model_name: str): def iter_params(self, model_id: str):
opts = self.test_options opts = self.test_options
for parallel_setup in self.parallel_setups: for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(self.distributed_backends, for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions): self.vllm_major_versions):
yield (model_name, parallel_setup, backend, vllm_major_version, yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts) self.task, opts)
...@@ -150,16 +137,16 @@ TEXT_GENERATION_MODELS = { ...@@ -150,16 +137,16 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only] # [Decoder-only]
# Uses Llama # Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(), # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501 "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True), "baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(), "bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True), "THUDM/chatglm3-6b": PPTestSettings.fast(),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501 "CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8), "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True), "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(), "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(), "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(), "tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-2b": PPTestSettings.fast(), "google/gemma-2b": PPTestSettings.fast(),
...@@ -172,36 +159,36 @@ TEXT_GENERATION_MODELS = { ...@@ -172,36 +159,36 @@ TEXT_GENERATION_MODELS = {
"ibm/PowerMoE-3b": PPTestSettings.fast(), "ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama # Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(), # "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True), "internlm/internlm2-chat-7b": PPTestSettings.fast(),
"inceptionai/jais-13b-chat": PPTestSettings.fast(), "inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(), "ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True), "openbmb/MiniCPM3-4B": PPTestSettings.fast(),
# Uses Llama # Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(), # "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(), "state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"mosaicml/mpt-7b": PPTestSettings.fast(), "mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(),
"shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(), "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
"adept/persimmon-8b-chat": PPTestSettings.fast(), "adept/persimmon-8b-chat": PPTestSettings.fast(),
"microsoft/phi-2": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2), "upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
# FIXME: Cannot load tokenizer in latest transformers version. # FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf` # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True), # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only] # [Encoder-only]
# TODO: Implement PP # TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(), # "facebook/bart-base": PPTestSettings.fast(),
...@@ -211,7 +198,7 @@ EMBEDDING_MODELS = { # type: ignore[var-annotated] ...@@ -211,7 +198,7 @@ EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only] # [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(), "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(), "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501 "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
} }
MULTIMODAL_MODELS = { MULTIMODAL_MODELS = {
...@@ -219,20 +206,20 @@ MULTIMODAL_MODELS = { ...@@ -219,20 +206,20 @@ MULTIMODAL_MODELS = {
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(), "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(), "facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(), "adept/fuyu-8b": PPTestSettings.fast(),
"THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True), "THUDM/glm-4v-9b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True), "OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(), "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(), "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(), "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(), "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True), "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True), "allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(),
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501 "mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
# [Encoder-decoder] # [Encoder-decoder]
# TODO: Implement PP # TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
...@@ -258,7 +245,7 @@ TEST_MODELS = [ ...@@ -258,7 +245,7 @@ TEST_MODELS = [
def _compare_tp( def _compare_tp(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
...@@ -267,6 +254,7 @@ def _compare_tp( ...@@ -267,6 +254,7 @@ def _compare_tp(
num_gpus_available: int, num_gpus_available: int,
*, *,
method: Literal["generate", "encode"], method: Literal["generate", "encode"],
is_multimodal: bool,
): ):
( (
tp_size, tp_size,
...@@ -274,13 +262,32 @@ def _compare_tp( ...@@ -274,13 +262,32 @@ def _compare_tp(
eager_mode, eager_mode,
chunked_prefill, chunked_prefill,
) = parallel_setup ) = parallel_setup
(
multi_node_only, multi_node_only, load_format = test_options
trust_remote_code,
tokenizer_mode, model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
load_format, model_info.check_transformers_version(on_fail="skip")
hf_overrides,
) = test_options trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_layers": 1,
"num_hidden_layers": 1,
"num_experts": 2,
"num_experts_per_tok": 2,
"num_local_experts": 2,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size: if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
...@@ -312,7 +319,7 @@ def _compare_tp( ...@@ -312,7 +319,7 @@ def _compare_tp(
if load_format: if load_format:
common_args.extend(["--load-format", load_format]) common_args.extend(["--load-format", load_format])
if hf_overrides: if hf_overrides:
common_args.extend(["--hf-overrides", hf_overrides]) common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
if distributed_backend == "ray" and (vllm_major_version == "1" if distributed_backend == "ray" and (vllm_major_version == "1"
...@@ -355,11 +362,7 @@ def _compare_tp( ...@@ -355,11 +362,7 @@ def _compare_tp(
] ]
try: try:
compare_two_settings(model_name, compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
pp_args,
tp_args,
pp_env,
method=method)
except Exception: except Exception:
if pp_env is None: if pp_env is None:
raise raise
...@@ -369,17 +372,16 @@ def _compare_tp( ...@@ -369,17 +372,16 @@ def _compare_tp(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"vllm_major_version", "task", "test_options"), "task", "test_options"),
[ [
params for model_name, settings in TEXT_GENERATION_MODELS.items() params for model_id, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_id) if model_id in TEST_MODELS
if model_name in TEST_MODELS
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_tp_language_generation( def test_tp_language_generation(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
...@@ -387,28 +389,28 @@ def test_tp_language_generation( ...@@ -387,28 +389,28 @@ def test_tp_language_generation(
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_id,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version, vllm_major_version,
task, task,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate") method="generate",
is_multimodal=False)
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"vllm_major_version", "task", "test_options"), "task", "test_options"),
[ [
params for model_name, settings in EMBEDDING_MODELS.items() params for model_id, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_id) if model_id in TEST_MODELS
if model_name in TEST_MODELS
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_tp_language_embedding( def test_tp_language_embedding(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
...@@ -416,28 +418,28 @@ def test_tp_language_embedding( ...@@ -416,28 +418,28 @@ def test_tp_language_embedding(
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_id,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version, vllm_major_version,
task, task,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="encode") method="encode",
is_multimodal=False)
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"vllm_major_version", "task", "test_options"), "task", "test_options"),
[ [
params for model_name, settings in MULTIMODAL_MODELS.items() params for model_id, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_id) if model_id in TEST_MODELS
if model_name in TEST_MODELS
], ],
) )
@fork_new_process_for_each_test @fork_new_process_for_each_test
def test_tp_multimodal_generation( def test_tp_multimodal_generation(
model_name: str, model_id: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
vllm_major_version: str, vllm_major_version: str,
...@@ -445,11 +447,12 @@ def test_tp_multimodal_generation( ...@@ -445,11 +447,12 @@ def test_tp_multimodal_generation(
test_options: PPTestOptions, test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_id,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
vllm_major_version, vllm_major_version,
task, task,
test_options, test_options,
num_gpus_available, num_gpus_available,
method="generate") method="generate",
is_multimodal=True)
...@@ -155,10 +155,7 @@ VLM_TEST_SETTINGS = { ...@@ -155,10 +155,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skipif( marks=[pytest.mark.core_model, pytest.mark.cpu_model],
TRANSFORMERS_VERSION < "4.49.0",
reason="HF model requires transformers>=4.49.0",
), pytest.mark.core_model, pytest.mark.cpu_model],
), ),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( "aria": VLMTestInfo(
...@@ -215,7 +212,6 @@ VLM_TEST_SETTINGS = { ...@@ -215,7 +212,6 @@ VLM_TEST_SETTINGS = {
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501 "cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}), }),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501 multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"), postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
...@@ -240,7 +236,7 @@ VLM_TEST_SETTINGS = { ...@@ -240,7 +236,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10, num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
), ),
"glm4": VLMTestInfo( "glm4v": VLMTestInfo(
models=["THUDM/glm-4v-9b"], models=["THUDM/glm-4v-9b"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=identity, prompt_formatter=identity,
...@@ -351,7 +347,6 @@ VLM_TEST_SETTINGS = { ...@@ -351,7 +347,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs=model_utils.cast_dtype_post_processor( postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values" "pixel_values"
), ),
vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501
get_stop_token_ids=lambda tok: [128009], get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
...@@ -437,7 +432,7 @@ VLM_TEST_SETTINGS = { ...@@ -437,7 +432,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq, auto_cls=AutoModelForVision2Seq,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"qwen": VLMTestInfo( "qwen_vl": VLMTestInfo(
models=["Qwen/Qwen-VL"], models=["Qwen/Qwen-VL"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=identity, prompt_formatter=identity,
......
...@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union ...@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase from transformers import BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import HfRunner, VllmRunner from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import RunnerOutput from .types import RunnerOutput
...@@ -31,10 +33,8 @@ def run_test( ...@@ -31,10 +33,8 @@ def run_test(
use_tokenizer_eos: bool, use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None], comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
List[int]]],
stop_str: Optional[List[str]], stop_str: Optional[List[str]],
tokenizer_mode: str,
limit_mm_per_prompt: Dict[str, int], limit_mm_per_prompt: Dict[str, int],
vllm_runner_kwargs: Optional[Dict[str, Any]], vllm_runner_kwargs: Optional[Dict[str, Any]],
hf_model_kwargs: Optional[Dict[str, Any]], hf_model_kwargs: Optional[Dict[str, Any]],
...@@ -48,7 +48,10 @@ def run_test( ...@@ -48,7 +48,10 @@ def run_test(
"""Modality agnostic test test executor for comparing HF/vLLM outputs.""" """Modality agnostic test test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors # In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
vllm_outputs_per_mm = [] vllm_outputs_per_mm = []
hf_outputs_per_mm = [] hf_outputs_per_mm = []
...@@ -57,17 +60,19 @@ def run_test( ...@@ -57,17 +60,19 @@ def run_test(
# vLLM needs a fresh new process without cuda initialization. # vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it # if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method). # will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
if vllm_runner_kwargs is None: vllm_runner_kwargs_: Dict[str, Any] = {}
vllm_runner_kwargs = {} if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model, with vllm_runner(model,
tokenizer_mode=tokenizer_mode,
max_model_len=max_model_len, max_model_len=max_model_len,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
dtype=dtype, dtype=dtype,
...@@ -76,7 +81,15 @@ def run_test( ...@@ -76,7 +81,15 @@ def run_test(
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
task=task, task=task,
**vllm_runner_kwargs) as vllm_model: **vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
for prompts, media in vllm_inputs: for prompts, media in vllm_inputs:
vllm_kwargs[runner_mm_key] = media vllm_kwargs[runner_mm_key] = media
vllm_output = vllm_model.generate_greedy_logprobs( vllm_output = vllm_model.generate_greedy_logprobs(
...@@ -93,16 +106,19 @@ def run_test( ...@@ -93,16 +106,19 @@ def run_test(
if patch_hf_runner is not None: if patch_hf_runner is not None:
hf_model = patch_hf_runner(hf_model) hf_model = patch_hf_runner(hf_model)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
with hf_model, torch.no_grad(): with hf_model, torch.no_grad():
tokenizer = hf_model.tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
for prompts, media in inputs: for prompts, media in inputs:
hf_kwargs[runner_mm_key] = media hf_kwargs[runner_mm_key] = media
hf_output = hf_model.generate_greedy_logprobs_limit( hf_output = hf_model.generate_greedy_logprobs_limit(
......
...@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional, ...@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
import torch import torch
from PIL.Image import Image from PIL.Image import Image
from pytest import MarkDecorator from pytest import MarkDecorator
from transformers import (AutoModelForCausalLM, BatchEncoding, from transformers import AutoModelForCausalLM, BatchEncoding
PreTrainedTokenizerBase)
from transformers.models.auto.auto_factory import _BaseAutoModelClass from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import identity from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
...@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple): ...@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs: Optional[Dict[str, Any]] = None vllm_runner_kwargs: Optional[Dict[str, Any]] = None
# Optional callable which gets a list of token IDs from the model tokenizer # Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
List[int]]] = None
# Optional list of strings to stop generation, useful when stop tokens are # Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer # not special tokens in the tokenizer
stop_str: Optional[List[str]] = None stop_str: Optional[List[str]] = None
...@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple): ...@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
marks: Optional[List[MarkDecorator]] = None marks: Optional[List[MarkDecorator]] = None
tokenizer_mode: str = "auto"
def get_non_parametrized_runner_kwargs(self): def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used """Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized in all test types, which are NOT used when creating the parametrized
...@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple): ...@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
"hf_model_kwargs": self.hf_model_kwargs, "hf_model_kwargs": self.hf_model_kwargs,
"stop_str": self.stop_str, "stop_str": self.stop_str,
"patch_hf_runner": self.patch_hf_runner, "patch_hf_runner": self.patch_hf_runner,
"tokenizer_mode": self.tokenizer_mode
} }
......
...@@ -104,7 +104,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -104,7 +104,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True), trust_remote_code=True),
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"), "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"),
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"), "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"),
# ChatGLMModel supports multimodal "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
trust_remote_code=True),
"CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01",
trust_remote_code=True), trust_remote_code=True),
"Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501 "Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501
...@@ -138,7 +139,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -138,7 +139,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct", "InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct",
trust_remote_code=True), trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False), is_available_online=False),
...@@ -167,7 +169,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = { ...@@ -167,7 +169,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True), trust_remote_code=True),
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
trust_remote_code=True), trust_remote_code=True),
# QWenLMHeadModel supports multimodal "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
trust_remote_code=True),
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"),
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
...@@ -232,18 +235,19 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -232,18 +235,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"), "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
extras={"text_only": "THUDM/chatglm3-6b"},
trust_remote_code=True),
"ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
is_available_online=False),
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
trust_remote_code=True), trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501 "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501 extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501
"LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501
...@@ -253,21 +257,24 @@ _MULTIMODAL_EXAMPLE_MODELS = { ...@@ -253,21 +257,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
trust_remote_code=True), trust_remote_code=True),
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6", "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501
trust_remote_code=True), trust_remote_code=True),
"MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
extras={"olmo": "allenai/Molmo-7B-O-0924"}, # noqa: E501 extras={"olmo": "allenai/Molmo-7B-O-0924"}, # noqa: E501
trust_remote_code=True), trust_remote_code=True),
"NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B", "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
trust_remote_code=True), trust_remote_code=True),
"PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501 "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501
extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True), trust_remote_code=True),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
tokenizer_mode="mistral"), tokenizer_mode="mistral"),
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat", "QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501 extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501
trust_remote_code=True), trust_remote_code=True,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
......
...@@ -18,8 +18,7 @@ def test_can_initialize(model_arch): ...@@ -18,8 +18,7 @@ def test_can_initialize(model_arch):
# Avoid OOM # Avoid OOM
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
if hf_config.model_type == "deepseek_vl_v2": hf_config.update(model_info.hf_overrides)
hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]})
if hasattr(hf_config, "text_config"): if hasattr(hf_config, "text_config"):
text_config: PretrainedConfig = hf_config.text_config text_config: PretrainedConfig = hf_config.text_config
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Adapted from # Adapted from
# https://github.com/THUDM/CogAgent # https://github.com/THUDM/ChatGLM2-6B
"""Inference-only CogAgent model compatible with THUDM weights.""" """Inference-only ChatGLM model compatible with THUDM weights."""
from argparse import Namespace from typing import Iterable, List, Optional, Set, Tuple, Union
from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
Union)
import torch import torch
from torch import nn from torch import nn
from torch.nn import LayerNorm from torch.nn import LayerNorm
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from transformers import PreTrainedTokenizer, TensorType
from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
from vllm.attention import Attention, AttentionMetadata from vllm.attention import Attention, AttentionMetadata
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
...@@ -31,204 +23,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler ...@@ -31,204 +23,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding) ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, BatchFeature,
MultiModalFieldConfig,
PromptReplacement)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import ChatGLMConfig from vllm.transformers_utils.configs import ChatGLMConfig
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers, make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix, merge_multimodal_embeddings) maybe_prefix)
class GLMImagePixelInputs(TypedDict):
pixel_values: torch.Tensor
"""Shape: `(batch_size, num_channels, height, width)`"""
class GLM4VProcessor:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
"""
def __init__(
self,
config: ChatGLMConfig,
tokenizer: PreTrainedTokenizer,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
if vision_config := getattr(config, "vision_config", None):
image_size = vision_config["image_size"]
self.image_transform = transforms.Compose([
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
),
])
else:
self.image_transform = None
def __call__(
self,
text: Optional[Union[TextInput, list[TextInput]]] = None,
images: Optional[Union[ImageInput, list[ImageInput]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
if text is None:
text = []
if not isinstance(text, list):
text = [text]
if images is None:
images = []
if not isinstance(images, list):
images = [images]
text_inputs = self.tokenizer(text)
if len(images) == 0:
image_inputs = {}
else:
if self.image_transform is None:
raise ValueError("This model does not support image inputs")
pixel_values = [self.image_transform(image) for image in images]
image_inputs = {"pixel_values": torch.stack(pixel_values)}
return BatchFeature(
{
**text_inputs,
**image_inputs,
},
tensor_type=return_tensors,
)
class GLM4VProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self):
tokenizer = self.ctx.tokenizer
assert isinstance(tokenizer, PreTrainedTokenizer)
return tokenizer
def get_hf_config(self):
return self.ctx.get_hf_config(ChatGLMConfig)
def get_hf_processor(self) -> GLM4VProcessor:
return GLM4VProcessor(
self.get_hf_config(),
self.get_tokenizer(),
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.get_num_image_feature_tokens()}
def get_num_image_tokens(self) -> int:
hf_config = self.get_hf_config()
if not (vision_config := getattr(hf_config, "vision_config", None)):
return 0
image_size = vision_config["image_size"]
patch_size = vision_config["patch_size"]
grid_length = image_size // patch_size // 2
return grid_length * grid_length
def get_num_image_feature_tokens(self) -> int:
# EVA2CLIPModel has embeddings for boi and eoi tokens as well
return self.get_num_image_tokens() + 2
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
def get_dummy_processor_inputs(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
hf_config = self.info.get_hf_config()
if not (vision_config := getattr(hf_config, "vision_config", None)):
return ProcessorInputs(prompt_text="", mm_data={})
target_width = target_height = vision_config["image_size"]
num_images = mm_counts.get("image", 0)
mm_data = {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"
return ProcessorInputs(
prompt_text=base_text * num_images,
mm_data=mm_data,
)
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(pixel_values=MultiModalFieldConfig.batched("image"))
def _get_prompt_replacements(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
hf_config = self.info.get_hf_config()
if not hasattr(hf_config, "vision_config"):
return []
boi_token_id = hf_config.boi_token_id
image_token_id = hf_config.pad_token_id
eoi_token_id = hf_config.eoi_token_id
def get_replacement(item_idx: int):
num_image_tokens = self.info.get_num_image_tokens()
image_tokens = [image_token_id] * num_image_tokens
return [boi_token_id] + image_tokens + [eoi_token_id]
return [
PromptReplacement(
modality="image",
target=[boi_token_id, image_token_id, eoi_token_id],
replacement=get_replacement,
),
]
class GLMAttention(nn.Module): class GLMAttention(nn.Module):
...@@ -489,7 +291,7 @@ class GLMTransformer(nn.Module): ...@@ -489,7 +291,7 @@ class GLMTransformer(nn.Module):
position_ids: torch.Tensor, position_ids: torch.Tensor,
kv_caches: List[torch.Tensor], kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata, attn_metadata: AttentionMetadata,
) -> torch.Tensor: ) -> Union[torch.Tensor, IntermediateTensors]:
for i in range(self.start_layer, self.end_layer): for i in range(self.start_layer, self.end_layer):
layer = self.layers[i] layer = self.layers[i]
hidden_states = layer( hidden_states = layer(
...@@ -498,8 +300,12 @@ class GLMTransformer(nn.Module): ...@@ -498,8 +300,12 @@ class GLMTransformer(nn.Module):
kv_cache=kv_caches[i - self.start_layer], kv_cache=kv_caches[i - self.start_layer],
attn_metadata=attn_metadata, attn_metadata=attn_metadata,
) )
if not get_pp_group().is_last_rank:
return IntermediateTensors({"hidden_states": hidden_states})
# Final layer norm. # Final layer norm.
if get_pp_group().is_last_rank and self.post_layer_norm: if self.post_layer_norm:
hidden_states = self.final_layernorm(hidden_states) hidden_states = self.final_layernorm(hidden_states)
return hidden_states return hidden_states
...@@ -534,61 +340,11 @@ class ChatGLMModel(nn.Module): ...@@ -534,61 +340,11 @@ class ChatGLMModel(nn.Module):
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.output_layer") prefix=f"{prefix}.output_layer")
vision_config_flag = getattr(config, 'vision_config', None)
if vision_config_flag is not None:
self.vision_config = Namespace(**config.vision_config)
self.vision = EVA2CLIPModel(self.config,
quant_config,
prefix=f"{prefix}.vision")
else:
self.vision = None
self.make_empty_intermediate_tensors = ( self.make_empty_intermediate_tensors = (
self.encoder.make_empty_intermediate_tensors) self.encoder.make_empty_intermediate_tensors)
def _parse_and_validate_image_input( def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
self, **kwargs: object) -> GLMImagePixelInputs: return self.embedding(input_ids)
pixel_values = kwargs.pop("pixel_values", None)
if pixel_values is not None and self.vision is not None:
if isinstance(pixel_values, torch.Tensor):
if pixel_values.ndim > 2:
pixel_values = torch.concat(list(pixel_values))
elif isinstance(pixel_values, list):
return torch.concat(pixel_values)
else:
raise TypeError("""pixel_values must be a torch.Tensor
or a list of torch.Tensor
""")
return GLMImagePixelInputs(pixel_values=pixel_values)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input["pixel_values"] is None:
return None
pixel_values = image_input["pixel_values"].to(
dtype=self.config.torch_dtype)
vision_embeddings = self.vision(pixel_values)
return vision_embeddings
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
inputs_embeds = self.embedding(input_ids)
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=[
self.config.boi_token_id,
self.config.pad_token_id,
self.config.eoi_token_id,
],
)
return inputs_embeds
def forward( def forward(
self, self,
...@@ -599,26 +355,24 @@ class ChatGLMModel(nn.Module): ...@@ -599,26 +355,24 @@ class ChatGLMModel(nn.Module):
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object, **kwargs: object,
) -> torch.Tensor: ) -> Union[torch.Tensor, IntermediateTensors]:
if get_pp_group().is_first_rank:
if inputs_embeds is not None:
hidden_states = inputs_embeds
else:
hidden_states = self.get_input_embeddings(input_ids)
else:
assert intermediate_tensors is not None
hidden_states = intermediate_tensors["hidden_states"]
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
if intermediate_tensors is not None:
inputs_embeds = intermediate_tensors["hidden_states"]
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
# Run encoder. # Run encoder.
hidden_states = self.encoder( hidden_states = self.encoder(
hidden_states=inputs_embeds, hidden_states=hidden_states,
position_ids=positions, position_ids=positions,
kv_caches=kv_caches, kv_caches=kv_caches,
attn_metadata=attn_metadata, attn_metadata=attn_metadata,
) )
if not get_pp_group().is_last_rank:
return IntermediateTensors({"hidden_states": hidden_states})
return hidden_states return hidden_states
def load_weights(self, weights: Iterable[Tuple[str, def load_weights(self, weights: Iterable[Tuple[str,
...@@ -660,12 +414,18 @@ class ChatGLMModel(nn.Module): ...@@ -660,12 +414,18 @@ class ChatGLMModel(nn.Module):
return loaded_params return loaded_params
class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): class ChatGLMBaseModel(nn.Module):
hf_to_vllm_mapper = WeightsMapper( hf_to_vllm_mapper = WeightsMapper(
orig_to_new_substr={".word_embeddings": ""}, ) orig_to_new_substr={".word_embeddings": ""}, )
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
transformer_type: type[ChatGLMModel] = ChatGLMModel,
) -> None:
super().__init__() super().__init__()
config = vllm_config.model_config.hf_config config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config quant_config = vllm_config.quant_config
...@@ -678,27 +438,17 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): ...@@ -678,27 +438,17 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
self.quant_config = quant_config self.quant_config = quant_config
self.max_position_embeddings = getattr(config, "max_sequence_length", self.max_position_embeddings = getattr(config, "max_sequence_length",
8192) 8192)
self.transformer = ChatGLMModel(vllm_config=vllm_config, self.transformer = transformer_type(vllm_config=vllm_config,
prefix=maybe_prefix( prefix=maybe_prefix(
prefix, "transformer")) prefix, "transformer"))
if self.config.tie_word_embeddings: if self.config.tie_word_embeddings:
self.transformer.output_layer.weight = ( self.transformer.output_layer.weight = (
self.transformer.embedding.weight) self.transformer.embedding.weight)
self.lm_head = self.transformer.output_layer self.lm_head = self.transformer.output_layer
self.logits_processor = LogitsProcessor(config.padded_vocab_size) self.logits_processor = LogitsProcessor(config.padded_vocab_size)
self.sampler = get_sampler() self.sampler = get_sampler()
self.make_empty_intermediate_tensors = (
def forward(self, self.transformer.make_empty_intermediate_tensors)
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
**kwargs) -> torch.Tensor:
hidden_states = self.transformer(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
**kwargs)
return hidden_states
def compute_logits( def compute_logits(
self, self,
...@@ -722,7 +472,7 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): ...@@ -722,7 +472,7 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
class ChatGLM(ChatGLMBaseModel): class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
packed_modules_mapping = { packed_modules_mapping = {
"query_key_value": ["query_key_value"], "query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"] "dense_h_to_4h": ["dense_h_to_4h"]
...@@ -738,82 +488,28 @@ class ChatGLM(ChatGLMBaseModel): ...@@ -738,82 +488,28 @@ class ChatGLM(ChatGLMBaseModel):
embedding_modules = {} embedding_modules = {}
embedding_padding_modules = [] embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
if hasattr(config, "vision_config"):
hf_overrides = {"architectures": ["GLM4VForCausalLM"]}
raise RuntimeError(
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
f"`--hf-overrides {hf_overrides!r}`")
class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): super().__init__(vllm_config=vllm_config, prefix=prefix)
packed_modules_mapping = {
"query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"],
"merged_proj": ["gate_proj", "dense_h_to_4h"]
}
# LoRA specific attributes
supported_lora_modules = [
"query_key_value",
"dense",
"dense_h_to_4h",
"dense_4h_to_h",
# vision
"fc1",
"fc2",
"merged_proj",
"linear_proj"
]
embedding_modules = {}
embedding_padding_modules = []
def get_mm_mapping(self) -> MultiModelKeys:
"""
Get the module prefix in multimodal models
"""
return MultiModelKeys.from_string_field(
language_model="transformer.encoder",
connector="transformer.vision.linear_proj",
tower_model="transformer.vision.transformer")
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
return self.transformer.get_multimodal_embeddings(**kwargs)
def get_input_embeddings( def forward(
self, self,
input_ids: torch.Tensor, input_ids: torch.Tensor,
multimodal_embeddings: Optional[NestedTensors] = None, positions: torch.Tensor,
) -> torch.Tensor: kv_caches: List[torch.Tensor],
return self.transformer.get_input_embeddings(input_ids, attn_metadata: AttentionMetadata,
multimodal_embeddings) intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor, hidden_states = self.transformer(input_ids, positions, kv_caches,
info=GLM4VProcessingInfo, attn_metadata, intermediate_tensors,
dummy_inputs=GLM4VDummyInputsBuilder) inputs_embeds)
class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, return hidden_states
SupportsMultiModal):
# Ensure that the LoRA support check passes when the class is not
# initialized, but set all these attributes to empty.
# These will be updated when an instance class is selected
packed_modules_mapping = {}
supported_lora_modules = []
embedding_modules = {}
embedding_padding_modules = []
def __new__(
cls,
vllm_config: VllmConfig,
prefix: str = "",
) -> None:
config = vllm_config.model_config.hf_config
# Initialize VL
if hasattr(config, "vision_config"): # noqa: SIM108
instance_cls = ChatGLMV
# Initialize LLM
else:
instance_cls = ChatGLM
# quant_config references base class members,
# so update values before init is called
cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
cls.supported_lora_modules += instance_cls.supported_lora_modules
cls.embedding_modules.update(instance_cls.embedding_modules)
cls.embedding_padding_modules += instance_cls.embedding_padding_modules
return instance_cls(vllm_config=vllm_config, prefix=prefix)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# Adapted from # Adapted from
# https://github.com/THUDM/GLM-4 # https://github.com/THUDM/CogAgent
"""Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" """Inference-only CogAgent model compatible with THUDM weights."""
from argparse import Namespace from argparse import Namespace
from typing import Optional from typing import List, Literal, Mapping, Optional, TypedDict, Union
import torch import torch
from torch import nn from torch import nn
from torch.nn import LayerNorm from torch.nn import LayerNorm
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from transformers import PreTrainedTokenizer, TensorType
from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
from vllm.attention import AttentionMetadata
from vllm.attention.layer import MultiHeadAttention from vllm.attention.layer import MultiHeadAttention
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...@@ -18,11 +25,31 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, ...@@ -18,11 +25,31 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear, QKVParallelLinear,
ReplicatedLinear, ReplicatedLinear,
RowParallelLinear) RowParallelLinear)
from vllm.model_executor.layers.quantization.base_config import ( from vllm.model_executor.layers.quantization import QuantizationConfig
QuantizationConfig) from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, BatchFeature,
MultiModalFieldConfig,
PromptReplacement)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import ChatGLMConfig
from .chatglm import ChatGLMBaseModel, ChatGLMModel
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .utils import flatten_bn, merge_multimodal_embeddings
class PatchEmbedding(nn.Module):
class GLMVImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: `(batch_size, num_channels, height, width)`"""
class EVA2CLIPPatchEmbedding(nn.Module):
def __init__(self, config): def __init__(self, config):
super().__init__() super().__init__()
...@@ -54,7 +81,7 @@ class PatchEmbedding(nn.Module): ...@@ -54,7 +81,7 @@ class PatchEmbedding(nn.Module):
return x return x
class Attention(nn.Module): class EVA2CLIPAttention(nn.Module):
def __init__( def __init__(
self, self,
...@@ -97,7 +124,7 @@ class Attention(nn.Module): ...@@ -97,7 +124,7 @@ class Attention(nn.Module):
return output return output
class MLP(nn.Module): class EVA2CLIPMLP(nn.Module):
def __init__( def __init__(
self, self,
...@@ -128,7 +155,7 @@ class MLP(nn.Module): ...@@ -128,7 +155,7 @@ class MLP(nn.Module):
return x return x
class TransformerLayer(nn.Module): class EVA2CLIPTransformerLayer(nn.Module):
def __init__( def __init__(
self, self,
...@@ -139,12 +166,12 @@ class TransformerLayer(nn.Module): ...@@ -139,12 +166,12 @@ class TransformerLayer(nn.Module):
super().__init__() super().__init__()
self.input_layernorm = LayerNorm(config.hidden_size, self.input_layernorm = LayerNorm(config.hidden_size,
eps=config.layer_norm_eps) eps=config.layer_norm_eps)
self.attention = Attention(config, self.attention = EVA2CLIPAttention(config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.attention") prefix=f"{prefix}.attention")
self.mlp = MLP(config, self.mlp = EVA2CLIPMLP(config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.mlp") prefix=f"{prefix}.mlp")
self.post_attention_layernorm = LayerNorm(config.hidden_size, self.post_attention_layernorm = LayerNorm(config.hidden_size,
eps=config.layer_norm_eps) eps=config.layer_norm_eps)
...@@ -159,7 +186,7 @@ class TransformerLayer(nn.Module): ...@@ -159,7 +186,7 @@ class TransformerLayer(nn.Module):
return output return output
class Transformer(nn.Module): class EVA2CLIPTransformer(nn.Module):
def __init__( def __init__(
self, self,
...@@ -169,9 +196,9 @@ class Transformer(nn.Module): ...@@ -169,9 +196,9 @@ class Transformer(nn.Module):
): ):
super().__init__() super().__init__()
self.layers = nn.ModuleList([ self.layers = nn.ModuleList([
TransformerLayer(config, EVA2CLIPTransformerLayer(config,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.layers.{layer_idx}") prefix=f"{prefix}.layers.{layer_idx}")
for layer_idx in range(config.num_hidden_layers) for layer_idx in range(config.num_hidden_layers)
]) ])
...@@ -181,7 +208,7 @@ class Transformer(nn.Module): ...@@ -181,7 +208,7 @@ class Transformer(nn.Module):
return hidden_states return hidden_states
class GLU(nn.Module): class EVA2CLIPGLU(nn.Module):
def __init__( def __init__(
self, self,
...@@ -268,14 +295,14 @@ class EVA2CLIPModel(nn.Module): ...@@ -268,14 +295,14 @@ class EVA2CLIPModel(nn.Module):
): ):
super().__init__() super().__init__()
vision_config = Namespace(**config.vision_config) vision_config = Namespace(**config.vision_config)
self.patch_embedding = PatchEmbedding(vision_config) self.patch_embedding = EVA2CLIPPatchEmbedding(vision_config)
self.transformer = Transformer(vision_config, self.transformer = EVA2CLIPTransformer(vision_config,
quant_config=quant_config,
prefix=f"{prefix}.transformer")
self.linear_proj = EVA2CLIPGLU(config,
in_features=config.hidden_size,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.transformer") prefix=f"{prefix}.linear_proj")
self.linear_proj = GLU(config,
in_features=config.hidden_size,
quant_config=quant_config,
prefix=f"{prefix}.linear_proj")
self.conv = nn.Conv2d(in_channels=vision_config.hidden_size, self.conv = nn.Conv2d(in_channels=vision_config.hidden_size,
out_channels=config.hidden_size, out_channels=config.hidden_size,
kernel_size=2, kernel_size=2,
...@@ -310,3 +337,326 @@ class EVA2CLIPModel(nn.Module): ...@@ -310,3 +337,326 @@ class EVA2CLIPModel(nn.Module):
x = torch.cat((boi, x, eoi), dim=1) x = torch.cat((boi, x, eoi), dim=1)
x = x / self.scaling_factor x = x / self.scaling_factor
return x return x
class GLM4VModel(ChatGLMModel):
    """`ChatGLMModel` with an `EVA2CLIPModel` attached as ``self.vision``."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the base transformer, then add the vision sub-module.

        Args:
            vllm_config: Engine configuration; its ``quant_config`` is
                forwarded to the vision module.
            prefix: Module-name prefix; the vision module is created under
                ``f"{prefix}.vision"``.
        """
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        # NOTE(review): `self.config` is presumably set by the parent
        # constructor -- confirm against ChatGLMModel.
        self.vision = EVA2CLIPModel(
            self.config,
            vllm_config.quant_config,
            prefix=f"{prefix}.vision",
        )
class GLM4VProcessor:
    """
    Stand-in for an HF processor.

    This model doesn't define its own HF processor, so this class tokenizes
    the text and converts the images into a stacked ``pixel_values`` tensor.
    """

    def __init__(
        self,
        config: ChatGLMConfig,
        tokenizer: PreTrainedTokenizer,
    ) -> None:
        """Store the config/tokenizer and build the image transform.

        Args:
            config: Model config; ``config.vision_config["image_size"]``
                gives the side length images are resized to.
            tokenizer: Tokenizer applied to the text inputs.
        """
        super().__init__()

        self.config = config
        self.tokenizer = tokenizer

        side = config.vision_config["image_size"]
        resize = transforms.Resize(
            (side, side),
            interpolation=InterpolationMode.BICUBIC,
        )
        normalize = transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        )
        # Resize to a square, convert to a tensor, then normalize.
        self.image_transform = transforms.Compose(
            [resize, transforms.ToTensor(), normalize])

    def __call__(
        self,
        text: Optional[Union[TextInput, list[TextInput]]] = None,
        images: Optional[Union[ImageInput, list[ImageInput]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """Tokenize ``text`` and preprocess ``images`` into one batch.

        Args:
            text: A single prompt or a list of prompts; ``None`` is treated
                as an empty list.
            images: A single image or a list of images; ``None`` is treated
                as an empty list.
            return_tensors: Passed to ``BatchFeature`` as ``tensor_type``.

        Returns:
            A ``BatchFeature`` holding the tokenizer outputs and, when at
            least one image was given, a ``pixel_values`` entry.
        """
        # Normalize both inputs to lists.
        if text is None:
            text = []
        elif not isinstance(text, list):
            text = [text]

        if images is None:
            images = []
        elif not isinstance(images, list):
            images = [images]

        features = {**self.tokenizer(text)}

        # Only add `pixel_values` when there is something to stack.
        if images:
            transformed = [self.image_transform(img) for img in images]
            features["pixel_values"] = torch.stack(transformed)

        return BatchFeature(features, tensor_type=return_tensors)
class GLM4VProcessingInfo(BaseProcessingInfo):
    """Processing metadata for GLM-4V: tokenizer, config and token counts."""

    def get_tokenizer(self):
        """Return the context tokenizer, asserting its expected type."""
        tok = self.ctx.tokenizer
        assert isinstance(tok, PreTrainedTokenizer)
        return tok

    def get_hf_config(self):
        """Return the HF config, requested as a ``ChatGLMConfig``."""
        return self.ctx.get_hf_config(ChatGLMConfig)

    def get_hf_processor(self) -> GLM4VProcessor:
        """Build a ``GLM4VProcessor`` from the config and tokenizer."""
        hf_config = self.get_hf_config()
        tokenizer = self.get_tokenizer()
        return GLM4VProcessor(hf_config, tokenizer)

    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        """Report that one image per prompt is supported."""
        return {"image": 1}

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int]:
        """Return the per-image token budget (neither argument is used)."""
        max_image_tokens = self.get_num_image_feature_tokens()
        return {"image": max_image_tokens}

    def get_num_image_tokens(self) -> int:
        """Return the number of placeholder tokens for one image.

        Computed as ``(image_size // patch_size // 2) ** 2`` from the
        vision config.
        """
        vision_config = self.get_hf_config().vision_config

        # NOTE(review): the extra halving presumably mirrors a 2x
        # downsampling in the vision model -- confirm.
        patches_per_side = vision_config["image_size"] // vision_config[
            "patch_size"]
        grid_length = patches_per_side // 2
        return grid_length * grid_length

    def get_num_image_feature_tokens(self) -> int:
        """Return the image token count plus the boi/eoi embeddings."""
        # EVA2CLIPModel has embeddings for boi and eoi tokens as well
        return 2 + self.get_num_image_tokens()
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
    """Builds dummy prompt text and images for GLM-4V."""

    def get_dummy_processor_inputs(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        """Create dummy inputs with ``mm_counts["image"]`` images.

        Args:
            seq_len: Not used by this implementation.
            mm_counts: Item count per modality; a missing ``"image"`` key
                counts as zero.

        Returns:
            ``ProcessorInputs`` whose prompt repeats the image placeholder
            once per image and whose ``mm_data`` holds square dummy images
            of side ``vision_config["image_size"]``.
        """
        image_size = self.info.get_hf_config().vision_config["image_size"]
        num_images = mm_counts.get("image", 0)

        dummy_images = self._get_dummy_images(
            width=image_size,
            height=image_size,
            num_images=num_images,
        )

        placeholder = "<|begin_of_image|><|endoftext|><|end_of_image|>"
        return ProcessorInputs(
            prompt_text=placeholder * num_images,
            mm_data={"image": dummy_images},
        )
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
    """Multimodal processor describing GLM-4V's image fields and prompts."""

    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> Mapping[str, MultiModalFieldConfig]:
        """Declare ``pixel_values`` as a batched field of ``"image"``."""
        return {"pixel_values": MultiModalFieldConfig.batched("image")}

    def _get_prompt_replacements(
        self,
        mm_items: MultiModalDataItems,
        hf_processor_mm_kwargs: Mapping[str, object],
        out_mm_kwargs: MultiModalKwargs,
    ) -> list[PromptReplacement]:
        """Expand each ``[boi, pad, eoi]`` triple to the full image span.

        The replacement is ``boi``, then ``get_num_image_tokens()`` copies
        of the pad token id, then ``eoi``.
        """
        hf_config = self.info.get_hf_config()
        boi = hf_config.boi_token_id
        # The pad token id doubles as the image placeholder token.
        pad = hf_config.pad_token_id
        eoi = hf_config.eoi_token_id

        def get_replacement(item_idx: int):
            count = self.info.get_num_image_tokens()
            return [boi, *([pad] * count), eoi]

        image_replacement = PromptReplacement(
            modality="image",
            target=[boi, pad, eoi],
            replacement=get_replacement,
        )
        return [image_replacement]
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
                                        info=GLM4VProcessingInfo,
                                        dummy_inputs=GLM4VDummyInputsBuilder)
class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
                       SupportsMultiModal):
    """GLM-4V causal LM: a ChatGLM base model with image inputs."""

    packed_modules_mapping = {
        "query_key_value": ["query_key_value"],
        "dense_h_to_4h": ["dense_h_to_4h"],
        "merged_proj": ["gate_proj", "dense_h_to_4h"]
    }
    # LoRA specific attributes
    supported_lora_modules = [
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
        # vision
        "fc1",
        "fc2",
        "merged_proj",
        "linear_proj"
    ]

    embedding_modules = {}
    embedding_padding_modules = []

    def get_mm_mapping(self) -> MultiModelKeys:
        """
        Get the module prefix in multimodal models
        """
        return MultiModelKeys.from_string_field(
            language_model="transformer.encoder",
            connector="transformer.vision.linear_proj",
            tower_model="transformer.vision.transformer",
        )

    def __init__(
        self,
        *,
        vllm_config: VllmConfig,
        prefix: str = "",
        transformer_type: type[GLM4VModel] = GLM4VModel,
    ) -> None:
        """Forward all arguments to the base model.

        Args:
            vllm_config: Engine configuration passed to the base class.
            prefix: Module-name prefix passed to the base class.
            transformer_type: Transformer class passed to the base class;
                defaults to ``GLM4VModel``.
        """
        super().__init__(
            vllm_config=vllm_config,
            prefix=prefix,
            transformer_type=transformer_type,
        )

        # Annotation only: narrows the attribute type for type checkers.
        self.transformer: GLM4VModel

    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        """Return ``data`` unchanged if its trailing dims are ``(3, S, S)``.

        ``S`` is ``self.config.vision_config["image_size"]``.

        Raises:
            ValueError: If the dims after the first do not match.
        """
        side = self.config.vision_config["image_size"]
        expected_dims = (3, side, side)

        if tuple(data.shape[1:]) == expected_dims:
            return data

        expected_expr = ("batch_size", *(str(dim) for dim in expected_dims))
        raise ValueError(
            f"The expected shape of pixel values is {expected_expr}. "
            f"You supplied {tuple(data.shape)}.")

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[GLMVImagePixelInputs]:
        """Read ``pixel_values`` from ``kwargs`` and validate it.

        Returns:
            ``None`` when ``pixel_values`` is absent or ``None``; otherwise
            a ``GLMVImagePixelInputs`` holding the flattened, validated
            tensor.

        Raises:
            ValueError: If ``pixel_values`` is not a ``torch.Tensor`` or
                has an unexpected shape.
        """
        pixel_values = kwargs.pop("pixel_values", None)
        if pixel_values is None:
            return None

        if not isinstance(pixel_values, torch.Tensor):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        flattened = flatten_bn(pixel_values, concat=True)
        return GLMVImagePixelInputs(
            type="pixel_values",
            data=self._validate_pixel_values(flattened),
        )

    def _process_image_input(
            self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
        """Cast the pixels to the configured dtype and run the vision model."""
        pixels = image_input["data"].to(dtype=self.config.torch_dtype)
        return self.transformer.vision(pixels)

    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
        """Return the vision embeddings, or ``None`` without image input."""
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None
        return self._process_image_input(image_input)

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[NestedTensors] = None,
    ) -> torch.Tensor:
        """Embed ``input_ids`` and merge in any multimodal embeddings.

        The merge targets positions holding the boi, pad or eoi token ids.
        """
        text_embeds = self.transformer.get_input_embeddings(input_ids)
        if multimodal_embeddings is None:
            return text_embeds

        placeholder_ids = [
            self.config.boi_token_id,
            self.config.pad_token_id,
            self.config.eoi_token_id,
        ]
        return merge_multimodal_embeddings(
            input_ids=input_ids,
            inputs_embeds=text_embeds,
            multimodal_embeddings=multimodal_embeddings,
            placeholder_token_id=placeholder_ids,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        **kwargs: object,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the transformer, computing input embeddings when needed.

        If ``intermediate_tensors`` is given, ``inputs_embeds`` is dropped.
        Otherwise, if ``inputs_embeds`` is missing, it is built from
        ``input_ids`` and any image inputs in ``kwargs``, and ``input_ids``
        is then cleared.
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

        # NOTE: In v1, inputs_embeds is always generated at model runner, this
        # condition is for v0 compatibility.
        elif inputs_embeds is None:
            mm_embeds = self.get_multimodal_embeddings(**kwargs)
            inputs_embeds = self.get_input_embeddings(input_ids, mm_embeds)
            input_ids = None

        return self.transformer(
            input_ids,
            positions,
            kv_caches,
            attn_metadata,
            intermediate_tensors,
            inputs_embeds,
        )
This diff is collapsed.
This diff is collapsed.
...@@ -39,7 +39,7 @@ _TEXT_GENERATION_MODELS = { ...@@ -39,7 +39,7 @@ _TEXT_GENERATION_MODELS = {
"BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),
"BambaForCausalLM": ("bamba", "BambaForCausalLM"), "BambaForCausalLM": ("bamba", "BambaForCausalLM"),
"BloomForCausalLM": ("bloom", "BloomForCausalLM"), "BloomForCausalLM": ("bloom", "BloomForCausalLM"),
# ChatGLMModel supports multimodal "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
"CohereForCausalLM": ("commandr", "CohereForCausalLM"), "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
"Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"), "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
"DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"), "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
...@@ -90,7 +90,7 @@ _TEXT_GENERATION_MODELS = { ...@@ -90,7 +90,7 @@ _TEXT_GENERATION_MODELS = {
"Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
"Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
"PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
# QWenLMHeadModel supports multimodal "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
"Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
"RWForCausalLM": ("falcon", "FalconForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"),
...@@ -156,10 +156,9 @@ _MULTIMODAL_MODELS = { ...@@ -156,10 +156,9 @@ _MULTIMODAL_MODELS = {
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"), "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501
"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
"ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"), "DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"),
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
...@@ -175,7 +174,7 @@ _MULTIMODAL_MODELS = { ...@@ -175,7 +174,7 @@ _MULTIMODAL_MODELS = {
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501 "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501 "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment