"docs/contributing/README.md" did not exist on "206e2577fa9c73f0d8019210e36905ba970fe036"
Unverified Commit 1bc3b5e7 authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[VLM] Separate text-only and vision variants of the same model architecture (#13157)

parent 02ed8a1f
......@@ -699,10 +699,10 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `DeepseekVLV2ForCausalLM`
- * `DeepseekVLV2ForCausalLM`<sup>^</sup>
* DeepSeek-VL2
* T + I<sup>+</sup>
* `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note)
* `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.
*
* ✅︎
* ✅︎
......@@ -713,10 +713,10 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `ChatGLMModel`
- * `GLM4VForCausalLM`<sup>^</sup>
* GLM-4V
* T + I
* `THUDM/glm-4v-9b` etc.
* `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc.
* ✅︎
* ✅︎
* ✅︎
......@@ -825,7 +825,7 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `QWenLMHeadModel`
- * `QwenVLForConditionalGeneration`<sup>^</sup>
* Qwen-VL
* T + I<sup>E+</sup>
* `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
......@@ -862,13 +862,12 @@ See [this page](#generative-models) for more information on how to use generativ
* ✅︎
:::
<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.
&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`
<sup>E</sup> Pre-computed embeddings can be inputted for this modality.
<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
:::{note}
To use DeepSeek-VL2 series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM.
:::
:::{note}
H2O-VL series models will be available in V1 once we support backends other than FlashAttention.
:::
......
......@@ -105,7 +105,9 @@ def run_glm4v(question: str, modality: str):
max_num_seqs=2,
trust_remote_code=True,
enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
{question}<|assistant|>"
......@@ -495,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)
......
......@@ -77,7 +77,7 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
)
def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-2b"
llm = LLM(
......@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str,
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "".join(f"Picture {i}: <img></img>\n"
......@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
model_example_map = {
"aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2,
"h2ovl_chat": load_h2onvl,
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"mllama": load_mllama,
......
......@@ -6,6 +6,7 @@ WARNING: This test runs in both single-node (4 GPUs) and multi-node
all workers in a node other than the head node, which can cause the test
to fail.
"""
import json
import os
from dataclasses import dataclass
from typing import List, Literal, NamedTuple, Optional
......@@ -15,6 +16,7 @@ import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, fork_new_process_for_each_test
logger = init_logger("test_pipeline_parallel")
......@@ -31,10 +33,7 @@ class ParallelSetup(NamedTuple):
class PPTestOptions(NamedTuple):
multi_node_only: bool
trust_remote_code: bool
tokenizer_mode: Optional[str]
load_format: Optional[str] = None
hf_overrides: Optional[str] = None
@dataclass
......@@ -64,10 +63,7 @@ class PPTestSettings:
pp_base: int = 2,
multi_node_only: bool = False,
task: TaskOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
......@@ -97,10 +93,7 @@ class PPTestSettings:
vllm_major_versions=["0", "0", "1"],
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
load_format=load_format),
)
@staticmethod
......@@ -110,10 +103,7 @@ class PPTestSettings:
pp_base: int = 2,
task: TaskOption = "auto",
multi_node_only: bool = False,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
load_format: Optional[str] = None,
hf_overrides: Optional[str] = None,
):
return PPTestSettings(
parallel_setups=[
......@@ -126,19 +116,16 @@ class PPTestSettings:
vllm_major_versions=["0"],
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
load_format=load_format,
hf_overrides=hf_overrides),
load_format=load_format),
)
def iter_params(self, model_name: str):
def iter_params(self, model_id: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions):
yield (model_name, parallel_setup, backend, vllm_major_version,
yield (model_id, parallel_setup, backend, vllm_major_version,
self.task, opts)
......@@ -150,16 +137,16 @@ TEXT_GENERATION_MODELS = {
# [Decoder-only]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(trust_remote_code=True),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"baichuan-inc/Baichuan-7B": PPTestSettings.fast(),
"baichuan-inc/Baichuan2-13B-Chat": PPTestSettings.fast(),
"bigscience/bloomz-1b1": PPTestSettings.fast(),
"THUDM/chatglm3-6b": PPTestSettings.fast(trust_remote_code=True),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(tp_base=2, trust_remote_code=True), # noqa: E501
"databricks/dbrx-instruct": PPTestSettings.fast(tp_base=8),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(trust_remote_code=True),
"THUDM/chatglm3-6b": PPTestSettings.fast(),
"CohereForAI/c4ai-command-r-v01": PPTestSettings.fast(load_format="dummy"),
"databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
"Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
"deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
"tiiuae/falcon-7b": PPTestSettings.fast(),
"google/gemma-2b": PPTestSettings.fast(),
......@@ -172,36 +159,36 @@ TEXT_GENERATION_MODELS = {
"ibm/PowerMoE-3b": PPTestSettings.fast(),
# Uses Llama
# "internlm/internlm-chat-7b": PPTestSettings.fast(),
"internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True),
"internlm/internlm2-chat-7b": PPTestSettings.fast(),
"inceptionai/jais-13b-chat": PPTestSettings.fast(),
"ai21labs/Jamba-tiny-dev": PPTestSettings.fast(),
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(),
"openbmb/MiniCPM3-4B": PPTestSettings.fast(),
# Uses Llama
# "mistralai/Mistral-7B-Instruct-v0.1": PPTestSettings.fast(),
"state-spaces/mamba-130m-hf": PPTestSettings.fast(),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4),
"mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(load_format="dummy"), # noqa: E501
"mosaicml/mpt-7b": PPTestSettings.fast(),
"nvidia/Minitron-8B-Base": PPTestSettings.fast(),
"allenai/OLMo-1B-hf": PPTestSettings.fast(),
"shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(),
"allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(),
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(),
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(multi_node_only=True, load_format="dummy"), # noqa: E501
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(),
"stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(),
"bigcode/starcoder2-3b": PPTestSettings.fast(),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2),
"upstage/solar-pro-preview-instruct": PPTestSettings.fast(load_format="dummy"), # noqa: E501
# FIXME: Cannot load tokenizer in latest transformers version.
# Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf`
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
# "xverse/XVERSE-7B-Chat": PPTestSettings.fast(),
# [Encoder-only]
# TODO: Implement PP
# "facebook/bart-base": PPTestSettings.fast(),
......@@ -211,7 +198,7 @@ EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
"intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501
"Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
}
MULTIMODAL_MODELS = {
......@@ -219,20 +206,20 @@ MULTIMODAL_MODELS = {
"Salesforce/blip2-opt-2.7b": PPTestSettings.fast(),
"facebook/chameleon-7b": PPTestSettings.fast(),
"adept/fuyu-8b": PPTestSettings.fast(),
"THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True),
"THUDM/glm-4v-9b": PPTestSettings.fast(),
"OpenGVLab/InternVL2-1B": PPTestSettings.fast(),
"llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(),
"llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(),
"llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(),
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True),
"openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(),
"allenai/Molmo-7B-D-0924": PPTestSettings.fast(),
"microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(),
"mistralai/Pixtral-12B-2409": PPTestSettings.fast(load_format="dummy"),
"Qwen/Qwen-VL-Chat": PPTestSettings.fast(),
"Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(),
"Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(),
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"fixie-ai/ultravox-v0_5-llama-3_2-1b": PPTestSettings.fast(),
# [Encoder-decoder]
# TODO: Implement PP
# "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(),
......@@ -258,7 +245,7 @@ TEST_MODELS = [
def _compare_tp(
model_name: str,
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
......@@ -267,6 +254,7 @@ def _compare_tp(
num_gpus_available: int,
*,
method: Literal["generate", "encode"],
is_multimodal: bool,
):
(
tp_size,
......@@ -274,13 +262,32 @@ def _compare_tp(
eager_mode,
chunked_prefill,
) = parallel_setup
(
multi_node_only,
trust_remote_code,
tokenizer_mode,
load_format,
hf_overrides,
) = test_options
multi_node_only, load_format = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_layers": 1,
"num_hidden_layers": 1,
"num_experts": 2,
"num_experts_per_tok": 2,
"num_local_experts": 2,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
......@@ -312,7 +319,7 @@ def _compare_tp(
if load_format:
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", hf_overrides])
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
if distributed_backend == "ray" and (vllm_major_version == "1"
......@@ -355,11 +362,7 @@ def _compare_tp(
]
try:
compare_two_settings(model_name,
pp_args,
tp_args,
pp_env,
method=method)
compare_two_settings(model_id, pp_args, tp_args, pp_env, method=method)
except Exception:
if pp_env is None:
raise
......@@ -369,17 +372,16 @@ def _compare_tp(
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"vllm_major_version", "task", "test_options"),
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"),
[
params for model_name, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
params for model_id, settings in TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_language_generation(
model_name: str,
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
......@@ -387,28 +389,28 @@ def test_tp_language_generation(
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
_compare_tp(model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
task,
test_options,
num_gpus_available,
method="generate")
method="generate",
is_multimodal=False)
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"vllm_major_version", "task", "test_options"),
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"),
[
params for model_name, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
params for model_id, settings in EMBEDDING_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_language_embedding(
model_name: str,
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
......@@ -416,28 +418,28 @@ def test_tp_language_embedding(
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
_compare_tp(model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
task,
test_options,
num_gpus_available,
method="encode")
method="encode",
is_multimodal=False)
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"vllm_major_version", "task", "test_options"),
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"task", "test_options"),
[
params for model_name, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_name)
if model_name in TEST_MODELS
params for model_id, settings in MULTIMODAL_MODELS.items()
for params in settings.iter_params(model_id) if model_id in TEST_MODELS
],
)
@fork_new_process_for_each_test
def test_tp_multimodal_generation(
model_name: str,
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
......@@ -445,11 +447,12 @@ def test_tp_multimodal_generation(
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
_compare_tp(model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
task,
test_options,
num_gpus_available,
method="generate")
method="generate",
is_multimodal=True)
......@@ -155,10 +155,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[pytest.mark.skipif(
TRANSFORMERS_VERSION < "4.49.0",
reason="HF model requires transformers>=4.49.0",
), pytest.mark.core_model, pytest.mark.cpu_model],
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Extended model tests
"aria": VLMTestInfo(
......@@ -215,7 +212,6 @@ VLM_TEST_SETTINGS = {
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
}),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
vllm_runner_kwargs={"hf_overrides": {"architectures": ["DeepseekVLV2ForCausalLM"]}}, # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
postprocess_inputs=model_utils.cast_dtype_post_processor("images"),
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
......@@ -240,7 +236,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"glm4": VLMTestInfo(
"glm4v": VLMTestInfo(
models=["THUDM/glm-4v-9b"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
......@@ -351,7 +347,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs=model_utils.cast_dtype_post_processor(
"pixel_values"
),
vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501
get_stop_token_ids=lambda tok: [128009],
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
......@@ -437,7 +432,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq,
marks=[large_gpu_mark(min_gb=48)],
),
"qwen": VLMTestInfo(
"qwen_vl": VLMTestInfo(
models=["Qwen/Qwen-VL"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=identity,
......
......@@ -4,12 +4,14 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union
import torch
from PIL.Image import Image
from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase
from transformers import BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import HfRunner, VllmRunner
from ....registry import HF_EXAMPLE_MODELS
from .types import RunnerOutput
......@@ -31,10 +33,8 @@ def run_test(
use_tokenizer_eos: bool,
postprocess_inputs: Callable[[BatchEncoding], BatchEncoding],
comparator: Callable[..., None],
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
List[int]]],
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
stop_str: Optional[List[str]],
tokenizer_mode: str,
limit_mm_per_prompt: Dict[str, int],
vllm_runner_kwargs: Optional[Dict[str, Any]],
hf_model_kwargs: Optional[Dict[str, Any]],
......@@ -48,7 +48,10 @@ def run_test(
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
# In the case of embeddings, vLLM takes separate input tensors
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
vllm_outputs_per_mm = []
hf_outputs_per_mm = []
......@@ -57,17 +60,19 @@ def run_test(
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
if vllm_runner_kwargs is None:
vllm_runner_kwargs = {}
vllm_runner_kwargs_: Dict[str, Any] = {}
if model_info.tokenizer:
vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
if model_info.tokenizer_mode:
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model,
tokenizer_mode=tokenizer_mode,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
......@@ -76,7 +81,15 @@ def run_test(
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
task=task,
**vllm_runner_kwargs) as vllm_model:
**vllm_runner_kwargs_) as vllm_model:
tokenizer = vllm_model.model.get_tokenizer()
vllm_kwargs: Dict[str, Any] = {}
if get_stop_token_ids is not None:
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
if stop_str:
vllm_kwargs["stop"] = stop_str
for prompts, media in vllm_inputs:
vllm_kwargs[runner_mm_key] = media
vllm_output = vllm_model.generate_greedy_logprobs(
......@@ -93,16 +106,19 @@ def run_test(
if patch_hf_runner is not None:
hf_model = patch_hf_runner(hf_model)
# Some models need to explicitly pass the eos_token_id off the tokenizer or
# processor for a good comparison; currently assume processor/tokenizer
# agree on the EOS, and pull it off the tokenizer if requested.
with hf_model, torch.no_grad():
tokenizer = hf_model.tokenizer
# Some models need to explicitly pass the eos_token_id off the tokenizer
# or processor for a good comparison;
# currently assume processor/tokenizer agree on the EOS, and pull it off
# the tokenizer if requested.
hf_kwargs = {}
if use_tokenizer_eos:
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
if stop_str:
hf_kwargs["stop_strings"] = stop_str
with hf_model, torch.no_grad():
for prompts, media in inputs:
hf_kwargs[runner_mm_key] = media
hf_output = hf_model.generate_greedy_logprobs_limit(
......
......@@ -8,12 +8,12 @@ from typing import (Any, Callable, Dict, Iterable, List, NamedTuple, Optional,
import torch
from PIL.Image import Image
from pytest import MarkDecorator
from transformers import (AutoModelForCausalLM, BatchEncoding,
PreTrainedTokenizerBase)
from transformers import AutoModelForCausalLM, BatchEncoding
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm.config import TaskOption
from vllm.sequence import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.utils import identity
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
......@@ -100,8 +100,7 @@ class VLMTestInfo(NamedTuple):
vllm_runner_kwargs: Optional[Dict[str, Any]] = None
# Optional callable which gets a list of token IDs from the model tokenizer
get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase],
List[int]]] = None
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
# Optional list of strings to stop generation, useful when stop tokens are
# not special tokens in the tokenizer
stop_str: Optional[List[str]] = None
......@@ -156,8 +155,6 @@ class VLMTestInfo(NamedTuple):
marks: Optional[List[MarkDecorator]] = None
tokenizer_mode: str = "auto"
def get_non_parametrized_runner_kwargs(self):
"""Returns a dictionary of expandable kwargs for items that are used
in all test types, which are NOT used when creating the parametrized
......@@ -180,7 +177,6 @@ class VLMTestInfo(NamedTuple):
"hf_model_kwargs": self.hf_model_kwargs,
"stop_str": self.stop_str,
"patch_hf_runner": self.patch_hf_runner,
"tokenizer_mode": self.tokenizer_mode
}
......
......@@ -104,7 +104,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B"),
"BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"),
# ChatGLMModel supports multimodal
"ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
trust_remote_code=True),
"CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01",
trust_remote_code=True),
"Cohere2ForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r7b-12-2024", # noqa: E501
......@@ -138,7 +139,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"InternLM3ForCausalLM": _HfExamplesInfo("internlm/internlm3-8b-instruct",
trust_remote_code=True),
"JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"),
"JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
extras={"tiny": "ai21labs/Jamba-tiny-dev"}), # noqa: E501
"LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"),
"LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf",
is_available_online=False),
......@@ -167,7 +169,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
trust_remote_code=True),
# QWenLMHeadModel supports multimodal
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
trust_remote_code=True),
"Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"),
"Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
"RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b",
......@@ -232,18 +235,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501
"ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501
"ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b",
extras={"text_only": "THUDM/chatglm3-6b"},
trust_remote_code=True),
"ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b",
is_available_online=False),
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"),
"GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
"H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
extras={"2b": "h2oai/h2ovl-mississippi-2b"}), # noqa: E501
"InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
extras={"2B": "OpenGVLab/InternVL2-2B"}, # noqa: E501
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501
"LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501
......@@ -253,21 +257,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501
"MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
trust_remote_code=True),
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6",
"MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
extras={"2.6": "openbmb/MiniCPM-V-2_6"}, # noqa: E501
trust_remote_code=True),
"MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
extras={"olmo": "allenai/Molmo-7B-O-0924"}, # noqa: E501
trust_remote_code=True),
"NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B",
trust_remote_code=True),
"PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501
"PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501
extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
tokenizer_mode="mistral"),
"QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat",
extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501
trust_remote_code=True),
"QwenVLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen-VL",
extras={"chat": "Qwen/Qwen-VL-Chat"}, # noqa: E501
trust_remote_code=True,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}), # noqa: E501
"Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
......
......@@ -18,8 +18,7 @@ def test_can_initialize(model_arch):
# Avoid OOM
def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
if hf_config.model_type == "deepseek_vl_v2":
hf_config.update({"architectures": ["DeepseekVLV2ForCausalLM"]})
hf_config.update(model_info.hf_overrides)
if hasattr(hf_config, "text_config"):
text_config: PretrainedConfig = hf_config.text_config
......
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/THUDM/CogAgent
"""Inference-only CogAgent model compatible with THUDM weights."""
from argparse import Namespace
from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict,
Union)
# https://github.com/THUDM/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights."""
from typing import Iterable, List, Optional, Set, Tuple, Union
import torch
from torch import nn
from torch.nn import LayerNorm
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from transformers import PreTrainedTokenizer, TensorType
from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
from vllm.attention import Attention, AttentionMetadata
from vllm.config import CacheConfig, VllmConfig
......@@ -31,204 +23,14 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, BatchFeature,
MultiModalFieldConfig,
PromptReplacement)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import ChatGLMConfig
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .interfaces import SupportsLoRA, SupportsPP
from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter,
make_empty_intermediate_tensors_factory, make_layers,
maybe_prefix, merge_multimodal_embeddings)
class GLMImagePixelInputs(TypedDict):
pixel_values: torch.Tensor
"""Shape: `(batch_size, num_channels, height, width)`"""
class GLM4VProcessor:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
"""
def __init__(
self,
config: ChatGLMConfig,
tokenizer: PreTrainedTokenizer,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
if vision_config := getattr(config, "vision_config", None):
image_size = vision_config["image_size"]
self.image_transform = transforms.Compose([
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
),
])
else:
self.image_transform = None
def __call__(
self,
text: Optional[Union[TextInput, list[TextInput]]] = None,
images: Optional[Union[ImageInput, list[ImageInput]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
if text is None:
text = []
if not isinstance(text, list):
text = [text]
if images is None:
images = []
if not isinstance(images, list):
images = [images]
text_inputs = self.tokenizer(text)
if len(images) == 0:
image_inputs = {}
else:
if self.image_transform is None:
raise ValueError("This model does not support image inputs")
pixel_values = [self.image_transform(image) for image in images]
image_inputs = {"pixel_values": torch.stack(pixel_values)}
return BatchFeature(
{
**text_inputs,
**image_inputs,
},
tensor_type=return_tensors,
)
class GLM4VProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self):
tokenizer = self.ctx.tokenizer
assert isinstance(tokenizer, PreTrainedTokenizer)
return tokenizer
def get_hf_config(self):
return self.ctx.get_hf_config(ChatGLMConfig)
def get_hf_processor(self) -> GLM4VProcessor:
return GLM4VProcessor(
self.get_hf_config(),
self.get_tokenizer(),
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.get_num_image_feature_tokens()}
def get_num_image_tokens(self) -> int:
hf_config = self.get_hf_config()
if not (vision_config := getattr(hf_config, "vision_config", None)):
return 0
image_size = vision_config["image_size"]
patch_size = vision_config["patch_size"]
grid_length = image_size // patch_size // 2
return grid_length * grid_length
def get_num_image_feature_tokens(self) -> int:
# EVA2CLIPModel has embeddings for boi and eoi tokens as well
return self.get_num_image_tokens() + 2
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
def get_dummy_processor_inputs(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
hf_config = self.info.get_hf_config()
if not (vision_config := getattr(hf_config, "vision_config", None)):
return ProcessorInputs(prompt_text="", mm_data={})
target_width = target_height = vision_config["image_size"]
num_images = mm_counts.get("image", 0)
mm_data = {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"
return ProcessorInputs(
prompt_text=base_text * num_images,
mm_data=mm_data,
)
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(pixel_values=MultiModalFieldConfig.batched("image"))
def _get_prompt_replacements(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
hf_config = self.info.get_hf_config()
if not hasattr(hf_config, "vision_config"):
return []
boi_token_id = hf_config.boi_token_id
image_token_id = hf_config.pad_token_id
eoi_token_id = hf_config.eoi_token_id
def get_replacement(item_idx: int):
num_image_tokens = self.info.get_num_image_tokens()
image_tokens = [image_token_id] * num_image_tokens
return [boi_token_id] + image_tokens + [eoi_token_id]
return [
PromptReplacement(
modality="image",
target=[boi_token_id, image_token_id, eoi_token_id],
replacement=get_replacement,
),
]
maybe_prefix)
class GLMAttention(nn.Module):
......@@ -489,7 +291,7 @@ class GLMTransformer(nn.Module):
position_ids: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
) -> Union[torch.Tensor, IntermediateTensors]:
for i in range(self.start_layer, self.end_layer):
layer = self.layers[i]
hidden_states = layer(
......@@ -498,8 +300,12 @@ class GLMTransformer(nn.Module):
kv_cache=kv_caches[i - self.start_layer],
attn_metadata=attn_metadata,
)
if not get_pp_group().is_last_rank:
return IntermediateTensors({"hidden_states": hidden_states})
# Final layer norm.
if get_pp_group().is_last_rank and self.post_layer_norm:
if self.post_layer_norm:
hidden_states = self.final_layernorm(hidden_states)
return hidden_states
......@@ -534,61 +340,11 @@ class ChatGLMModel(nn.Module):
quant_config=quant_config,
prefix=f"{prefix}.output_layer")
vision_config_flag = getattr(config, 'vision_config', None)
if vision_config_flag is not None:
self.vision_config = Namespace(**config.vision_config)
self.vision = EVA2CLIPModel(self.config,
quant_config,
prefix=f"{prefix}.vision")
else:
self.vision = None
self.make_empty_intermediate_tensors = (
self.encoder.make_empty_intermediate_tensors)
def _parse_and_validate_image_input(
self, **kwargs: object) -> GLMImagePixelInputs:
pixel_values = kwargs.pop("pixel_values", None)
if pixel_values is not None and self.vision is not None:
if isinstance(pixel_values, torch.Tensor):
if pixel_values.ndim > 2:
pixel_values = torch.concat(list(pixel_values))
elif isinstance(pixel_values, list):
return torch.concat(pixel_values)
else:
raise TypeError("""pixel_values must be a torch.Tensor
or a list of torch.Tensor
""")
return GLMImagePixelInputs(pixel_values=pixel_values)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input["pixel_values"] is None:
return None
pixel_values = image_input["pixel_values"].to(
dtype=self.config.torch_dtype)
vision_embeddings = self.vision(pixel_values)
return vision_embeddings
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
inputs_embeds = self.embedding(input_ids)
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=[
self.config.boi_token_id,
self.config.pad_token_id,
self.config.eoi_token_id,
],
)
return inputs_embeds
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.embedding(input_ids)
def forward(
self,
......@@ -599,26 +355,24 @@ class ChatGLMModel(nn.Module):
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object,
) -> torch.Tensor:
) -> Union[torch.Tensor, IntermediateTensors]:
if get_pp_group().is_first_rank:
if inputs_embeds is not None:
hidden_states = inputs_embeds
else:
hidden_states = self.get_input_embeddings(input_ids)
else:
assert intermediate_tensors is not None
hidden_states = intermediate_tensors["hidden_states"]
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
if intermediate_tensors is not None:
inputs_embeds = intermediate_tensors["hidden_states"]
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
# Run encoder.
hidden_states = self.encoder(
hidden_states=inputs_embeds,
hidden_states=hidden_states,
position_ids=positions,
kv_caches=kv_caches,
attn_metadata=attn_metadata,
)
if not get_pp_group().is_last_rank:
return IntermediateTensors({"hidden_states": hidden_states})
return hidden_states
def load_weights(self, weights: Iterable[Tuple[str,
......@@ -660,12 +414,18 @@ class ChatGLMModel(nn.Module):
return loaded_params
class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
class ChatGLMBaseModel(nn.Module):
hf_to_vllm_mapper = WeightsMapper(
orig_to_new_substr={".word_embeddings": ""}, )
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
transformer_type: type[ChatGLMModel] = ChatGLMModel,
) -> None:
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
......@@ -678,7 +438,7 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
self.quant_config = quant_config
self.max_position_embeddings = getattr(config, "max_sequence_length",
8192)
self.transformer = ChatGLMModel(vllm_config=vllm_config,
self.transformer = transformer_type(vllm_config=vllm_config,
prefix=maybe_prefix(
prefix, "transformer"))
if self.config.tie_word_embeddings:
......@@ -687,18 +447,8 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
self.lm_head = self.transformer.output_layer
self.logits_processor = LogitsProcessor(config.padded_vocab_size)
self.sampler = get_sampler()
def forward(self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
**kwargs) -> torch.Tensor:
hidden_states = self.transformer(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
**kwargs)
return hidden_states
self.make_empty_intermediate_tensors = (
self.transformer.make_empty_intermediate_tensors)
def compute_logits(
self,
......@@ -722,7 +472,7 @@ class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP):
return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
class ChatGLM(ChatGLMBaseModel):
class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP):
packed_modules_mapping = {
"query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"]
......@@ -738,82 +488,28 @@ class ChatGLM(ChatGLMBaseModel):
embedding_modules = {}
embedding_padding_modules = []
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
if hasattr(config, "vision_config"):
hf_overrides = {"architectures": ["GLM4VForCausalLM"]}
raise RuntimeError(
"The configuration of this model indicates that it supports "
"vision inputs, but you instantiated the text-only version "
"of this model. Please use the vision model by setting "
f"`--hf-overrides {hf_overrides!r}`")
class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal):
packed_modules_mapping = {
"query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"],
"merged_proj": ["gate_proj", "dense_h_to_4h"]
}
# LoRA specific attributes
supported_lora_modules = [
"query_key_value",
"dense",
"dense_h_to_4h",
"dense_4h_to_h",
# vision
"fc1",
"fc2",
"merged_proj",
"linear_proj"
]
embedding_modules = {}
embedding_padding_modules = []
def get_mm_mapping(self) -> MultiModelKeys:
"""
Get the module prefix in multimodal models
"""
return MultiModelKeys.from_string_field(
language_model="transformer.encoder",
connector="transformer.vision.linear_proj",
tower_model="transformer.vision.transformer")
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
return self.transformer.get_multimodal_embeddings(**kwargs)
super().__init__(vllm_config=vllm_config, prefix=prefix)
def get_input_embeddings(
def forward(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
return self.transformer.get_input_embeddings(input_ids,
multimodal_embeddings)
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
info=GLM4VProcessingInfo,
dummy_inputs=GLM4VDummyInputsBuilder)
class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
SupportsMultiModal):
# Ensure that the LoRA support check passes when the class is not
# initialized, but set all these attributes to empty.
# These will be updated when an instance class is selected
packed_modules_mapping = {}
supported_lora_modules = []
embedding_modules = {}
embedding_padding_modules = []
def __new__(
cls,
vllm_config: VllmConfig,
prefix: str = "",
) -> None:
config = vllm_config.model_config.hf_config
# Initialize VL
if hasattr(config, "vision_config"): # noqa: SIM108
instance_cls = ChatGLMV
# Initialize LLM
else:
instance_cls = ChatGLM
# quant_config references base class members,
# so update values before init is called
cls.packed_modules_mapping.update(instance_cls.packed_modules_mapping)
cls.supported_lora_modules += instance_cls.supported_lora_modules
cls.embedding_modules.update(instance_cls.embedding_modules)
cls.embedding_padding_modules += instance_cls.embedding_padding_modules
return instance_cls(vllm_config=vllm_config, prefix=prefix)
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, IntermediateTensors]:
hidden_states = self.transformer(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
inputs_embeds)
return hidden_states
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/THUDM/GLM-4
"""Inference-only GLM-4v model visual encoder compatible with THUDM weights."""
# https://github.com/THUDM/CogAgent
"""Inference-only CogAgent model compatible with THUDM weights."""
from argparse import Namespace
from typing import Optional
from typing import List, Literal, Mapping, Optional, TypedDict, Union
import torch
from torch import nn
from torch.nn import LayerNorm
from torchvision import transforms
from torchvision.transforms import InterpolationMode
from transformers import PreTrainedTokenizer, TensorType
from transformers.image_utils import ImageInput
from transformers.tokenization_utils_base import TextInput
from vllm.attention import AttentionMetadata
from vllm.attention.layer import MultiHeadAttention
from vllm.config import VllmConfig
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
......@@ -18,11 +25,31 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear,
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors
from vllm.multimodal.parse import MultiModalDataItems
from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, BatchFeature,
MultiModalFieldConfig,
PromptReplacement)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import ChatGLMConfig
from .chatglm import ChatGLMBaseModel, ChatGLMModel
from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
from .utils import flatten_bn, merge_multimodal_embeddings
class PatchEmbedding(nn.Module):
class GLMVImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
"""Shape: `(batch_size, num_channels, height, width)`"""
class EVA2CLIPPatchEmbedding(nn.Module):
def __init__(self, config):
super().__init__()
......@@ -54,7 +81,7 @@ class PatchEmbedding(nn.Module):
return x
class Attention(nn.Module):
class EVA2CLIPAttention(nn.Module):
def __init__(
self,
......@@ -97,7 +124,7 @@ class Attention(nn.Module):
return output
class MLP(nn.Module):
class EVA2CLIPMLP(nn.Module):
def __init__(
self,
......@@ -128,7 +155,7 @@ class MLP(nn.Module):
return x
class TransformerLayer(nn.Module):
class EVA2CLIPTransformerLayer(nn.Module):
def __init__(
self,
......@@ -139,10 +166,10 @@ class TransformerLayer(nn.Module):
super().__init__()
self.input_layernorm = LayerNorm(config.hidden_size,
eps=config.layer_norm_eps)
self.attention = Attention(config,
self.attention = EVA2CLIPAttention(config,
quant_config=quant_config,
prefix=f"{prefix}.attention")
self.mlp = MLP(config,
self.mlp = EVA2CLIPMLP(config,
quant_config=quant_config,
prefix=f"{prefix}.mlp")
self.post_attention_layernorm = LayerNorm(config.hidden_size,
......@@ -159,7 +186,7 @@ class TransformerLayer(nn.Module):
return output
class Transformer(nn.Module):
class EVA2CLIPTransformer(nn.Module):
def __init__(
self,
......@@ -169,7 +196,7 @@ class Transformer(nn.Module):
):
super().__init__()
self.layers = nn.ModuleList([
TransformerLayer(config,
EVA2CLIPTransformerLayer(config,
quant_config=quant_config,
prefix=f"{prefix}.layers.{layer_idx}")
for layer_idx in range(config.num_hidden_layers)
......@@ -181,7 +208,7 @@ class Transformer(nn.Module):
return hidden_states
class GLU(nn.Module):
class EVA2CLIPGLU(nn.Module):
def __init__(
self,
......@@ -268,11 +295,11 @@ class EVA2CLIPModel(nn.Module):
):
super().__init__()
vision_config = Namespace(**config.vision_config)
self.patch_embedding = PatchEmbedding(vision_config)
self.transformer = Transformer(vision_config,
self.patch_embedding = EVA2CLIPPatchEmbedding(vision_config)
self.transformer = EVA2CLIPTransformer(vision_config,
quant_config=quant_config,
prefix=f"{prefix}.transformer")
self.linear_proj = GLU(config,
self.linear_proj = EVA2CLIPGLU(config,
in_features=config.hidden_size,
quant_config=quant_config,
prefix=f"{prefix}.linear_proj")
......@@ -310,3 +337,326 @@ class EVA2CLIPModel(nn.Module):
x = torch.cat((boi, x, eoi), dim=1)
x = x / self.scaling_factor
return x
class GLM4VModel(ChatGLMModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
quant_config = vllm_config.quant_config
self.vision = EVA2CLIPModel(self.config,
quant_config,
prefix=f"{prefix}.vision")
class GLM4VProcessor:
"""
This model doesn't define its own HF processor,
so we implement our own one here.
"""
def __init__(
self,
config: ChatGLMConfig,
tokenizer: PreTrainedTokenizer,
) -> None:
super().__init__()
self.config = config
self.tokenizer = tokenizer
vision_config = config.vision_config
image_size = vision_config["image_size"]
self.image_transform = transforms.Compose([
transforms.Resize(
(image_size, image_size),
interpolation=InterpolationMode.BICUBIC,
),
transforms.ToTensor(),
transforms.Normalize(
mean=(0.48145466, 0.4578275, 0.40821073),
std=(0.26862954, 0.26130258, 0.27577711),
),
])
def __call__(
self,
text: Optional[Union[TextInput, list[TextInput]]] = None,
images: Optional[Union[ImageInput, list[ImageInput]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchFeature:
if text is None:
text = []
if not isinstance(text, list):
text = [text]
if images is None:
images = []
if not isinstance(images, list):
images = [images]
text_inputs = self.tokenizer(text)
if len(images) == 0:
image_inputs = {}
else:
pixel_values = [self.image_transform(image) for image in images]
image_inputs = {"pixel_values": torch.stack(pixel_values)}
return BatchFeature(
{
**text_inputs,
**image_inputs,
},
tensor_type=return_tensors,
)
class GLM4VProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self):
tokenizer = self.ctx.tokenizer
assert isinstance(tokenizer, PreTrainedTokenizer)
return tokenizer
def get_hf_config(self):
return self.ctx.get_hf_config(ChatGLMConfig)
def get_hf_processor(self) -> GLM4VProcessor:
return GLM4VProcessor(
self.get_hf_config(),
self.get_tokenizer(),
)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": 1}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {"image": self.get_num_image_feature_tokens()}
def get_num_image_tokens(self) -> int:
hf_config = self.get_hf_config()
vision_config = hf_config.vision_config
image_size = vision_config["image_size"]
patch_size = vision_config["patch_size"]
grid_length = image_size // patch_size // 2
return grid_length * grid_length
def get_num_image_feature_tokens(self) -> int:
# EVA2CLIPModel has embeddings for boi and eoi tokens as well
return self.get_num_image_tokens() + 2
class GLM4VDummyInputsBuilder(BaseDummyInputsBuilder[GLM4VProcessingInfo]):
def get_dummy_processor_inputs(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> ProcessorInputs:
hf_config = self.info.get_hf_config()
vision_config = hf_config.vision_config
target_width = target_height = vision_config["image_size"]
num_images = mm_counts.get("image", 0)
mm_data = {
"image":
self._get_dummy_images(width=target_width,
height=target_height,
num_images=num_images)
}
base_text = "<|begin_of_image|><|endoftext|><|end_of_image|>"
return ProcessorInputs(
prompt_text=base_text * num_images,
mm_data=mm_data,
)
class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
hf_processor_mm_kwargs: Mapping[str, object],
) -> Mapping[str, MultiModalFieldConfig]:
return dict(pixel_values=MultiModalFieldConfig.batched("image"))
def _get_prompt_replacements(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
out_mm_kwargs: MultiModalKwargs,
) -> list[PromptReplacement]:
hf_config = self.info.get_hf_config()
boi_token_id = hf_config.boi_token_id
image_token_id = hf_config.pad_token_id
eoi_token_id = hf_config.eoi_token_id
def get_replacement(item_idx: int):
num_image_tokens = self.info.get_num_image_tokens()
image_tokens = [image_token_id] * num_image_tokens
return [boi_token_id] + image_tokens + [eoi_token_id]
return [
PromptReplacement(
modality="image",
target=[boi_token_id, image_token_id, eoi_token_id],
replacement=get_replacement,
),
]
@MULTIMODAL_REGISTRY.register_processor(GLM4VMultiModalProcessor,
info=GLM4VProcessingInfo,
dummy_inputs=GLM4VDummyInputsBuilder)
class GLM4VForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP,
SupportsMultiModal):
packed_modules_mapping = {
"query_key_value": ["query_key_value"],
"dense_h_to_4h": ["dense_h_to_4h"],
"merged_proj": ["gate_proj", "dense_h_to_4h"]
}
# LoRA specific attributes
supported_lora_modules = [
"query_key_value",
"dense",
"dense_h_to_4h",
"dense_4h_to_h",
# vision
"fc1",
"fc2",
"merged_proj",
"linear_proj"
]
embedding_modules = {}
embedding_padding_modules = []
def get_mm_mapping(self) -> MultiModelKeys:
"""
Get the module prefix in multimodal models
"""
return MultiModelKeys.from_string_field(
language_model="transformer.encoder",
connector="transformer.vision.linear_proj",
tower_model="transformer.vision.transformer")
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
transformer_type: type[GLM4VModel] = GLM4VModel,
) -> None:
super().__init__(
vllm_config=vllm_config,
prefix=prefix,
transformer_type=transformer_type,
)
self.transformer: GLM4VModel
def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
h = w = self.config.vision_config["image_size"]
expected_dims = (3, h, w)
actual_dims = tuple(data.shape[1:])
if actual_dims != expected_dims:
expected_expr = ("batch_size", *map(str, expected_dims))
raise ValueError(
f"The expected shape of pixel values is {expected_expr}. "
f"You supplied {tuple(data.shape)}.")
return data
def _parse_and_validate_image_input(
self, **kwargs: object) -> Optional[GLMVImagePixelInputs]:
pixel_values = kwargs.pop("pixel_values", None)
if pixel_values is not None:
if not isinstance(pixel_values, torch.Tensor):
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")
return GLMVImagePixelInputs(
type="pixel_values",
data=self._validate_pixel_values(
flatten_bn(pixel_values, concat=True)),
)
return None
def _process_image_input(
self, image_input: GLMVImagePixelInputs) -> torch.Tensor:
pixel_values = image_input["data"].to(dtype=self.config.torch_dtype)
return self.transformer.vision(pixel_values)
def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
image_input = self._parse_and_validate_image_input(**kwargs)
if image_input is None:
return None
vision_embeddings = self._process_image_input(image_input)
return vision_embeddings
def get_input_embeddings(
self,
input_ids: torch.Tensor,
multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
inputs_embeds = self.transformer.get_input_embeddings(input_ids)
if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
multimodal_embeddings=multimodal_embeddings,
placeholder_token_id=[
self.config.boi_token_id,
self.config.pad_token_id,
self.config.eoi_token_id,
],
)
return inputs_embeds
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:
if intermediate_tensors is not None:
inputs_embeds = None
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif inputs_embeds is None:
vision_embeddings = self.get_multimodal_embeddings(**kwargs)
inputs_embeds = self.get_input_embeddings(input_ids,
vision_embeddings)
input_ids = None
hidden_states = self.transformer(input_ids, positions, kv_caches,
attn_metadata, intermediate_tensors,
inputs_embeds)
return hidden_states
This diff is collapsed.
This diff is collapsed.
......@@ -39,7 +39,7 @@ _TEXT_GENERATION_MODELS = {
"BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"),
"BambaForCausalLM": ("bamba", "BambaForCausalLM"),
"BloomForCausalLM": ("bloom", "BloomForCausalLM"),
# ChatGLMModel supports multimodal
"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
"CohereForCausalLM": ("commandr", "CohereForCausalLM"),
"Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
"DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
......@@ -90,7 +90,7 @@ _TEXT_GENERATION_MODELS = {
"Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"),
"Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"),
"PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"),
# QWenLMHeadModel supports multimodal
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
"Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
"RWForCausalLM": ("falcon", "FalconForCausalLM"),
......@@ -156,10 +156,9 @@ _MULTIMODAL_MODELS = {
"AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"),
"Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501
"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
"ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
"DeepseekVLV2ForCausalLM": ("deepseek_vl2", "DeepseekVLV2ForCausalLM"),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"),
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
......@@ -175,7 +174,7 @@ _MULTIMODAL_MODELS = {
"PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"QwenVLForConditionalGeneration": ("qwen_vl", "QwenVLForConditionalGeneration"), # noqa: E501
"Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), # noqa: E501
"Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), # noqa: E501
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment