Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
8452946c
Unverified
Commit
8452946c
authored
Jul 02, 2025
by
Kwai-Keye
Committed by
GitHub
Jul 01, 2025
Browse files
[Model][VLM] Support Keye-VL-8B-Preview (#20126)
Signed-off-by:
Kwai-Keye
<
Keye@kuaishou.com
>
parent
2e7cbf2d
Changes
7
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
1801 additions
and
2 deletions
+1801
-2
docs/models/supported_models.md
docs/models/supported_models.md
+1
-0
examples/offline_inference/vision_language.py
examples/offline_inference/vision_language.py
+32
-0
examples/offline_inference/vision_language_multi_image.py
examples/offline_inference/vision_language_multi_image.py
+38
-0
tests/models/registry.py
tests/models/registry.py
+2
-0
vllm/entrypoints/chat_utils.py
vllm/entrypoints/chat_utils.py
+2
-2
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye.py
+1725
-0
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+1
-0
No files found.
docs/models/supported_models.md
View file @
8452946c
...
@@ -559,6 +559,7 @@ Specified using `--task generate`.
...
@@ -559,6 +559,7 @@ Specified using `--task generate`.
|
`H2OVLChatModel`
| H2OVL | T + I
<sup>
E+
</sup>
|
`h2oai/h2ovl-mississippi-800m`
,
`h2oai/h2ovl-mississippi-2b`
, etc. | | ✅︎ | ✅︎
\*
|
|
`H2OVLChatModel`
| H2OVL | T + I
<sup>
E+
</sup>
|
`h2oai/h2ovl-mississippi-800m`
,
`h2oai/h2ovl-mississippi-2b`
, etc. | | ✅︎ | ✅︎
\*
|
|
`Idefics3ForConditionalGeneration`
| Idefics3 | T + I |
`HuggingFaceM4/Idefics3-8B-Llama3`
etc. | ✅︎ | | ✅︎ |
|
`Idefics3ForConditionalGeneration`
| Idefics3 | T + I |
`HuggingFaceM4/Idefics3-8B-Llama3`
etc. | ✅︎ | | ✅︎ |
|
`InternVLChatModel`
| InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I
<sup>
E+
</sup>
+ (V
<sup>
E+
</sup>
) |
`OpenGVLab/InternVL3-9B`
,
`OpenGVLab/InternVideo2_5_Chat_8B`
,
`OpenGVLab/InternVL2_5-4B`
,
`OpenGVLab/Mono-InternVL-2B`
,
`OpenGVLab/InternVL2-4B`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`InternVLChatModel`
| InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I
<sup>
E+
</sup>
+ (V
<sup>
E+
</sup>
) |
`OpenGVLab/InternVL3-9B`
,
`OpenGVLab/InternVideo2_5_Chat_8B`
,
`OpenGVLab/InternVL2_5-4B`
,
`OpenGVLab/Mono-InternVL-2B`
,
`OpenGVLab/InternVL2-4B`
, etc. | ✅︎ | ✅︎ | ✅︎ |
|
`KeyeForConditionalGeneration`
| Keye-VL-8B-Preview | T + I
<sup>
E+
</sup>
+ V
<sup>
E+
</sup>
|
`Kwai-Keye/Keye-VL-8B-Preview`
| | | ✅︎ |
|
`KimiVLForConditionalGeneration`
| Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I
<sup>
+
</sup>
|
`moonshotai/Kimi-VL-A3B-Instruct`
,
`moonshotai/Kimi-VL-A3B-Thinking`
| | | ✅︎ |
|
`KimiVLForConditionalGeneration`
| Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I
<sup>
+
</sup>
|
`moonshotai/Kimi-VL-A3B-Instruct`
,
`moonshotai/Kimi-VL-A3B-Thinking`
| | | ✅︎ |
|
`Llama4ForConditionalGeneration`
| Llama 4 | T + I
<sup>
+
</sup>
|
`meta-llama/Llama-4-Scout-17B-16E-Instruct`
,
`meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`
,
`meta-llama/Llama-4-Maverick-17B-128E-Instruct`
, etc. | | ✅︎ | ✅︎ |
|
`Llama4ForConditionalGeneration`
| Llama 4 | T + I
<sup>
+
</sup>
|
`meta-llama/Llama-4-Scout-17B-16E-Instruct`
,
`meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`
,
`meta-llama/Llama-4-Maverick-17B-128E-Instruct`
, etc. | | ✅︎ | ✅︎ |
|
`LlavaForConditionalGeneration`
| LLaVA-1.5 | T + I
<sup>
E+
</sup>
|
`llava-hf/llava-1.5-7b-hf`
,
`TIGER-Lab/Mantis-8B-siglip-llama3`
(see note), etc. | | ✅︎ | ✅︎ |
|
`LlavaForConditionalGeneration`
| LLaVA-1.5 | T + I
<sup>
E+
</sup>
|
`llava-hf/llava-1.5-7b-hf`
,
`TIGER-Lab/Mantis-8B-siglip-llama3`
(see note), etc. | | ✅︎ | ✅︎ |
...
...
examples/offline_inference/vision_language.py
View file @
8452946c
...
@@ -429,6 +429,37 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
...
@@ -429,6 +429,37 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
)
# Keye-VL
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and prompts for Kwai-Keye/Keye-VL-8B-Preview.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Pad token inserted between <|vision_start|> and <|vision_end|> for each
    # supported modality.
    placeholders = {
        "image": "<|image_pad|>",
        "video": "<|video_pad|>",
    }
    # Fail fast with a clear message: without this check an unsupported
    # modality would only surface later as an UnboundLocalError on
    # `placeholder`, after the engine arguments were already constructed.
    if modality not in placeholders:
        raise ValueError(
            f"Unsupported modality for Keye-VL: {modality!r} "
            f"(expected one of {sorted(placeholders)})"
        )
    placeholder = placeholders[modality]

    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Kimi-VL
# Kimi-VL
def
run_kimi_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
def
run_kimi_vl
(
questions
:
list
[
str
],
modality
:
str
)
->
ModelRequestData
:
assert
modality
==
"image"
assert
modality
==
"image"
...
@@ -1154,6 +1185,7 @@ model_example_map = {
...
@@ -1154,6 +1185,7 @@ model_example_map = {
"h2ovl_chat"
:
run_h2ovl
,
"h2ovl_chat"
:
run_h2ovl
,
"idefics3"
:
run_idefics3
,
"idefics3"
:
run_idefics3
,
"internvl_chat"
:
run_internvl
,
"internvl_chat"
:
run_internvl
,
"keye_vl"
:
run_keye_vl
,
"kimi_vl"
:
run_kimi_vl
,
"kimi_vl"
:
run_kimi_vl
,
"llava"
:
run_llava
,
"llava"
:
run_llava
,
"llava-next"
:
run_llava_next
,
"llava-next"
:
run_llava_next
,
...
...
examples/offline_inference/vision_language_multi_image.py
View file @
8452946c
...
@@ -423,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
...
@@ -423,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
)
)
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for Kwai-Keye/Keye-VL-8B-Preview.

    Args:
        question: The text question placed after the images in the user turn.
        image_urls: URLs of the images; each one becomes an image entry in
            the chat message and is fetched as image data.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the prompt rendered
        by the processor's chat template, and the fetched images.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    # Allow exactly as many images per prompt as were supplied.
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: all image entries first, then the question text.
    content = [{"type": "image", "image": url} for url in image_urls]
    content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": content}]

    processor = AutoProcessor.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    image_data = list(map(fetch_image, image_urls))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
def
load_kimi_vl
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
def
load_kimi_vl
(
question
:
str
,
image_urls
:
list
[
str
])
->
ModelRequestData
:
model_name
=
"moonshotai/Kimi-VL-A3B-Instruct"
model_name
=
"moonshotai/Kimi-VL-A3B-Instruct"
...
@@ -862,6 +899,7 @@ model_example_map = {
...
@@ -862,6 +899,7 @@ model_example_map = {
"h2ovl_chat"
:
load_h2ovl
,
"h2ovl_chat"
:
load_h2ovl
,
"idefics3"
:
load_idefics3
,
"idefics3"
:
load_idefics3
,
"internvl_chat"
:
load_internvl
,
"internvl_chat"
:
load_internvl
,
"keye_vl"
:
load_keye_vl
,
"kimi_vl"
:
load_kimi_vl
,
"kimi_vl"
:
load_kimi_vl
,
"llava"
:
load_llava
,
"llava"
:
load_llava
,
"llava-next"
:
load_llava_next
,
"llava-next"
:
load_llava_next
,
...
...
tests/models/registry.py
View file @
8452946c
...
@@ -351,6 +351,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -351,6 +351,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
"HuggingFaceM4/Idefics3-8B-Llama3"
,
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
{
"tiny"
:
"HuggingFaceTB/SmolVLM-256M-Instruct"
}),
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
trust_remote_code
=
True
),
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
"moonshotai/Kimi-VL-A3B-Instruct"
,
# noqa: E501
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
# noqa: E501
extras
=
{
"thinking"
:
"moonshotai/Kimi-VL-A3B-Thinking"
},
# noqa: E501
trust_remote_code
=
True
,
trust_remote_code
=
True
,
...
...
vllm/entrypoints/chat_utils.py
View file @
8452946c
...
@@ -540,7 +540,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
...
@@ -540,7 +540,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return
"<image>"
return
"<image>"
if
model_type
in
(
"mllama"
,
"llama4"
):
if
model_type
in
(
"mllama"
,
"llama4"
):
return
"<|image|>"
return
"<|image|>"
if
model_type
in
(
"qwen2_vl"
,
"qwen2_5_vl"
):
if
model_type
in
(
"qwen2_vl"
,
"qwen2_5_vl"
,
"keye"
,
"Keye"
):
return
"<|vision_start|><|image_pad|><|vision_end|>"
return
"<|vision_start|><|image_pad|><|vision_end|>"
if
model_type
==
"qwen2_5_omni"
:
if
model_type
==
"qwen2_5_omni"
:
return
"<|vision_start|><|IMAGE|><|vision_end|>"
return
"<|vision_start|><|IMAGE|><|vision_end|>"
...
@@ -570,7 +570,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
...
@@ -570,7 +570,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return
"<video>"
return
"<video>"
if
model_type
==
"glm4v"
:
if
model_type
==
"glm4v"
:
return
"<|begin_of_video|><|video|><|end_of_video|>"
return
"<|begin_of_video|><|video|><|end_of_video|>"
if
model_type
in
(
"qwen2_vl"
,
"qwen2_5_vl"
):
if
model_type
in
(
"qwen2_vl"
,
"qwen2_5_vl"
,
"keye"
,
"Keye"
):
return
"<|vision_start|><|video_pad|><|vision_end|>"
return
"<|vision_start|><|video_pad|><|vision_end|>"
if
model_type
==
"qwen2_5_omni"
:
if
model_type
==
"qwen2_5_omni"
:
return
"<|vision_start|><|VIDEO|><|vision_end|>"
return
"<|vision_start|><|VIDEO|><|vision_end|>"
...
...
vllm/model_executor/models/keye.py
0 → 100644
View file @
8452946c
This diff is collapsed.
Click to expand it.
vllm/model_executor/models/registry.py
View file @
8452946c
...
@@ -197,6 +197,7 @@ _MULTIMODAL_MODELS = {
...
@@ -197,6 +197,7 @@ _MULTIMODAL_MODELS = {
"InternVLChatModel"
:
(
"internvl"
,
"InternVLChatModel"
),
"InternVLChatModel"
:
(
"internvl"
,
"InternVLChatModel"
),
"Idefics3ForConditionalGeneration"
:(
"idefics3"
,
"Idefics3ForConditionalGeneration"
),
"Idefics3ForConditionalGeneration"
:(
"idefics3"
,
"Idefics3ForConditionalGeneration"
),
"SmolVLMForConditionalGeneration"
:
(
"smolvlm"
,
"SmolVLMForConditionalGeneration"
),
# noqa: E501
"SmolVLMForConditionalGeneration"
:
(
"smolvlm"
,
"SmolVLMForConditionalGeneration"
),
# noqa: E501
"KeyeForConditionalGeneration"
:
(
"keye"
,
"KeyeForConditionalGeneration"
),
"KimiVLForConditionalGeneration"
:
(
"kimi_vl"
,
"KimiVLForConditionalGeneration"
),
# noqa: E501
"KimiVLForConditionalGeneration"
:
(
"kimi_vl"
,
"KimiVLForConditionalGeneration"
),
# noqa: E501
"LlavaForConditionalGeneration"
:
(
"llava"
,
"LlavaForConditionalGeneration"
),
"LlavaForConditionalGeneration"
:
(
"llava"
,
"LlavaForConditionalGeneration"
),
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
),
# noqa: E501
"LlavaNextForConditionalGeneration"
:
(
"llava_next"
,
"LlavaNextForConditionalGeneration"
),
# noqa: E501
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment