Unverified Commit 29c6fbe5 authored by bigshanedogg's avatar bigshanedogg Committed by GitHub
Browse files

[MODEL] New model support for naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B (#20931)


Signed-off-by: default avatarbigshanedogg <bigshane319@gmail.com>
parent c72f049c
......@@ -365,6 +365,7 @@ th {
| `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ |
| `HunYuanDenseV1ForCausalLM` | Hunyuan-7B-Instruct-0124 | `tencent/Hunyuan-7B-Instruct-0124` | ✅︎ | | ✅︎ |
| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | | ✅︎ |
| `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
......
......@@ -316,6 +316,85 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
)
# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
questions: list[str], modality: str
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192 if modality == "image" else 16384,
limit_mm_per_prompt={modality: 1},
)
messages = list()
for question in questions:
if modality == "image":
"""
ocr: List the words in the image in raster order.
Even if the word order feels unnatural for reading,
the model will handle it as long as it follows raster order.
e.g. "Naver, CLOVA, bigshane"
lens_keywords: List the entity names in the image.
e.g. "iPhone"
lens_local_keywords: List the entity names with quads in the image.
e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
"""
messages.append(
[
{
"role": "user",
"content": [
{
"type": "image",
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
},
{
"type": "text",
"text": question,
},
],
}
]
)
elif modality == "video":
messages.append(
[
{
"role": "user",
"content": [
{
"type": "video",
},
{
"type": "text",
"text": question,
},
],
}
]
)
else:
raise ValueError(f"Unsupported modality: {modality}")
prompts = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
stop_token_ids=None,
)
# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -1222,6 +1301,7 @@ model_example_map = {
"glm4v": run_glm4v,
"glm4_1v": run_glm4_1v,
"h2ovl_chat": run_h2ovl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
"internvl_chat": run_internvl,
"nemotron_vl": run_nemotron_vl,
......
......@@ -289,6 +289,53 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_hyperclovax_seed_vision(
question: str, image_urls: list[str]
) -> ModelRequestData:
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=16384,
limit_mm_per_prompt={"image": len(image_urls)},
)
message = {"role": "user", "content": list()}
for _image_url in image_urls:
message["content"].append(
{
"type": "image",
"image": _image_url,
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
}
)
message["content"].append(
{
"type": "text",
"text": question,
}
)
prompt = tokenizer.apply_chat_template(
[
message,
],
tokenize=False,
add_generation_prompt=True,
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
)
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
# NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
# it will generate poor response for multi-image inputs!
......@@ -900,6 +947,7 @@ model_example_map = {
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
"keye_vl": load_keye_vl,
"kimi_vl": load_kimi_vl,
"llava": load_llava,
......
......@@ -278,6 +278,7 @@ def _test_processing_correctness_one(
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"llava-hf/LLaVA-NeXT-Video-7B-hf",
......
......@@ -201,6 +201,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
trust_remote_code=True),
"HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
trust_remote_code=True),
"HCXVisionForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
trust_remote_code=True),
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
......
This diff is collapsed.
......@@ -81,6 +81,7 @@ _TEXT_GENERATION_MODELS = {
"Grok1ModelForCausalLM": ("grok1", "Grok1ForCausalLM"),
"HunYuanMoEV1ForCausalLM": ("hunyuan_v1", "HunYuanMoEV1ForCausalLM"),
"HunYuanDenseV1ForCausalLM": ("hunyuan_v1", "HunYuanDenseV1ForCausalLM"),
"HCXVisionForCausalLM": ("hyperclovax_vision", "HCXVisionForCausalLM"),
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
"InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment