Unverified Commit 4efd43e9 authored by qizixi's avatar qizixi Committed by GitHub
Browse files

Fix GLM-4.5V-FP8 numerical issue (#22949)


Signed-off-by: default avatarqizixi <qizixi@meta.com>
Co-authored-by: default avatarCyrus Leung <tlleungac@connect.ust.hk>
parent 3c8a7872
......@@ -333,6 +333,80 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
)
# GLM-4.5V
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
mm_processor_kwargs={
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
enforce_eager=True,
tensor_parallel_size=4,
)
if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
prompts = [
(
"[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
f"{placeholder}"
f"{question}<|assistant|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# GLM-4.5V-FP8
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V-FP8"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
mm_processor_kwargs={
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
enforce_eager=True,
tensor_parallel_size=4,
)
if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
prompts = [
(
"[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
f"{placeholder}"
f"{question}<|assistant|>assistant\n"
)
for question in questions
]
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -383,8 +457,8 @@ def run_hyperclovax_seed_vision(
for question in questions:
if modality == "image":
"""
ocr: List the words in the image in raster order.
Even if the word order feels unnatural for reading,
ocr: List the words in the image in raster order.
Even if the word order feels unnatural for reading,
the model will handle it as long as it follows raster order.
e.g. "Naver, CLOVA, bigshane"
lens_keywords: List the entity names in the image.
......@@ -1448,6 +1522,8 @@ model_example_map = {
"gemma3n": run_gemma3n,
"glm4v": run_glm4v,
"glm4_1v": run_glm4_1v,
"glm4_5v": run_glm4_5v,
"glm4_5v_fp8": run_glm4_5v_fp8,
"h2ovl_chat": run_h2ovl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
......
......@@ -1064,6 +1064,76 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
)
# GLM-4.5V
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V"
engine_args = EngineArgs(
model=model_name,
max_model_len=32768,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enforce_eager=True,
tensor_parallel_size=4,
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
}
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
# GLM-4.5V-FP8
def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V-FP8"
engine_args = EngineArgs(
model=model_name,
max_model_len=32768,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
enforce_eager=True,
tensor_parallel_size=4,
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
"role": "user",
"content": [
*placeholders,
{"type": "text", "text": question},
],
}
]
processor = AutoProcessor.from_pretrained(model_name)
prompt = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_data = [fetch_image(url) for url in image_urls]
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=image_data,
)
model_example_map = {
"aria": load_aria,
"aya_vision": load_aya_vision,
......@@ -1096,6 +1166,8 @@ model_example_map = {
"step3": load_step3,
"tarsier": load_tarsier,
"tarsier2": load_tarsier2,
"glm4_5v": load_glm4_5v,
"glm4_5v_fp8": load_glm4_5v_fp8,
}
......
......@@ -126,7 +126,7 @@ class Glm4vVideoPixelInputs(TensorSchema):
- ctpp: Number of channels * temporal_patch_size *
patch_size * patch_size
- f: Number of frames
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
video, grid_h, grid_w)
"""
type: Literal["pixel_values_videos"] = "pixel_values_videos"
......@@ -141,7 +141,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
- p: Number of video patches across all frames
- h: Hidden size (must match language model backbone)
- f: Number of frames
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
- g: Grid dimensions (3 for grid_t which is usually 1 for processed
video, grid_h, grid_w)
"""
type: Literal["video_embeds"] = "video_embeds"
......@@ -234,7 +234,8 @@ class Glm4vVisionAttention(nn.Module):
total_num_kv_heads=num_heads,
bias=False,
quant_config=quant_config,
prefix=f"{prefix}.qkv",
# Change qkv prefix to align with GLM-4.5V-FP8 quantization config
prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
)
self.proj = RowParallelLinear(
input_size=projection_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment