Commit 4efd43e9 (unverified), authored by qizixi, committed by GitHub
Browse files

Fix GLM-4.5V-FP8 numerical issue (#22949)


Signed-off-by: qizixi <qizixi@meta.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
parent 3c8a7872
...@@ -333,6 +333,80 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -333,6 +333,80 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
) )
# GLM-4.5V
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the GLM-4.5V example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``. Selects the multimodal
            placeholder tokens inserted ahead of each question.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate before doing anything else: without the explicit ``else`` an
    # unsupported modality would leave ``placeholder`` unbound and only fail
    # later, inside the prompt comprehension, with an UnboundLocalError.
    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
    else:
        raise ValueError(
            f"Unsupported modality for GLM-4.5V: {modality!r} "
            "(expected 'image' or 'video')"
        )

    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        # Exactly one item of the requested modality per prompt.
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # The multimodal placeholder goes directly before the question text
    # inside the user turn of the chat template.
    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# GLM-4.5V-FP8
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the GLM-4.5V-FP8 example.

    Args:
        questions: Questions to ask; one prompt is produced per question.
        modality: Either ``"image"`` or ``"video"``. Selects the multimodal
            placeholder tokens inserted ahead of each question.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    # Validate before doing anything else: without the explicit ``else`` an
    # unsupported modality would leave ``placeholder`` unbound and only fail
    # later, inside the prompt comprehension, with an UnboundLocalError.
    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
    else:
        raise ValueError(
            f"Unsupported modality for GLM-4.5V-FP8: {modality!r} "
            "(expected 'image' or 'video')"
        )

    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        # Exactly one item of the requested modality per prompt.
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # The multimodal placeholder goes directly before the question text
    # inside the user turn of the chat template.
    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# H2OVL-Mississippi # H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -383,8 +457,8 @@ def run_hyperclovax_seed_vision( ...@@ -383,8 +457,8 @@ def run_hyperclovax_seed_vision(
for question in questions: for question in questions:
if modality == "image": if modality == "image":
""" """
ocr: List the words in the image in raster order. ocr: List the words in the image in raster order.
Even if the word order feels unnatural for reading, Even if the word order feels unnatural for reading,
the model will handle it as long as it follows raster order. the model will handle it as long as it follows raster order.
e.g. "Naver, CLOVA, bigshane" e.g. "Naver, CLOVA, bigshane"
lens_keywords: List the entity names in the image. lens_keywords: List the entity names in the image.
...@@ -1448,6 +1522,8 @@ model_example_map = { ...@@ -1448,6 +1522,8 @@ model_example_map = {
"gemma3n": run_gemma3n, "gemma3n": run_gemma3n,
"glm4v": run_glm4v, "glm4v": run_glm4v,
"glm4_1v": run_glm4_1v, "glm4_1v": run_glm4_1v,
"glm4_5v": run_glm4_5v,
"glm4_5v_fp8": run_glm4_5v_fp8,
"h2ovl_chat": run_h2ovl, "h2ovl_chat": run_h2ovl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision, "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3, "idefics3": run_idefics3,
......
...@@ -1064,6 +1064,76 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -1064,6 +1064,76 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
) )
# GLM-4.5V
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for GLM-4.5V.

    Args:
        question: Text question appended after all images in the user turn.
        image_urls: URLs of the images; each becomes one image entry in the
            chat message and one fetched image in the request.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the chat-templated
        prompt string and the fetched image data.
    """
    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        # Allow as many images per prompt as were supplied.
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # Single user turn: every image entry first, then the question text.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    chat = [{"role": "user", "content": user_content}]

    # Render the chat to a prompt string with the model's own template.
    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
# GLM-4.5V-FP8
def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for GLM-4.5V-FP8.

    Args:
        question: Text question appended after all images in the user turn.
        image_urls: URLs of the images; each becomes one image entry in the
            chat message and one fetched image in the request.

    Returns:
        A ``ModelRequestData`` with the engine arguments, the chat-templated
        prompt string and the fetched image data.
    """
    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        # Allow as many images per prompt as were supplied.
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # Single user turn: every image entry first, then the question text.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    chat = [{"role": "user", "content": user_content}]

    # Render the chat to a prompt string with the model's own template.
    prompt = AutoProcessor.from_pretrained(model_name).apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
model_example_map = { model_example_map = {
"aria": load_aria, "aria": load_aria,
"aya_vision": load_aya_vision, "aya_vision": load_aya_vision,
...@@ -1096,6 +1166,8 @@ model_example_map = { ...@@ -1096,6 +1166,8 @@ model_example_map = {
"step3": load_step3, "step3": load_step3,
"tarsier": load_tarsier, "tarsier": load_tarsier,
"tarsier2": load_tarsier2, "tarsier2": load_tarsier2,
"glm4_5v": load_glm4_5v,
"glm4_5v_fp8": load_glm4_5v_fp8,
} }
......
...@@ -126,7 +126,7 @@ class Glm4vVideoPixelInputs(TensorSchema): ...@@ -126,7 +126,7 @@ class Glm4vVideoPixelInputs(TensorSchema):
- ctpp: Number of channels * temporal_patch_size * - ctpp: Number of channels * temporal_patch_size *
patch_size * patch_size patch_size * patch_size
- f: Number of frames - f: Number of frames
- g: Grid dimensions (3 for grid_t which is usually 1 for processed - g: Grid dimensions (3 for grid_t which is usually 1 for processed
video, grid_h, grid_w) video, grid_h, grid_w)
""" """
type: Literal["pixel_values_videos"] = "pixel_values_videos" type: Literal["pixel_values_videos"] = "pixel_values_videos"
...@@ -141,7 +141,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema): ...@@ -141,7 +141,7 @@ class Glm4vVideoEmbeddingInputs(TensorSchema):
- p: Number of video patches across all frames - p: Number of video patches across all frames
- h: Hidden size (must match language model backbone) - h: Hidden size (must match language model backbone)
- f: Number of frames - f: Number of frames
- g: Grid dimensions (3 for grid_t which is usually 1 for processed - g: Grid dimensions (3 for grid_t which is usually 1 for processed
video, grid_h, grid_w) video, grid_h, grid_w)
""" """
type: Literal["video_embeds"] = "video_embeds" type: Literal["video_embeds"] = "video_embeds"
...@@ -234,7 +234,8 @@ class Glm4vVisionAttention(nn.Module): ...@@ -234,7 +234,8 @@ class Glm4vVisionAttention(nn.Module):
total_num_kv_heads=num_heads, total_num_kv_heads=num_heads,
bias=False, bias=False,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.qkv", # Change qkv prefix to align with GLM-4.5V-FP8 quantization config
prefix=f"{prefix}.qkv_proj" if quant_config else f"{prefix}.qkv",
) )
self.proj = RowParallelLinear( self.proj = RowParallelLinear(
input_size=projection_size, input_size=projection_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment