Unverified Commit 3964b352 authored by Mick's avatar Mick Committed by GitHub
Browse files

chore: tune mem fraction static for vlm (#6881)

parent 9c7a4618
...@@ -411,7 +411,7 @@ class ModelRunner: ...@@ -411,7 +411,7 @@ class ModelRunner:
else: else:
server_args.attention_backend = "triton" server_args.attention_backend = "triton"
logger.info( logger.info(
f"Attention backend not set. Use {server_args.attention_backend} backend by default." f"Attention backend not explicitly specified. Use {server_args.attention_backend} backend by default."
) )
elif self.use_mla_backend: elif self.use_mla_backend:
if server_args.device != "cpu": if server_args.device != "cpu":
...@@ -463,7 +463,7 @@ class ModelRunner: ...@@ -463,7 +463,7 @@ class ModelRunner:
if not self.is_multimodal_chunked_prefill_supported: if not self.is_multimodal_chunked_prefill_supported:
server_args.chunked_prefill_size = -1 server_args.chunked_prefill_size = -1
logger.info( logger.info(
f"Automatically turn of --chunked-prefill-size as it is not supported for " f"Automatically turn off --chunked-prefill-size as it is not supported for "
f"{self.model_config.hf_config.model_type}" f"{self.model_config.hf_config.model_type}"
) )
......
...@@ -337,8 +337,52 @@ class ServerArgs: ...@@ -337,8 +337,52 @@ class ServerArgs:
# Multimodal models need more memory for the image processor # Multimodal models need more memory for the image processor
model_config = ModelConfig.from_server_args(self) model_config = ModelConfig.from_server_args(self)
if model_config.is_multimodal:
self.mem_fraction_static *= 0.90 vision_config = getattr(model_config.hf_config, "vision_config", None)
if model_config.is_multimodal and vision_config:
# roughly reduce mem_fraction_static based on the parameter count of the ViT
original_server_arg_mem_fraction = self.mem_fraction_static
# a base mem_fraction_static reduction factor for a regular-sized ViT
base_mem_fraction_reduction_ratio = 0.95
vit_num_layers = getattr(vision_config, "num_hidden_layers", 24)
vit_hidden_size = getattr(vision_config, "hidden_size", 1024)
# baseline ViT params (ViT-L/14)
baseline_vit_layers = 24
baseline_vit_hidden_size = 1024
# weight params count
current_complexity_score = vit_num_layers * (vit_hidden_size**2)
baseline_complexity_score = baseline_vit_layers * (
baseline_vit_hidden_size**2
)
complexity_ratio = (
current_complexity_score / baseline_complexity_score
if baseline_complexity_score > 0
else 1.0
)
# for every 100% growth in complexity, reduce the final factor by 10%
sensitivity_scale = 0.1
dynamic_adjustment_factor = 1.0 - sensitivity_scale * (
complexity_ratio - 1.0
)
dynamic_adjustment_factor = max(
0.8, min(1.05, dynamic_adjustment_factor)
)
final_overall_factor = (
base_mem_fraction_reduction_ratio * dynamic_adjustment_factor
)
self.mem_fraction_static = (
original_server_arg_mem_fraction * final_overall_factor
)
logger.warning(
f"Multimodal model: Dynamically adjusted --mem-fraction-static "
f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
)
# Set chunked prefill size, which depends on the gpu memory capacity # Set chunked prefill size, which depends on the gpu memory capacity
if self.chunked_prefill_size is None: if self.chunked_prefill_size is None:
......
...@@ -30,7 +30,7 @@ class TestQwen2VLServer(TestOpenAIVisionServer): ...@@ -30,7 +30,7 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
api_key=cls.api_key, api_key=cls.api_key,
other_args=[ other_args=[
"--mem-fraction-static", "--mem-fraction-static",
"0.4", "0.35",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -52,7 +52,7 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer): ...@@ -52,7 +52,7 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
api_key=cls.api_key, api_key=cls.api_key,
other_args=[ other_args=[
"--mem-fraction-static", "--mem-fraction-static",
"0.4", "0.35",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -75,7 +75,7 @@ class TestVLMContextLengthIssue(CustomTestCase): ...@@ -75,7 +75,7 @@ class TestVLMContextLengthIssue(CustomTestCase):
other_args=[ other_args=[
"--context-length", "--context-length",
"300", "300",
"--mem-fraction-static=0.80", "--mem-fraction-static=0.75",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -147,7 +147,7 @@ class TestMinicpmvServer(TestOpenAIVisionServer): ...@@ -147,7 +147,7 @@ class TestMinicpmvServer(TestOpenAIVisionServer):
other_args=[ other_args=[
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.4", "0.35",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -181,7 +181,7 @@ class TestMinicpmoServer(TestOpenAIVisionServer): ...@@ -181,7 +181,7 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
other_args=[ other_args=[
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.7", "0.65",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
......
...@@ -22,7 +22,7 @@ class TestPixtralServer(TestOpenAIVisionServer): ...@@ -22,7 +22,7 @@ class TestPixtralServer(TestOpenAIVisionServer):
other_args=[ other_args=[
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.73", "0.70",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -44,7 +44,7 @@ class TestMistral3_1Server(TestOpenAIVisionServer): ...@@ -44,7 +44,7 @@ class TestMistral3_1Server(TestOpenAIVisionServer):
other_args=[ other_args=[
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.8", "0.75",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -88,7 +88,7 @@ class TestJanusProServer(TestOpenAIVisionServer): ...@@ -88,7 +88,7 @@ class TestJanusProServer(TestOpenAIVisionServer):
other_args=[ other_args=[
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.4", "0.35",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -197,7 +197,7 @@ class TestPhi4MMServer(TestOpenAIVisionServer): ...@@ -197,7 +197,7 @@ class TestPhi4MMServer(TestOpenAIVisionServer):
other_args=[ other_args=[
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.75", "0.70",
"--disable-radix-cache", "--disable-radix-cache",
"--max-loras-per-batch", "--max-loras-per-batch",
"1", "1",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment