Unverified commit 7750ed1a, authored by Hongkuan Zhou, committed by GitHub
Browse files

feat: add parallelization filters (#4144)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Signed-off-by: Hongkuan Zhou <tedzhouhk@gmail.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent e1b0edb9
...@@ -19,6 +19,7 @@ project_root = Path(__file__).parent.parent.parent ...@@ -19,6 +19,7 @@ project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root)) sys.path.insert(0, str(project_root))
from benchmarks.profiler.profile_sla import run_profile # noqa: E402 from benchmarks.profiler.profile_sla import run_profile # noqa: E402
from benchmarks.profiler.utils.model_info import ModelInfo # noqa: E402
from benchmarks.profiler.utils.search_space_autogen import ( # noqa: E402 from benchmarks.profiler.utils.search_space_autogen import ( # noqa: E402
auto_generate_search_space, auto_generate_search_space,
) )
...@@ -63,7 +64,6 @@ class TestProfileSLADryRun: ...@@ -63,7 +64,6 @@ class TestProfileSLADryRun:
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = False
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
...@@ -72,6 +72,13 @@ class TestProfileSLADryRun: ...@@ -72,6 +72,13 @@ class TestProfileSLADryRun:
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
self.deploy_after_profile = False self.deploy_after_profile = False
# Provide minimal model_info to avoid HF queries
self.model_info = ModelInfo(
model_size=16384.0,
architecture="TestArchitecture",
is_moe=False,
max_context_length=self.max_context_length,
)
return Args() return Args()
...@@ -99,7 +106,6 @@ class TestProfileSLADryRun: ...@@ -99,7 +106,6 @@ class TestProfileSLADryRun:
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = False
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
...@@ -108,6 +114,12 @@ class TestProfileSLADryRun: ...@@ -108,6 +114,12 @@ class TestProfileSLADryRun:
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
self.deploy_after_profile = False self.deploy_after_profile = False
self.model_info = ModelInfo(
model_size=16384.0,
architecture="TestArchitecture",
is_moe=False,
max_context_length=self.max_context_length,
)
return Args() return Args()
...@@ -149,7 +161,6 @@ class TestProfileSLADryRun: ...@@ -149,7 +161,6 @@ class TestProfileSLADryRun:
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = False
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
...@@ -158,6 +169,12 @@ class TestProfileSLADryRun: ...@@ -158,6 +169,12 @@ class TestProfileSLADryRun:
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
self.deploy_after_profile = False self.deploy_after_profile = False
self.model_info = ModelInfo(
model_size=16384.0,
architecture="TestArchitecture",
is_moe=False,
max_context_length=self.max_context_length,
)
return Args() return Args()
...@@ -192,7 +209,6 @@ class TestProfileSLADryRun: ...@@ -192,7 +209,6 @@ class TestProfileSLADryRun:
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = True
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
...@@ -201,6 +217,13 @@ class TestProfileSLADryRun: ...@@ -201,6 +217,13 @@ class TestProfileSLADryRun:
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
self.deploy_after_profile = False self.deploy_after_profile = False
self.model_info = ModelInfo(
model_size=65536.0,
architecture="TestMoEArchitecture",
is_moe=True,
max_context_length=self.max_context_length,
num_experts=16,
)
return Args() return Args()
...@@ -224,11 +247,12 @@ class TestProfileSLADryRun: ...@@ -224,11 +247,12 @@ class TestProfileSLADryRun:
@pytest.fixture @pytest.fixture
def mock_model_info(self): def mock_model_info(self):
"""Mock model info for DeepSeek-R1-Distill-Llama-8B.""" """Mock model info for DeepSeek-R1-Distill-Llama-8B."""
return { return ModelInfo(
"model_size": 16384, # 16GB model in MiB model_size=16384.0, # 16GB model in MiB
"is_moe": False, architecture="LlamaForCausalLM",
"max_context_length": 16384, # 16K tokens is_moe=False,
} max_context_length=16384,
)
@pytest.fixture @pytest.fixture
def vllm_args_with_model_autogen(self): def vllm_args_with_model_autogen(self):
...@@ -242,12 +266,9 @@ class TestProfileSLADryRun: ...@@ -242,12 +266,9 @@ class TestProfileSLADryRun:
self.namespace = "test-namespace" self.namespace = "test-namespace"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = ( # Set to 0 to trigger auto-generation path
1 # Will be overridden by auto-generation self.min_num_gpus_per_engine = 0
) self.max_num_gpus_per_engine = 0
self.max_num_gpus_per_engine = (
8 # Will be overridden by auto-generation
)
self.skip_existing_results = False self.skip_existing_results = False
self.force_rerun = False self.force_rerun = False
self.isl = 3000 self.isl = 3000
...@@ -258,15 +279,16 @@ class TestProfileSLADryRun: ...@@ -258,15 +279,16 @@ class TestProfileSLADryRun:
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = False
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_model_name = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 # Will be overridden by auto-generation # Set to 0 to trigger auto-generation path
self.num_gpus_per_node = 0
self.deploy_after_profile = False self.deploy_after_profile = False
self.enable_gpu_discovery = True
return Args() return Args()
...@@ -308,12 +330,8 @@ class TestProfileSLADryRun: ...@@ -308,12 +330,8 @@ class TestProfileSLADryRun:
self.namespace = "test-namespace" self.namespace = "test-namespace"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = ( self.min_num_gpus_per_engine = 0
1 # Will be overridden by auto-generation self.max_num_gpus_per_engine = 0
)
self.max_num_gpus_per_engine = (
8 # Will be overridden by auto-generation
)
self.skip_existing_results = False self.skip_existing_results = False
self.force_rerun = False self.force_rerun = False
self.isl = 3000 self.isl = 3000
...@@ -324,15 +342,15 @@ class TestProfileSLADryRun: ...@@ -324,15 +342,15 @@ class TestProfileSLADryRun:
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = False
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_model_name = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 # Will be overridden by auto-generation self.num_gpus_per_node = 0
self.deploy_after_profile = False self.deploy_after_profile = False
self.enable_gpu_discovery = True
return Args() return Args()
...@@ -374,12 +392,8 @@ class TestProfileSLADryRun: ...@@ -374,12 +392,8 @@ class TestProfileSLADryRun:
self.namespace = "test-namespace" self.namespace = "test-namespace"
self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen self.model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" # Specify model for autogen
self.dgd_image = "" self.dgd_image = ""
self.min_num_gpus_per_engine = ( self.min_num_gpus_per_engine = 0
1 # Will be overridden by auto-generation self.max_num_gpus_per_engine = 0
)
self.max_num_gpus_per_engine = (
8 # Will be overridden by auto-generation
)
self.skip_existing_results = False self.skip_existing_results = False
self.force_rerun = False self.force_rerun = False
self.isl = 3000 self.isl = 3000
...@@ -390,15 +404,15 @@ class TestProfileSLADryRun: ...@@ -390,15 +404,15 @@ class TestProfileSLADryRun:
self.prefill_interpolation_granularity = 16 self.prefill_interpolation_granularity = 16
self.decode_interpolation_granularity = 6 self.decode_interpolation_granularity = 6
self.service_name = "" self.service_name = ""
self.is_moe_model = False
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_model_name = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 # Will be overridden by auto-generation self.num_gpus_per_node = 0
self.deploy_after_profile = False self.deploy_after_profile = False
self.enable_gpu_discovery = True
return Args() return Args()
......
...@@ -70,7 +70,7 @@ sglang_configs = { ...@@ -70,7 +70,7 @@ sglang_configs = {
name="disaggregated_same_gpu", name="disaggregated_same_gpu",
directory=sglang_dir, directory=sglang_dir,
script_name="disagg_same_gpu.sh", script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1], marks=[pytest.mark.gpu_1, pytest.mark.skip(reason="unstable")],
model="Qwen/Qwen3-0.6B", model="Qwen/Qwen3-0.6B",
env={}, env={},
models_port=8000, models_port=8000,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment