Unverified commit 43986372, authored by Jason Zhou, committed by GitHub
Browse files

feat: DynamoPlanner profiler to use hf_id for AIConfigurator 0.4.0 (#4167)


Signed-off-by: Jason Zhou <jasonzho@jasonzho-mlt.client.nvidia.com>
Signed-off-by: Jason Zhou <jasonzho@nvidia.com>
Co-authored-by: Jason Zhou <jasonzho@jasonzho-mlt.client.nvidia.com>
parent 7afb5431
...@@ -441,7 +441,7 @@ License: `Apache` ...@@ -441,7 +441,7 @@ License: `Apache`
- `Homepage`: https://github.com/huggingface/accelerate - `Homepage`: https://github.com/huggingface/accelerate
   
   
## aiconfigurator (0.2.0) ## aiconfigurator (0.4.0)
   
### Licenses ### Licenses
License: `Apache-2.0` License: `Apache-2.0`
...@@ -19,7 +19,7 @@ spec: ...@@ -19,7 +19,7 @@ spec:
# AI Configurator mode (fast simulation-based profiling) # AI Configurator mode (fast simulation-based profiling)
use_ai_configurator: true use_ai_configurator: true
aic_system: h200_sxm aic_system: h200_sxm
aic_model_name: QWEN3_32B aic_hf_id: Qwen/Qwen3-32B
aic_backend_version: "0.20.0" aic_backend_version: "0.20.0"
# SLA targets for profiling # SLA targets for profiling
......
...@@ -143,9 +143,6 @@ async def run_profile(args): ...@@ -143,9 +143,6 @@ async def run_profile(args):
assert args.backend in [ assert args.backend in [
"sglang" "sglang"
], "MoE model support is only available for SGLang" ], "MoE model support is only available for SGLang"
assert (
not args.use_ai_configurator
), "MoE model is not supported in ai-configurator"
else: else:
logger.info( logger.info(
"Dense model profiling, sweeping TP size for prefill and decode" "Dense model profiling, sweeping TP size for prefill and decode"
...@@ -204,26 +201,30 @@ async def run_profile(args): ...@@ -204,26 +201,30 @@ async def run_profile(args):
raise ValueError( raise ValueError(
"Must provide --aic-system when using --use-ai-configurator." "Must provide --aic-system when using --use-ai-configurator."
) )
if not args.aic_model_name:
raise ValueError( # Fallback to args.model if aic_hf_id is not provided
"Must provide --aic-model-name when using --use-ai-configurator." if not args.aic_hf_id:
if args.model:
logger.info(
f"--aic-hf-id not provided, using --model ({args.model}) as HuggingFace ID for AI configurator"
) )
if not args.aic_backend_version: args.aic_hf_id = args.model
else:
raise ValueError( raise ValueError(
"Must provide --aic-backend-version when using --use-ai-configurator." "Must provide --aic-hf-id or --model when using --use-ai-configurator."
) )
logger.info("Using aiconfigurator to estimate performance...") logger.info("Using aiconfigurator to estimate performance...")
ai_configurator_perf_estimator = AIConfiguratorPerfEstimator( ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
args.aic_model_name, args.aic_hf_id,
args.aic_system.lower(), args.aic_system.lower(),
args.aic_backend, args.aic_backend,
args.aic_backend_version, args.aic_backend_version,
) )
else: else:
if args.aic_system or args.aic_model_name or args.aic_backend_version: if args.aic_system or args.aic_hf_id or args.aic_backend_version:
logger.warning( logger.warning(
"Ignoring --aic-system, --aic-model-name, and/or --backend-version " "Ignoring --aic-system, --aic-hf-id, and/or --backend-version "
"when not using --use-ai-configurator." "when not using --use-ai-configurator."
) )
......
...@@ -36,7 +36,7 @@ class AIConfiguratorPerfEstimator: ...@@ -36,7 +36,7 @@ class AIConfiguratorPerfEstimator:
def __init__( def __init__(
self, self,
model_name: str, # e.g. "QWEN3_32B" hf_id: str, # e.g. "Qwen/Qwen3-32B"
system: str, # e.g. "h200_sxm" system: str, # e.g. "h200_sxm"
backend: str, # e.g. "trtllm" backend: str, # e.g. "trtllm"
version: str, # e.g. "0.20.0" version: str, # e.g. "0.20.0"
...@@ -44,6 +44,11 @@ class AIConfiguratorPerfEstimator: ...@@ -44,6 +44,11 @@ class AIConfiguratorPerfEstimator:
aiconfigurator = _try_import_aiconfigurator() aiconfigurator = _try_import_aiconfigurator()
logger.info("Loading aiconfigurator database. This might take a few seconds...") logger.info("Loading aiconfigurator database. This might take a few seconds...")
if not version:
version = aiconfigurator.sdk.perf_database.get_latest_database_version(
system,
backend,
)
self.database = aiconfigurator.sdk.perf_database.get_database( self.database = aiconfigurator.sdk.perf_database.get_database(
system=system, system=system,
backend=backend, backend=backend,
...@@ -56,10 +61,7 @@ class AIConfiguratorPerfEstimator: ...@@ -56,10 +61,7 @@ class AIConfiguratorPerfEstimator:
logger.info("aiconfigurator database loaded.") logger.info("aiconfigurator database loaded.")
self.backend = aiconfigurator.sdk.backends.factory.get_backend(backend) self.backend = aiconfigurator.sdk.backends.factory.get_backend(backend)
self.hf_id = hf_id
# This is the aiconfigurator model name (such as QWEN3_32B or DEEPSEEK_V3)
# rather than the HF model name.
self.model_name = model_name
def _get_model(self, **model_config_kwargs): def _get_model(self, **model_config_kwargs):
aiconfigurator = _try_import_aiconfigurator() aiconfigurator = _try_import_aiconfigurator()
...@@ -67,7 +69,7 @@ class AIConfiguratorPerfEstimator: ...@@ -67,7 +69,7 @@ class AIConfiguratorPerfEstimator:
# NOTE: MOE models error out unless moe_tp_size and moe_ep_size are provided. # NOTE: MOE models error out unless moe_tp_size and moe_ep_size are provided.
model_config = aiconfigurator.sdk.config.ModelConfig(**model_config_kwargs) model_config = aiconfigurator.sdk.config.ModelConfig(**model_config_kwargs)
model = aiconfigurator.sdk.models.get_model( model = aiconfigurator.sdk.models.get_model(
self.model_name, model_config, self.backend self.hf_id, model_config, self.backend
) )
return model return model
......
...@@ -80,7 +80,7 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -80,7 +80,7 @@ def create_profiler_parser() -> argparse.Namespace:
decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6) decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False) use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
aic_system: String (target system for use with aiconfigurator, default: None) aic_system: String (target system for use with aiconfigurator, default: None)
aic_model_name: String (aiconfigurator name of the target model, default: None) aic_hf_id: String (aiconfigurator huggingface id of the target model, default: None)
aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "") aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None) aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
dry_run: Boolean (dry run the profile job, default: False) dry_run: Boolean (dry run the profile job, default: False)
...@@ -260,10 +260,10 @@ def create_profiler_parser() -> argparse.Namespace: ...@@ -260,10 +260,10 @@ def create_profiler_parser() -> argparse.Namespace:
help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)", help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
) )
parser.add_argument( parser.add_argument(
"--aic-model-name", "--aic-hf-id",
type=str, type=str,
default=config.get("sweep", {}).get("aic_model_name"), default=config.get("sweep", {}).get("aic_hf_id"),
help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)", help="aiconfigurator name of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
) )
parser.add_argument( parser.add_argument(
"--aic-backend", "--aic-backend",
......
...@@ -40,7 +40,7 @@ classifiers = [ ...@@ -40,7 +40,7 @@ classifiers = [
] ]
dependencies = [ dependencies = [
"aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@11b6d821f1fbb34300bb0ed4945f647e89fb411a", "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759",
"networkx", "networkx",
"pandas", "pandas",
"pydantic>=2", "pydantic>=2",
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
accelerate==1.6.0 accelerate==1.6.0
aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@11b6d821f1fbb34300bb0ed4945f647e89fb411a aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759
aiofiles aiofiles
aiperf @ git+https://github.com/ai-dynamo/aiperf.git@16dad7c02fcd959ba96823d7bfe7e681e5d5b41d aiperf @ git+https://github.com/ai-dynamo/aiperf.git@16dad7c02fcd959ba96823d7bfe7e681e5d5b41d
av==15.0.0 av==15.0.0
......
...@@ -53,7 +53,7 @@ spec: ...@@ -53,7 +53,7 @@ spec:
# AI Configurator mode (fast simulation-based profiling, 20-30 seconds) # AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
use_ai_configurator: false # Set to false for online profiling (2-4 hours) use_ai_configurator: false # Set to false for online profiling (2-4 hours)
aic_system: h200_sxm # Target GPU system for AI Configurator aic_system: h200_sxm # Target GPU system for AI Configurator
aic_model_name: QWEN3_0.6B # Model name for AI Configurator aic_hf_id: Qwen/Qwen3-0.6B # HuggingFace model ID for AI Configurator
aic_backend_version: "0.20.0" # Backend version for AI Configurator aic_backend_version: "0.20.0" # Backend version for AI Configurator
# SLA targets for profiling # SLA targets for profiling
......
...@@ -348,7 +348,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() { ...@@ -348,7 +348,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
"sweep": map[string]interface{}{ "sweep": map[string]interface{}{
"use_ai_configurator": true, "use_ai_configurator": true,
"aic_system": "h200_sxm", "aic_system": "h200_sxm",
"aic_model_name": "QWEN3_32B", "aic_hf_id": "Qwen/Qwen3-32B",
"aic_backend_version": "0.20.0", "aic_backend_version": "0.20.0",
}, },
}), }),
...@@ -1058,7 +1058,7 @@ var _ = Describe("DGDR Profiler Arguments", func() { ...@@ -1058,7 +1058,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
"sweep": map[string]interface{}{ "sweep": map[string]interface{}{
"use_ai_configurator": true, "use_ai_configurator": true,
"aic_system": "h200_sxm", "aic_system": "h200_sxm",
"aic_model_name": "QWEN3_32B", "aic_hf_id": "Qwen/Qwen3-32B",
"aic_backend_version": "0.20.0", "aic_backend_version": "0.20.0",
}, },
}), }),
......
...@@ -303,17 +303,12 @@ profilingConfig: ...@@ -303,17 +303,12 @@ profilingConfig:
sweep: sweep:
use_ai_configurator: true use_ai_configurator: true
aic_system: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm aic_system: h200_sxm # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
aic_model_name: QWEN3_32B # AIC model identifier (see supported list) aic_hf_id: Qwen/Qwen3-32B # Huggingface model id
aic_backend_version: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3, 1.0.0rc6 aic_backend_version: "0.20.0" # TensorRT-LLM version: 0.20.0, 1.0.0rc3
``` ```
**Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features) **Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features)
**Model name mapping examples:**
- `Qwen/Qwen3-32B` → `QWEN3_32B`
- `meta-llama/Llama-3.1-70B` → `LLAMA3.1_70B`
- `deepseek-ai/DeepSeek-V3` → `DEEPSEEK_V3`
### Planner Configuration (Optional) ### Planner Configuration (Optional)
Pass arguments to the SLA planner: Pass arguments to the SLA planner:
......
...@@ -229,7 +229,7 @@ sweep: ...@@ -229,7 +229,7 @@ sweep:
sweep: sweep:
use_ai_configurator: true use_ai_configurator: true
aic_system: h200_sxm aic_system: h200_sxm
aic_model_name: QWEN3_32B aic_hf_id: Qwen/Qwen3-32B
aic_backend_version: "0.20.0" aic_backend_version: "0.20.0"
``` ```
......
...@@ -60,9 +60,9 @@ class TestProfileSlaAiconfigurator: ...@@ -60,9 +60,9 @@ class TestProfileSlaAiconfigurator:
self.dry_run = False self.dry_run = False
self.use_ai_configurator = True self.use_ai_configurator = True
self.aic_system = "h200_sxm" self.aic_system = "h200_sxm"
self.aic_model_name = "QWEN3_32B" self.aic_hf_id = "Qwen/Qwen3-32B"
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = "0.20.0" self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
self.deploy_after_profile = False self.deploy_after_profile = False
# Provide minimal model_info to avoid HF queries # Provide minimal model_info to avoid HF queries
...@@ -77,11 +77,10 @@ class TestProfileSlaAiconfigurator: ...@@ -77,11 +77,10 @@ class TestProfileSlaAiconfigurator:
@pytest.mark.pre_merge @pytest.mark.pre_merge
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
"missing_arg", ["aic_system", "aic_model_name", "aic_backend_version"]
)
async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg): async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg):
# Check that validation error happens when a required arg is missing. # Check that validation error happens when a required arg is missing.
# Note: aic_backend_version is optional - when None, auto-detects latest version
setattr(trtllm_args, missing_arg, None) setattr(trtllm_args, missing_arg, None)
with pytest.raises(ValueError): with pytest.raises(ValueError):
await run_profile(trtllm_args) await run_profile(trtllm_args)
...@@ -113,16 +112,23 @@ class TestProfileSlaAiconfigurator: ...@@ -113,16 +112,23 @@ class TestProfileSlaAiconfigurator:
@pytest.mark.parametrize( @pytest.mark.parametrize(
"backend, aic_backend_version", "backend, aic_backend_version",
[ [
("trtllm", None),
("trtllm", "0.20.0"), ("trtllm", "0.20.0"),
("trtllm", "1.0.0rc3"), ("trtllm", "1.0.0rc3"),
], ],
) )
@pytest.mark.parametrize("model_name", ["QWEN3_32B", "LLAMA3.1_405B"]) @pytest.mark.parametrize(
"hf_model_id",
[
"Qwen/Qwen3-32B",
"meta-llama/Llama-3.1-405B",
],
)
async def test_trtllm_aiconfigurator_many( async def test_trtllm_aiconfigurator_many(
self, trtllm_args, model_name, backend, aic_backend_version self, trtllm_args, hf_model_id, backend, aic_backend_version
): ):
# Test that profile_sla works with a variety of backend versions and model names. # Test that profile_sla works with a variety of backend versions and model names.
trtllm_args.aic_model_name = model_name trtllm_args.aic_hf_id = hf_model_id
trtllm_args.backend = backend trtllm_args.backend = backend
trtllm_args.aic_backend_version = aic_backend_version trtllm_args.aic_backend_version = aic_backend_version
await run_profile(trtllm_args) await run_profile(trtllm_args)
...@@ -67,7 +67,7 @@ class TestProfileSLADryRun: ...@@ -67,7 +67,7 @@ class TestProfileSLADryRun:
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_hf_id = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
...@@ -109,7 +109,7 @@ class TestProfileSLADryRun: ...@@ -109,7 +109,7 @@ class TestProfileSLADryRun:
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_hf_id = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
...@@ -164,7 +164,7 @@ class TestProfileSLADryRun: ...@@ -164,7 +164,7 @@ class TestProfileSLADryRun:
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_hf_id = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
...@@ -212,7 +212,7 @@ class TestProfileSLADryRun: ...@@ -212,7 +212,7 @@ class TestProfileSLADryRun:
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_hf_id = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 8 self.num_gpus_per_node = 8
...@@ -282,7 +282,7 @@ class TestProfileSLADryRun: ...@@ -282,7 +282,7 @@ class TestProfileSLADryRun:
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_hf_id = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
# Set to 0 to trigger auto-generation path # Set to 0 to trigger auto-generation path
...@@ -345,7 +345,7 @@ class TestProfileSLADryRun: ...@@ -345,7 +345,7 @@ class TestProfileSLADryRun:
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_hf_id = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 0 self.num_gpus_per_node = 0
...@@ -407,7 +407,7 @@ class TestProfileSLADryRun: ...@@ -407,7 +407,7 @@ class TestProfileSLADryRun:
self.dry_run = True self.dry_run = True
self.use_ai_configurator = False self.use_ai_configurator = False
self.aic_system = None self.aic_system = None
self.aic_model_name = None self.aic_hf_id = None
self.aic_backend = "" self.aic_backend = ""
self.aic_backend_version = None self.aic_backend_version = None
self.num_gpus_per_node = 0 self.num_gpus_per_node = 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment