feat: grab num_experts info from model info if possible (#4060)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>

feat: grab num_experts info from model info if possible (#4060)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Hongkuan Zhou <tedzhouhk@gmail.com>
f31a1dad · hhzhang16 · GitHub · 92909c8a · f31a1dad · f31a1dad
Unverified Commit f31a1dad authored Nov 03, 2025 by hhzhang16 Committed by GitHub Nov 03, 2025
3 changed files
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -97,25 +97,35 @@ async def run_profile(args):
            config = config_modifier.update_image(config, args.dgd_image)
            logger.info(f"Using DGD image: {args.dgd_image}")
+        profile_num_gpus = [
+            2**i
+            for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1)
+            if args.min_num_gpus_per_engine <= 2**i <= args.max_num_gpus_per_engine
+        ]
        if args.is_moe_model:
-            # For MoE models, use range with stride of num_gpus_per_node
+            # Filter GPU counts to only include divisors of num_experts
-            profile_num_gpus = list(
+            if hasattr(args, "num_experts") and args.num_experts is not None:
-                range(
+                original_counts = profile_num_gpus.copy()
-                    args.min_num_gpus_per_engine,
+                profile_num_gpus = [
-                    args.max_num_gpus_per_engine + 1,
+                    gpu_count
-                    args.num_gpus_per_node,
+                    for gpu_count in profile_num_gpus
-                )
+                    if args.num_experts % gpu_count == 0
-            )
+                ]
+                if not profile_num_gpus:
+                    error_msg = (
+                        f"No valid GPU counts found that divide evenly into num_experts={args.num_experts}. "
+                        f"Original candidates were {original_counts}. "
+                        f"Valid divisors in range would be: {[d for d in range(args.min_num_gpus_per_engine, args.max_num_gpus_per_engine + 1) if args.num_experts % d == 0]}"
+                    )
+                    logger.error(error_msg)
+                    raise ValueError(error_msg)
+                if len(profile_num_gpus) < len(original_counts):
+                    logger.info(
+                        f"Filtered GPU counts from {original_counts} to {profile_num_gpus} "
+                        f"(only divisors of num_experts={args.num_experts})"
+                    )
            logger.info(f"Profiling MoE GPU counts (TEP/DEP): {profile_num_gpus}")
        else:
-            # For dense models, use powers of 2
-            profile_num_gpus = [
-                2**i
-                for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1)
-                if args.min_num_gpus_per_engine
-                <= 2**i
-                <= args.max_num_gpus_per_engine
-            ]
            logger.info(f"Profiling dense model GPU counts (TP): {profile_num_gpus}")
        os.makedirs(args.output_dir, exist_ok=True)

--- a/benchmarks/profiler/utils/model_info.py
+++ b/benchmarks/profiler/utils/model_info.py
@@ -129,10 +129,27 @@ def get_model_info(
                max_context_length = value
                break
+    # Detect number of experts for MoE models
+    # Different models use different attribute names
+    num_experts = None
+    if config.is_moe:
+        expert_attrs = [
+            "n_routed_experts",  # DeepSeek V3/R1
+            "num_local_experts",  # Mixtral, Qwen
+            "num_experts",  # Generic
+        ]
+        for attr in expert_attrs:
+            if hasattr(config, attr):
+                value = getattr(config, attr)
+                if value is not None:
+                    num_experts = value
+                    break
    return {
        "model_size": model_size,
        "is_moe": config.is_moe,
        "max_context_length": max_context_length,
+        "num_experts": num_experts,
    }

--- a/benchmarks/profiler/utils/search_space_autogen.py
+++ b/benchmarks/profiler/utils/search_space_autogen.py
@@ -60,8 +60,13 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
        model_info = get_model_info(args.model)
        gpu_info = get_gpu_summary()
+        num_experts_str = (
+            f", num_experts={model_info['num_experts']}"
+            if model_info.get("num_experts")
+            else ""
+        )
        logger.info(
-            f"Model {args.model} has size {model_info['model_size']}, is_moe={model_info['is_moe']}, and max_context_length={model_info['max_context_length']}"
+            f"Model {args.model} has size {model_info['model_size']}, is_moe={model_info['is_moe']}, and max_context_length={model_info['max_context_length']}{num_experts_str}"
        )
        logger.info(
            f"Cluster has {gpu_info['gpus_per_node']}x{gpu_info['model']} GPUs per node with {gpu_info['vram']} VRAM"
@@ -88,5 +93,6 @@ def auto_generate_search_space(args: argparse.Namespace) -> None:
        args.is_moe_model = model_info["is_moe"]  # type: ignore[assignment]
        args.max_context_length = model_info["max_context_length"]  # type: ignore[assignment]
        args.num_gpus_per_node = gpu_info["gpus_per_node"]  # type: ignore[assignment]
+        args.num_experts = model_info.get("num_experts")  # type: ignore[assignment]
    return