Unverified Commit e1cd7a5f, authored by Michael Goin, committed by GitHub
Browse files

[Bugfix] Add init_workspace_manager to moe kernel benchmarks (#31042)


Signed-off-by: mgoin <mgoin64@gmail.com>
parent a68e703c
...@@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 ...@@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager
# Weight shapes for different models: [num_experts, topk, hidden_size, # Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size] # intermediate_size]
...@@ -297,6 +298,10 @@ def bench_run( ...@@ -297,6 +298,10 @@ def bench_run(
def main(args): def main(args):
# Initialize workspace manager (required for CUTLASS MoE kernels)
device = torch.device("cuda:0")
init_workspace_manager(device)
print("Benchmarking models:") print("Benchmarking models:")
for i, model in enumerate(args.models): for i, model in enumerate(args.models):
print(f"[{i}] {model}") print(f"[{i}] {model}")
......
...@@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 ...@@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.scalar_type import scalar_types from vllm.scalar_type import scalar_types
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager
WEIGHT_SHAPES_MOE = { WEIGHT_SHAPES_MOE = {
"nvidia/DeepSeek-R1-FP4": [ "nvidia/DeepSeek-R1-FP4": [
...@@ -441,6 +442,10 @@ def bench_run( ...@@ -441,6 +442,10 @@ def bench_run(
def main(args): def main(args):
# Initialize workspace manager (required for CUTLASS MoE kernels)
device = torch.device("cuda:0")
init_workspace_manager(device)
print("Benchmarking models:") print("Benchmarking models:")
for i, model in enumerate(args.models): for i, model in enumerate(args.models):
print(f"[{i}] {model}") print(f"[{i}] {model}")
......
...@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( ...@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk, fused_topk,
) )
from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm.v1.worker.workspace import init_workspace_manager
DEFAULT_MODELS = [ DEFAULT_MODELS = [
"mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mixtral-8x7B-Instruct-v0.1",
...@@ -364,6 +365,10 @@ def bench_run( ...@@ -364,6 +365,10 @@ def bench_run(
def main(args): def main(args):
# Initialize workspace manager (required for CUTLASS MoE kernels)
device = torch.device("cuda:0")
init_workspace_manager(device)
print("Benchmarking models:") print("Benchmarking models:")
for i, model in enumerate(args.models): for i, model in enumerate(args.models):
print(f"[{i}] {model}") print(f"[{i}] {model}")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment