Unverified commit 42f34437, authored by pranavm-nvidia and committed by GitHub
Browse files

Adds initialize_moe_config to bench_one_batch so MOE backend is respected (#9670)

parent 5c34b4f1
......@@ -61,6 +61,7 @@ from sglang.srt.configs.model_config import ModelConfig
from sglang.srt.distributed.parallel_state import destroy_distributed_environment
from sglang.srt.entrypoints.engine import _set_envs_and_config
from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.layers.moe import initialize_moe_config
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
from sglang.srt.managers.scheduler import Scheduler
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
......@@ -509,6 +510,8 @@ def latency_test(
bench_args,
tp_rank,
):
initialize_moe_config(server_args)
# Set CPU affinity
if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment