"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "65f9439b569b1e7e2854cfa0274ee3f7f50b43a0"
Unverified Commit 621e96bf, authored by Lianmin Zheng, committed by GitHub

[CI] Fix ci tests (#5769)

parent 35ca04d2
@@ -57,6 +57,7 @@ import torch
 import torch.distributed as dist
 from sglang.srt.configs.model_config import ModelConfig
+from sglang.srt.distributed.parallel_state import destroy_distributed_environment
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
@@ -502,8 +503,13 @@ def latency_test(
             for result in result_list:
                 fout.write(json.dumps(result) + "\n")
+    if server_args.tp_size > 1:
+        destroy_distributed_environment()
 def main(server_args, bench_args):
+    server_args.cuda_graph_max_bs = max(bench_args.batch_size)
     _set_envs_and_config(server_args)
     if server_args.model_path:
...
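The two additions above let multi-GPU latency runs exit cleanly: the benchmark tears down the distributed environment once results are written, and main() sizes the CUDA-graph batch budget from the batch sizes actually being benchmarked. As an illustration of the teardown half, here is a minimal sketch using plain torch.distributed; sglang's destroy_distributed_environment presumably wraps similar calls, so treat the body as an assumption, not the library's implementation.

import torch.distributed as dist

def finish_benchmark(tp_size: int) -> None:
    # After the last result is written, release the process-group resources so
    # repeated CI invocations in the same job do not leak NCCL/Gloo state.
    if tp_size > 1 and dist.is_initialized():
        dist.barrier()                # let slower ranks finish writing
        dist.destroy_process_group()  # tear down the default group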
@@ -8,6 +8,7 @@ from typing import Callable, Optional
 import torch
 from torch.nn import functional as F
+from sglang.srt.layers.activation import GeluAndMul, SiluAndMul
 from sglang.srt.layers.moe.topk import select_experts
@@ -30,7 +31,7 @@ def fused_moe_forward_native(
 ) -> torch.Tensor:
     if apply_router_weight_on_input:
-        raise NotImplementedError
+        raise NotImplementedError()
     topk_weights, topk_ids = select_experts(
         hidden_states=x,
@@ -75,9 +76,6 @@ def moe_forward_native(
     activation: str = "silu",
     routed_scaling_factor: Optional[float] = None,
 ) -> torch.Tensor:
-    from sglang.srt.layers.activation import GeluAndMul, SiluAndMul
     topk_weights, topk_ids = select_experts(
         hidden_states=x,
         router_logits=router_logits,
...
@@ -13,7 +13,16 @@ import triton
 import triton.language as tl
 from sglang.srt.layers.moe.topk import select_experts
-from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_kernel import (
+    per_token_group_quant_fp8,
+    scaled_fp8_quant,
+    sglang_per_token_group_quant_fp8,
+)
+from sglang.srt.layers.quantization.int8_kernel import (
+    per_token_group_quant_int8,
+    per_token_quant_int8,
+    sglang_per_token_group_quant_int8,
+)
 from sglang.srt.utils import (
     direct_register_custom_op,
     get_bool_env_var,
@@ -746,21 +755,6 @@ def invoke_fused_moe_kernel(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
 ) -> None:
-    from sglang.srt.layers.quantization.int8_kernel import (
-        per_token_group_quant_int8,
-        per_token_quant_int8,
-    )
-    if _is_cuda:
-        from sglang.srt.layers.quantization.fp8_kernel import (
-            sglang_per_token_group_quant_fp8,
-        )
-        from sglang.srt.layers.quantization.int8_kernel import (
-            sglang_per_token_group_quant_int8,
-        )
-    else:
-        from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
     assert topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
...
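Both MoE changes above are the same refactor: quantization helpers that used to be imported inside the hot functions (including an _is_cuda branch re-evaluated on every call of invoke_fused_moe_kernel) are now imported once at module scope. The pattern in isolation, with made-up names purely for illustration:

import torch

# Resolve the backend once at import time instead of on every call.
_IS_CUDA = torch.cuda.is_available()

if _IS_CUDA:
    def _quantize(x: torch.Tensor) -> torch.Tensor:
        # stand-in for a CUDA-specific quantization kernel
        return x.half().cuda()
else:
    def _quantize(x: torch.Tensor) -> torch.Tensor:
        # stand-in for the portable fallback path
        return x.half()

def invoke_kernel(x: torch.Tensor) -> torch.Tensor:
    # No per-call imports or backend branching left in the hot path.
    return _quantize(x)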
@@ -91,11 +91,14 @@ from sglang.srt.utils import (
     set_cuda_arch,
 )
-logger = logging.getLogger(__name__)
+# Use a small KV cache pool size for tests in CI
 SGLANG_CI_SMALL_KV_SIZE = os.getenv("SGLANG_CI_SMALL_KV_SIZE", None)
+# Detect stragger ranks in model loading
 UNBALANCED_MODEL_LOADING_TIMEOUT_S = 300
+logger = logging.getLogger(__name__)
 class ModelRunner:
     """ModelRunner runs the forward passes of the models."""
@@ -177,7 +180,7 @@ class ModelRunner:
         if _ENABLE_JIT_DEEPGEMM:
             update_deep_gemm_config(gpu_id, server_args)
-        # If it is a draft model tp_group can be different.
+        # If it is a draft model, tp_group can be different
         self.initialize(min_per_gpu_memory)
     def initialize(self, min_per_gpu_memory: float):
@@ -230,7 +233,8 @@ class ModelRunner:
         if server_args.attention_backend is None:
             """
-            We auto select the fastest attention backend according to the current offering
+            Auto select the fastest attention backend.
             1. Models with MHA Architecture (e.g: Llama, QWen)
                 1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
                 1.2 In other cases, we will use flashinfer if available, otherwise use triton.
@@ -240,6 +244,7 @@ class ModelRunner:
             """
             if not self.use_mla_backend:
+                # MHA architecture
                 if (
                     is_hopper_with_cuda_12_3()
                     and is_no_spec_infer_or_topk_one(server_args)
@@ -251,6 +256,7 @@ class ModelRunner:
                         "flashinfer" if is_flashinfer_available() else "triton"
                     )
             else:
+                # MLA architecture
                 if is_hopper_with_cuda_12_3():
                     server_args.attention_backend = "fa3"
                 else:
@@ -259,7 +265,6 @@ class ModelRunner:
                 f"Attention backend not set. Use {server_args.attention_backend} backend by default."
             )
         elif self.use_mla_backend:
-            # TODO: add MLA optimization on CPU
             if server_args.device != "cpu":
                 if server_args.attention_backend in [
                     "flashinfer",
@@ -275,7 +280,7 @@ class ModelRunner:
                     f"Invalid attention backend for MLA: {server_args.attention_backend}"
                 )
             else:
-                raise ValueError(f"MLA optimization not supported on CPU.")
+                raise ValueError("MLA optimization not supported on CPU.")
         if (
             server_args.attention_backend == "fa3"
@@ -310,9 +315,6 @@ class ModelRunner:
             )
             server_args.chunked_prefill_size = -1
-        if server_args.enable_deepep_moe:
-            logger.info(f"DeepEP is turned on. DeepEP mode: {server_args.deepep_mode}")
         if not self.use_mla_backend:
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
...
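The new docstring spells out how the default attention backend is picked. A condensed, standalone sketch of that decision tree follows; the booleans are stand-ins for the helpers named in the diff (is_hopper_with_cuda_12_3, is_no_spec_infer_or_topk_one, is_flashinfer_available), and the MLA fallback shown here is an assumption rather than a quote of the remaining ModelRunner code.

def pick_attention_backend(
    use_mla: bool,
    hopper_with_cuda_12_3: bool,
    no_spec_infer_or_topk_one: bool,
    page_size_one: bool,
    flashinfer_available: bool,
) -> str:
    if not use_mla:
        # MHA architecture: FA3 on Hopper unless speculative decoding with
        # topk > 1 or page_size > 1 is in play.
        if hopper_with_cuda_12_3 and no_spec_infer_or_topk_one and page_size_one:
            return "fa3"
        return "flashinfer" if flashinfer_available else "triton"
    # MLA architecture: FA3 on Hopper, otherwise a Triton-based fallback (assumed).
    return "fa3" if hopper_with_cuda_12_3 else "triton"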
@@ -260,7 +260,6 @@ class Llama4Attention(nn.Module):
         if self.rotary_emb is not None:
             q_view, k_view = qk.split([self.q_size, self.kv_size], dim=-1)
             q_out_unused, k_out_unused = self.rotary_emb(positions, q_view, k_view)
-            assert (q_out_unused is q_view) and (k_out_unused is k_view)
             del q_view, k_view, q_out_unused, k_out_unused
         if self.qk_norm is not None:
...
@@ -201,7 +201,7 @@ class ServerArgs:
         # Expert parallelism
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
-            logger.info(
+            logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
@@ -243,19 +243,19 @@ class ServerArgs:
                 self.chunked_prefill_size = 2048
             else:
                 self.chunked_prefill_size = 8192
         assert self.chunked_prefill_size % self.page_size == 0
         assert self.moe_dense_tp_size in {
             1,
             None,
-        }, f"moe_dense_tp_size only support 1 and None currently"
+        }, "moe_dense_tp_size only support 1 and None currently"
         if self.attention_backend == "flashmla":
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
             self.page_size = 64
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
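The comment above carries the reasoning; as a standalone sketch, the resulting policy is roughly the following (the default of 160 and the TP4/TP8 cap of 80 are taken from the comment, and the helper itself is illustrative, not code from the diff):

def default_cuda_graph_max_bs(tp_size: int, default_bs: int = 160) -> int:
    # Large-TP deployments keep CUDA graphs for throughput but halve the
    # batch-size budget to cut graph-capture memory and avoid OOM.
    if tp_size >= 4:
        return default_bs // 2  # 80 with the assumed default of 160
    return default_bs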
@@ -270,6 +270,7 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"
+        # Set kernel backends
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -297,8 +298,8 @@ class ServerArgs:
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
             )
-        self.enable_sp_layernorm = False
         # DeepEP MoE
+        self.enable_sp_layernorm = False
         if self.enable_deepep_moe:
             if self.deepep_mode == "auto":
                 assert (
@@ -308,7 +309,7 @@ class ServerArgs:
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
             )
-            logger.info(
+            logger.warning(
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
@@ -317,14 +318,11 @@ class ServerArgs:
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
-        if (
-            self.speculative_algorithm == "EAGLE"
-            or self.speculative_algorithm == "EAGLE3"
-        ):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                 "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
@@ -343,7 +341,7 @@ class ServerArgs:
         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.info(
+            logger.warning(
                 "speculative_eagle_topk is adjusted to 1 when page_size > 1"
             )
@@ -351,7 +349,7 @@ class ServerArgs:
             self.speculative_eagle_topk == 1
             and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
         ):
-            logger.info(
+            logger.warning(
                 "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
             )
             self.speculative_num_draft_tokens = self.speculative_num_steps + 1
@@ -381,18 +379,6 @@ class ServerArgs:
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
-        if self.enable_memory_saver:
-            try:
-                import torch_memory_saver
-            except ImportError:
-                logger.warning(
-                    "enable_memory_saver is enabled, but "
-                    "torch-memory-saver is not installed. Please install it "
-                    "via `pip3 uninstall torch-memory-saver`. "
-                    "For normal operation, it will be disabled."
-                )
-                raise
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
...
@@ -6,7 +6,9 @@ try:
     import torch_memory_saver
     _primary_memory_saver = torch_memory_saver.TorchMemorySaver()
-except ImportError:
+    import_error = None
+except ImportError as e:
+    import_error = e
     pass
 logger = logging.getLogger(__name__)
@@ -15,6 +17,13 @@ logger = logging.getLogger(__name__)
 class TorchMemorySaverAdapter(ABC):
     @staticmethod
     def create(enable: bool):
+        if enable and import_error is not None:
+            logger.warning(
+                "enable_memory_saver is enabled, but "
+                "torch-memory-saver is not installed. Please install it "
+                "via `pip3 install torch-memory-saver`. "
+            )
+            raise import_error
         return (
             _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
         )
...
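This adapter change pairs with the block removed from ServerArgs above (whose message also told users to `pip3 uninstall` the package instead of installing it): the ImportError is captured once when the module is imported and re-raised, with a corrected hint, only if the memory-saver feature is actually enabled. The same pattern with a made-up optional dependency, as a sketch:

import logging

logger = logging.getLogger(__name__)

try:
    import some_optional_dep  # hypothetical optional dependency
    _import_error = None
except ImportError as e:
    _import_error = e

class _NoopAdapter:
    """Fallback used when the feature is disabled or the dependency is absent."""

class _RealAdapter:
    """Adapter that would wrap the real dependency."""

def create_adapter(enable: bool):
    if enable and _import_error is not None:
        # Fail only when the feature is actually requested, with a useful hint.
        logger.warning(
            "The feature is enabled but some_optional_dep is not installed. "
            "Install it via `pip install some-optional-dep`."
        )
        raise _import_error
    return _RealAdapter() if enable else _NoopAdapter()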
@@ -1944,7 +1944,7 @@ def get_local_ip_by_remote() -> str:
         s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
         return s.getsockname()[0]
     except Exception:
-        raise ValueError(f"Can not get local ip")
+        raise ValueError("Can not get local ip")
 def is_page_size_one(server_args):
...
@@ -33,33 +33,44 @@ from sglang.srt.utils import (
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
-DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
-DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
-DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
+# General test models
+DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
+DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
+# MLA test models
+DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
+# FP8 models
+DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
     "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
 )
-DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
+DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
     "nvidia/Llama-3.1-8B-Instruct-FP8"
 )
-DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
+# EAGLE
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
 DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
-DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
-DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
-DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
+# Other use cases
 DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
     "meta-llama/Llama-4-Scout-17B-16E-Instruct"
 )
-DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
-DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
-DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+# Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -68,12 +79,11 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
-DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
-DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -499,7 +509,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
     tic = time.time()
     success = True
-    for file in files:
+    for i, file in enumerate(files):
         filename, estimated_time = file.name, file.estimated_time
         process = None
@@ -507,7 +517,10 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
            nonlocal process
            filename = os.path.join(os.getcwd(), filename)
-           print(f".\n.\nBegin:\npython3 {filename}\n.\n.\n", flush=True)
+           print(
+               f".\n.\nBegin ({i}/{len(files)}):\npython3 {filename}\n.\n.\n",
+               flush=True,
+           )
            tic = time.time()
            process = subprocess.Popen(
@@ -517,7 +530,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
            elapsed = time.time() - tic
            print(
-               f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+               f".\n.\nEnd ({i}/{len(files)}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
                flush=True,
            )
            return process.returncode
...
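With enumerate, the runner can now show how far through the suite it is. A self-contained version of the same pattern follows (file names invented); note that, as in the diff, the counter is zero-based, so printing i + 1 would give a 1-based display instead.

import time

files = ["test_a.py", "test_b.py", "test_c.py"]  # illustrative names only

for i, filename in enumerate(files):
    tic = time.time()
    print(f"Begin ({i}/{len(files)}): python3 {filename}", flush=True)
    # ... launch the test file here ...
    elapsed = time.time() - tic
    print(f"End ({i}/{len(files)}): {filename=}, {elapsed=:.0f}s", flush=True)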
@@ -16,23 +16,29 @@ suites = {
         TestFile("models/lora/test_lora.py", 76),
         TestFile("models/lora/test_lora_backend.py", 99),
         TestFile("models/lora/test_multi_lora_backend.py", 60),
-        TestFile("models/test_embedding_models.py", 35),
+        TestFile("models/test_embedding_models.py", 184),
+        TestFile("models/test_clip_models.py", 52),
+        TestFile("models/test_compressed_tensors_models.py", 42),
         TestFile("models/test_generation_models.py", 103),
+        TestFile("models/test_gme_qwen_models.py", 45),
         # TestFile("models/test_grok_models.py", 60),  # Disabled due to illegal memory access
         TestFile("models/test_qwen_models.py", 82),
-        TestFile("models/test_compressed_tensors_models.py", 100),
-        TestFile("models/test_reward_models.py", 83),
-        TestFile("models/test_gme_qwen_models.py", 45),
-        TestFile("models/test_clip_models.py", 52),
-        TestFile("models/test_vlm_models.py", 581),
+        TestFile("models/test_reward_models.py", 132),
+        TestFile("models/test_vlm_models.py", 317),
         TestFile("test_abort.py", 51),
         TestFile("test_block_int8.py", 22),
+        TestFile("test_create_kvindices.py", 2),
         TestFile("test_chunked_prefill.py", 285),
         TestFile("test_eagle_infer.py", 584),
-        TestFile("test_ebnf_constrained.py"),
+        TestFile("test_ebnf_constrained.py", 108),
+        TestFile("test_embedding_openai_server.py", 141),
+        TestFile("test_eval_fp8_accuracy.py", 303),
         TestFile("test_fa3.py", 376),
+        TestFile("test_fim_completion.py", 40),
         TestFile("test_fp8_kernel.py", 8),
-        TestFile("test_embedding_openai_server.py", 141),
+        TestFile("test_fused_moe.py", 30),
+        TestFile("test_hicache.py", 116),
+        TestFile("test_hicache_mla.py", 254),
         TestFile("test_hidden_states.py", 55),
         TestFile("test_int8_kernel.py", 8),
         TestFile("test_input_embeddings.py", 38),
@@ -41,11 +47,11 @@ suites = {
         TestFile("test_metrics.py", 32),
         TestFile("test_mla.py", 242),
         TestFile("test_mla_deepseek_v3.py", 221),
-        TestFile("test_mla_int8_deepseek_v3.py", 674),
+        TestFile("test_mla_int8_deepseek_v3.py", 389),
         TestFile("test_mla_flashinfer.py", 395),
         TestFile("test_mla_fp8.py", 153),
-        TestFile("test_no_chunked_prefill.py", 126),
-        TestFile("test_no_overlap_scheduler.py", 262),
+        TestFile("test_no_chunked_prefill.py", 108),
+        TestFile("test_no_overlap_scheduler.py", 216),
         TestFile("test_openai_server.py", 149),
         TestFile("test_penalty.py", 41),
         TestFile("test_page_size.py", 60),
@@ -59,27 +65,21 @@ suites = {
         TestFile("test_server_args.py", 1),
         TestFile("test_skip_tokenizer_init.py", 117),
         TestFile("test_srt_engine.py", 237),
-        TestFile("test_srt_endpoint.py", 94),
+        TestFile("test_srt_endpoint.py", 130),
         TestFile("test_torch_compile.py", 76),
-        TestFile("test_torch_compile_moe.py", 235),
+        TestFile("test_torch_compile_moe.py", 172),
         TestFile("test_torch_native_attention_backend.py", 123),
         TestFile("test_torchao.py", 70),
         TestFile("test_triton_attention_kernels.py", 4),
         TestFile("test_triton_attention_backend.py", 134),
+        TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_vertex_endpoint.py", 31),
-        TestFile("test_vision_chunked_prefill.py", 119),
+        TestFile("test_vision_chunked_prefill.py", 175),
         TestFile("test_vlm_accuracy.py", 60),
         TestFile("test_vision_openai_server.py", 637),
-        TestFile("test_fim_completion.py", 40),
         TestFile("test_w8a8_quantization.py", 46),
-        TestFile("test_eval_fp8_accuracy.py", 303),
-        TestFile("test_create_kvindices.py", 2),
-        TestFile("test_hicache.py", 116),
-        TestFile("test_hicache_mla.py", 254),
-        TestFile("test_fused_moe.py", 30),
-        TestFile("test_triton_moe_channel_fp8_kernel.py", 25),
     ],
     "per-commit-2-gpu": [
         TestFile("models/lora/test_lora_tp.py", 116),
...
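Each TestFile pairs a path with an estimated runtime in seconds, which is what most of the numbers above are refreshing while the list is also re-sorted by file name. How such estimates might be aggregated is sketched below; the TestFile definition here is an assumed stand-in for illustration, not the one in run_suite.py.

from dataclasses import dataclass
from typing import List

@dataclass
class TestFile:
    name: str
    estimated_time: float = 60.0  # seconds; the default here is an assumption

def summarize(files: List[TestFile]) -> None:
    total = sum(f.estimated_time for f in files)
    print(f"{len(files)} files, ~{total / 60:.1f} minutes estimated")
    slowest = max(files, key=lambda f: f.estimated_time)
    print(f"slowest: {slowest.name} (~{slowest.estimated_time:.0f}s)")

summarize([TestFile("test_fa3.py", 376), TestFile("test_metrics.py", 32)])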
@@ -29,13 +29,9 @@ class TestBenchOneBatch(CustomTestCase):
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
         )
-        use_vllm_custom_allreduce = get_bool_env_var(
-            "USE_VLLM_CUSTOM_ALLREDUCE", default="false"
-        )
         if is_in_ci():
             write_github_step_summary(
-                f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n"
+                f"### test_moe_tp2_bs1\n"
                 f"output_throughput : {output_throughput:.2f} token/s\n"
             )
         self.assertGreater(output_throughput, 124)
...
@@ -3,8 +3,8 @@ import unittest
 from sglang.test.test_utils import (
     DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
     DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
-    DEFAULT_FP8_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     CustomTestCase,
     is_in_ci,
@@ -28,7 +28,7 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_default\n"
                 f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
             )
-            self.assertGreater(res["output_throughput"], 3350)
+            self.assertGreater(res["output_throughput"], 3800)
     def test_offline_throughput_non_stream_small_batch_size(self):
         res = run_bench_serving(
@@ -48,9 +48,7 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_non_stream_small_batch_size\n"
                 f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
             )
-            # There is a regression with torch 2.5
-            # This number was 950 for torch 2.4
-            self.assertGreater(res["output_throughput"], 1000)
+            self.assertGreater(res["output_throughput"], 1050)
     def test_offline_throughput_without_radix_cache(self):
         res = run_bench_serving(
@@ -65,7 +63,7 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_without_radix_cache\n"
                 f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
             )
-            self.assertGreater(res["output_throughput"], 3350)
+            self.assertGreater(res["output_throughput"], 3800)
     def test_offline_throughput_without_chunked_prefill(self):
         res = run_bench_serving(
@@ -100,11 +98,11 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_with_triton_attention_backend\n"
                 f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
             )
-            self.assertGreater(res["output_throughput"], 3450)
+            self.assertGreater(res["output_throughput"], 3600)
     def test_offline_throughput_default_fp8(self):
         res = run_bench_serving(
-            model=DEFAULT_FP8_MODEL_NAME_FOR_TEST,
+            model=DEFAULT_MODEL_NAME_FOR_TEST_FP8,
             num_prompts=500,
             request_rate=float("inf"),
             other_server_args=[],
@@ -115,7 +113,7 @@ class TestBenchServing(CustomTestCase):
                 f"### test_offline_throughput_default_fp8\n"
                 f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
             )
-            self.assertGreater(res["output_throughput"], 3900)
+            self.assertGreater(res["output_throughput"], 4200)
     def test_online_latency_default(self):
         res = run_bench_serving(
@@ -166,8 +164,8 @@ class TestBenchServing(CustomTestCase):
                 f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
                 f'accept_length : {res["accept_length"]:.2f} \n'
             )
-            self.assertLess(res["median_e2e_latency_ms"], 900)
-            self.assertGreater(res["accept_length"], 2.99)
+            self.assertLess(res["median_e2e_latency_ms"], 800)
+            self.assertGreater(res["accept_length"], 3.0)
     def test_moe_offline_throughput_default(self):
         res = run_bench_serving(
...
@@ -4,8 +4,8 @@ from types import SimpleNamespace
 from sglang.srt.utils import is_hip, kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
-    DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST,
-    DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
+    DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8,
+    DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
@@ -17,7 +17,7 @@ from sglang.test.test_utils import (
 class TestEvalFP8Accuracy(CustomTestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model = DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST
+        cls.model = DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
@@ -76,7 +76,7 @@ class TestEvalFP8DynamicQuantAccuracy(CustomTestCase):
     def test_mmlu_offline_only(self):
         """Test with offline quantization only."""
         self._run_test(
-            model=DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
+            model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
             other_args=[],
             expected_score=0.64,
         )
@@ -84,7 +84,7 @@ class TestEvalFP8DynamicQuantAccuracy(CustomTestCase):
     def test_mmlu_offline_and_online_override(self):
         """Test with both offline and online quantization."""
         self._run_test(
-            model=DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST,
+            model=DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8,
             other_args=["--quantization", "w8a8_fp8"],
             # inference will use sgl kernel w/ online quant override
             # we observed that the accuracy is higher then offline only
...
@@ -48,7 +48,7 @@ if OFFLINE_MODE:
 DEFAULT_SERVER_ARGS = [
     "--trust-remote-code",
     "--cuda-graph-max-bs",
-    "4",
+    "8",
     "--attention-backend",
     "fa3",
 ]
...
@@ -6,8 +6,8 @@ import torch
 from sglang.srt.utils import kill_process_tree
 from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
-    DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST,
-    DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION,
+    DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8,
+    DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
@@ -49,10 +49,10 @@ class TestEvalFP8ModelOptQuantAccuracy(CustomTestCase):
     def test_mmlu_offline_only(self):
         """Test with offline quantization only."""
         self._run_test(
-            model=DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST,
+            model=DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8,
             other_args=[
                 "--revision",
-                DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_REVISION,
+                DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8_REVISION,
             ],
             expected_score=0.64,
         )
@@ -14,7 +14,6 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
-    CustomTestCase,
     is_in_ci,
     popen_launch_server,
     write_github_step_summary,
@@ -45,25 +44,10 @@ def parse_models(model_string):
     return [model.strip() for model in model_string.split(",") if model.strip()]
-def popen_launch_server_wrapper(base_url, model, is_fp8, is_tp2):
+def popen_launch_server_wrapper(base_url, model, is_tp2):
     other_args = ["--log-level-http", "warning", "--trust-remote-code"]
-    if is_fp8:
-        if "Llama-3" in model or "gemma-2" in model:
-            other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
-        elif "Qwen2-72B-Instruct-FP8" in model:
-            other_args.extend(["--quantization", "fp8"])
-        elif "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8" in model:
-            other_args.extend([])
-        else:
-            other_args.extend(["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"])
     if is_tp2:
         other_args.extend(["--tp", "2"])
-    if "DeepSeek" in model:
-        other_args.extend(["--mem-frac", "0.85"])
-    if "AWQ" in model:
-        other_args.extend(["--quantization", "awq"])
-    elif "GPTQ" in model:
-        other_args.extend(["--quantization", "gptq"])
     process = popen_launch_server(
         model,
@@ -150,9 +134,7 @@ class TestNightlyGsm8KEval(unittest.TestCase):
         for model_group, is_fp8, is_tp2 in self.model_groups:
             for model in model_group:
                 with self.subTest(model=model):
-                    process = popen_launch_server_wrapper(
-                        self.base_url, model, is_fp8, is_tp2
-                    )
+                    process = popen_launch_server_wrapper(self.base_url, model, is_tp2)
                     args = SimpleNamespace(
                         base_url=self.base_url,
...
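After this simplification the wrapper only decides whether to add --tp 2; FP8- and quantization-specific flags are no longer injected per model. A usage sketch of the new three-argument signature (URL and model are placeholders; the imports mirror those used elsewhere in the test suite):

from sglang.srt.utils import kill_process_tree
from test_nightly_gsm8k_eval import popen_launch_server_wrapper

process = popen_launch_server_wrapper(
    "http://127.0.0.1:30000",            # placeholder base URL
    "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    False,                               # is_tp2
)
try:
    ...  # run the eval against the launched server
finally:
    kill_process_tree(process.pid)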
import os
import shutil
import signal
import subprocess
import unittest

from test_nightly_gsm8k_eval import parse_models, popen_launch_server_wrapper

from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
)


class TestNightlyHumanEval(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        if is_in_ci():
            cls.model_groups = [([DEFAULT_MODEL_NAME_FOR_TEST], False, False)]
        else:
            cls.model_groups = [
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
                (
                    parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1),
                    True,
                    False,
                ),
                (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
            ]
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = None
        cls.eval_process = None

    @classmethod
    def tearDownClass(cls):
        if cls.process:
            kill_process_tree(cls.process.pid)
        if cls.eval_process:
            kill_process_tree(cls.eval_process.pid)

    def run_evalplus(self, model):
        print("Delete evalplus results")
        shutil.rmtree("evalplus_results", ignore_errors=True)
        cmd = [
            "evalplus.evaluate",
            "--model",
            model,
            "--dataset",
            "humaneval",
            "--backend",
            "openai",
            "--base-url",
            "http://localhost:6157/v1",
            "--greedy",
        ]
        try:
            self.eval_process = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
                preexec_fn=os.setsid,
            )
            stdout, stderr = self.eval_process.communicate(timeout=600)
            if self.eval_process.returncode != 0:
                print(f"Fail to human eval model={model} err={stderr}")
                print("=" * 42)
                print(stdout)
                print("=" * 42)
        except subprocess.TimeoutExpired:
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)
            print(f"Timeout during evaluation for model={model}")
        except Exception as e:
            print(f"Error running evalplus for model={model} {str(e)}")
            if self.eval_process:
                os.killpg(os.getpgid(self.eval_process.pid), signal.SIGTERM)

    def test_human_eval_all_models(self):
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                # NOTE: only Llama for now
                if "Llama" in model:
                    with self.subTest(model=model):
                        self.process = popen_launch_server_wrapper(
                            self.base_url, model, is_fp8, is_tp2
                        )
                        self.run_evalplus(model)
        self.tearDownClass()


if __name__ == "__main__":
    unittest.main()
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    popen_launch_server,
)


class TestEvalAccuracyLarge(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--log-level-http", "warning"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_math(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="math",
            num_examples=5000,
            num_threads=1024,
        )
        metrics = run_eval(args)
        self.assertGreaterEqual(
            metrics["score"], 0.519 - 0.02
        )  # -2% to account for sampling variance


if __name__ == "__main__":
    unittest.main()