Unverified commit 21514ff5, authored by Lianmin Zheng and committed by GitHub
Browse files

Disable flaky eagle tests (#5753)

parent 5641a094
...@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor ...@@ -71,5 +71,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai. For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
## Acknowledgment and Citation ## Acknowledgment
We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql). Please cite the paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful. We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
...@@ -279,9 +279,9 @@ class CudaGraphRunner: ...@@ -279,9 +279,9 @@ class CudaGraphRunner:
f"Capture cuda graph failed: {e}\n" f"Capture cuda graph failed: {e}\n"
"Possible solutions:\n" "Possible solutions:\n"
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
"2. set --cuda-graph-max-bs to a smaller value (e.g., 32)\n" "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
"3. disable torch compile by not using --enable-torch-compile\n" "3. disable torch compile by not using --enable-torch-compile\n"
"4. disable cuda graph by --disable-cuda-graph\n" "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
) )
......
...@@ -955,12 +955,6 @@ class ModelRunner: ...@@ -955,12 +955,6 @@ class ModelRunner:
return return
if self.server_args.disable_cuda_graph: if self.server_args.disable_cuda_graph:
logger.warning(
"\n\nCUDA Graph is DISABLED.\n"
"This will cause significant performance degradation.\n"
"CUDA Graph should almost never be disabled in most usage scenarios.\n"
"If you encounter OOM issues, please try setting --mem-fraction-static to a lower value (such as 0.8 or 0.7) instead of disabling CUDA Graph.\n"
)
return return
tic = time.time() tic = time.time()
......
...@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner: ...@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner:
f"Capture cuda graph failed: {e}\n" f"Capture cuda graph failed: {e}\n"
"Possible solutions:\n" "Possible solutions:\n"
"1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n" "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
"2. disable torch compile by not using --enable-torch-compile\n" "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
"3. specify --dtype to the same dtype (e.g. bfloat16)\n" "3. disable torch compile by not using --enable-torch-compile\n"
"4. disable cuda graph by --disable-cuda-graph\n" "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
"Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n" "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
) )
......
import json import json
import multiprocessing as mp
import os import os
import random import random
import threading import threading
...@@ -8,7 +7,6 @@ import unittest ...@@ -8,7 +7,6 @@ import unittest
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from functools import partial from functools import partial
from types import SimpleNamespace from types import SimpleNamespace
from typing import List, Optional
import numpy as np import numpy as np
import requests import requests
...@@ -18,7 +16,6 @@ import sglang as sgl ...@@ -18,7 +16,6 @@ import sglang as sgl
from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.runners import DEFAULT_PROMPTS, SRTRunner
from sglang.test.test_utils import ( from sglang.test.test_utils import (
DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST, DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST, DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
...@@ -541,36 +538,5 @@ class TestEAGLEServerTriton(TestEAGLEServer): ...@@ -541,36 +538,5 @@ class TestEAGLEServerTriton(TestEAGLEServer):
) )
class TestEAGLEServerPageSize(TestEAGLEServer):
    """Variant of ``TestEAGLEServer`` whose server is started with ``--page-size 8``."""

    @classmethod
    def setUpClass(cls):
        """Start the server process via ``popen_launch_server`` with EAGLE flags."""
        cls.base_url = DEFAULT_URL_FOR_TEST

        # Flag -> value pairs, kept in the order they are handed to the launcher.
        flag_values = {
            "--speculative-algorithm": "EAGLE",
            "--speculative-draft-model-path": DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
            "--speculative-num-steps": 5,
            "--speculative-eagle-topk": 1,
            "--speculative-num-draft-tokens": 6,
            "--mem-fraction-static": 0.7,
            "--chunked-prefill-size": 128,
            "--max-running-requests": 8,
            "--page-size": 8,
        }
        # Flatten to [flag, value, flag, value, ...]; values keep their original types.
        launch_args = [token for pair in flag_values.items() for token in pair]

        cls.process = popen_launch_server(
            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=launch_args,
        )
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment