Unverified Commit 7b9b4f44 authored by Jhin, committed by GitHub

Docs fix about EAGLE and streaming output (#3166)


Co-authored-by: Chayenne <zhaochenyang@ucla.edu>
Co-authored-by: Chayenne <zhaochen20@outlook.com>
Co-authored-by: Jhin <jhinpan@umich.edu>
parent 08104b56
@@ -42,7 +42,7 @@ jobs:
         python -m ipykernel install --user --name python3 --display-name "Python 3"
     - name: Execute notebooks
-      timeout-minutes: 30
+      timeout-minutes: 40
       run: |
         cd docs
         make clean
...
@@ -507,7 +507,15 @@
  ],
  "metadata": {
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
   }
  },
  "nbformat": 4,
...
@@ -37,7 +37,7 @@
    "outputs": [],
    "source": [
     "# launch the offline engine\n",
-    "\n",
+    "from sglang.utils import stream_and_merge, async_stream_and_merge\n",
     "import sglang as sgl\n",
     "import asyncio\n",
     "\n",
@@ -86,20 +86,22 @@
    "outputs": [],
    "source": [
     "prompts = [\n",
-    "    \"Hello, my name is\",\n",
-    "    \"The capital of France is\",\n",
-    "    \"The future of AI is\",\n",
+    "    \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+    "    \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+    "    \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
     "]\n",
-    "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
     "\n",
-    "print(\"\\n=== Testing synchronous streaming generation ===\")\n",
+    "sampling_params = {\n",
+    "    \"temperature\": 0.2,\n",
+    "    \"top_p\": 0.9,\n",
+    "}\n",
     "\n",
-    "for prompt in prompts:\n",
-    "    print(f\"\\nPrompt: {prompt}\")\n",
-    "    print(\"Generated text: \", end=\"\", flush=True)\n",
+    "print(\"\\n=== Testing synchronous streaming generation with overlap removal ===\\n\")\n",
     "\n",
-    "    for chunk in llm.generate(prompt, sampling_params, stream=True):\n",
-    "        print(chunk[\"text\"], end=\"\", flush=True)\n",
+    "for prompt in prompts:\n",
+    "    print(f\"Prompt: {prompt}\")\n",
+    "    merged_output = stream_and_merge(llm, prompt, sampling_params)\n",
+    "    print(\"Generated text:\", merged_output)\n",
     "    print()"
    ]
   },
@@ -117,9 +119,9 @@
    "outputs": [],
    "source": [
     "prompts = [\n",
-    "    \"Hello, my name is\",\n",
-    "    \"The capital of France is\",\n",
-    "    \"The future of AI is\",\n",
+    "    \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+    "    \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+    "    \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
     "]\n",
     "\n",
     "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
@@ -152,13 +154,14 @@
    "outputs": [],
    "source": [
     "prompts = [\n",
-    "    \"Hello, my name is\",\n",
-    "    \"The capital of France is\",\n",
-    "    \"The future of AI is\",\n",
+    "    \"Write a short, neutral self-introduction for a fictional character. Hello, my name is\",\n",
+    "    \"Provide a concise factual statement about France’s capital city. The capital of France is\",\n",
+    "    \"Explain possible future trends in artificial intelligence. The future of AI is\",\n",
     "]\n",
+    "\n",
     "sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95}\n",
     "\n",
-    "print(\"\\n=== Testing asynchronous streaming generation ===\")\n",
+    "print(\"\\n=== Testing asynchronous streaming generation (no repeats) ===\")\n",
     "\n",
     "\n",
     "async def main():\n",
@@ -166,10 +169,11 @@
     "        print(f\"\\nPrompt: {prompt}\")\n",
     "        print(\"Generated text: \", end=\"\", flush=True)\n",
     "\n",
-    "        generator = await llm.async_generate(prompt, sampling_params, stream=True)\n",
-    "        async for chunk in generator:\n",
-    "            print(chunk[\"text\"], end=\"\", flush=True)\n",
-    "        print()\n",
+    "        # Replace direct calls to async_generate with our custom overlap-aware version\n",
+    "        async for cleaned_chunk in async_stream_and_merge(llm, prompt, sampling_params):\n",
+    "            print(cleaned_chunk, end=\"\", flush=True)\n",
+    "\n",
+    "        print()  # New line after each prompt\n",
     "\n",
     "\n",
     "asyncio.run(main())"
...
@@ -8,12 +8,17 @@
    "\n",
    "SGLang now provides an EAGLE-based speculative decoding option. The implementation aims to maximize speed and efficiency and is considered to be among the fastest in open-source LLM engines.\n",
    "\n",
+    "To run the following tests or benchmarks, you also need to install [**cutex**](https://pypi.org/project/cutex/):\n",
+    "> ```bash\n",
+    "> pip install cutex\n",
+    "> ```\n",
+    "\n",
    "### Performance Highlights\n",
    "\n",
-    "- **Official EAGLE code** ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n",
-    "- **Standard SGLang Decoding**: ~156 tokens/s\n",
-    "- **EAGLE Decoding in SGLang**: ~297 tokens/s\n",
-    "- **EAGLE Decoding in SGLang (w/ `torch.compile`)**: ~316 tokens/s\n",
+    "- Official EAGLE code ([SafeAILab/EAGLE](https://github.com/SafeAILab/EAGLE)): ~200 tokens/s\n",
+    "- Standard SGLang Decoding: ~156 tokens/s\n",
+    "- EAGLE Decoding in SGLang: ~297 tokens/s\n",
+    "- EAGLE Decoding in SGLang (w/ `torch.compile`): ~316 tokens/s\n",
    "\n",
    "All benchmarks below were run on a single H100."
   ]
...
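For readers who want to reproduce numbers like those above, the sketch below shows one way to launch the offline engine with EAGLE decoding enabled. It is not part of this commit: the keyword arguments mirror SGLang's speculative-decoding server options, and the exact argument names and the draft-model path are assumptions that should be checked against the EAGLE documentation for your SGLang version.

```python
# Minimal sketch: offline engine with EAGLE speculative decoding.
# Argument names and the draft model below are assumptions; verify them
# against the EAGLE docs for the SGLang version you have installed.
import sglang as sgl

llm = sgl.Engine(
    model_path="meta-llama/Llama-2-7b-chat-hf",  # example target model
    speculative_algorithm="EAGLE",               # enable EAGLE decoding (assumed kwarg)
    speculative_draft_model_path="lmzheng/sglang-EAGLE-llama2-chat-7B",  # assumed draft model
    speculative_num_steps=5,          # draft steps per verification round
    speculative_eagle_topk=8,         # candidates kept per draft step
    speculative_num_draft_tokens=64,  # max draft tokens verified at once
)

out = llm.generate("The capital of France is", {"temperature": 0.2, "top_p": 0.9})
print(out["text"])
llm.shutdown()
```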
@@ -5,6 +5,7 @@ You can install SGLang using any of the methods below.
 ## Method 1: With pip
 ```
 pip install --upgrade pip
+pip install sgl-kernel --force-reinstall --no-deps
 pip install "sglang[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
 ```
@@ -17,10 +18,11 @@ git clone -b v0.4.2 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
+pip install sgl-kernel --force-reinstall --no-deps
 pip install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/
 ```
-Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version for your PyTorch and CUDA versions. If you hit an error like **ImportError: cannot import name `_grouped_size_compiled_for_decode_kernels`**, installing an older FlashInfer release such as 0.1.6 instead of the latest version can resolve it.
 Note: For AMD ROCm systems with Instinct/MI GPUs, do the following instead:
@@ -30,6 +32,7 @@ git clone -b v0.4.2 https://github.com/sgl-project/sglang.git
 cd sglang
 pip install --upgrade pip
+pip install sgl-kernel --force-reinstall --no-deps
 pip install -e "python[all_hip]"
 ```
...
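After any of the install methods above, a quick import check can confirm the environment is usable. This snippet is an illustrative sketch, not part of the install doc; it assumes the pip packages expose the module names `sglang`, `sgl_kernel` (for sgl-kernel), and `flashinfer`.

```python
# Post-install sanity check: report which core modules import cleanly.
import importlib

for module in ("sglang", "sgl_kernel", "flashinfer"):
    try:
        importlib.import_module(module)
        print(f"{module}: OK")
    except ImportError as err:
        print(f"{module}: not available ({err})")
```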
@@ -373,3 +373,45 @@ class TypeBasedDispatcher:
         if isinstance(obj, ty):
             return fn(obj)
         raise ValueError(f"Invalid object: {obj}")
+
+
+def trim_overlap(existing_text, new_chunk):
+    """
+    Finds the largest suffix of 'existing_text' that is a prefix of 'new_chunk'
+    and removes that overlap from the start of 'new_chunk'.
+    """
+    max_overlap = 0
+    max_possible = min(len(existing_text), len(new_chunk))
+    for i in range(max_possible, 0, -1):
+        if existing_text.endswith(new_chunk[:i]):
+            max_overlap = i
+            break
+    return new_chunk[max_overlap:]
+
+
+def stream_and_merge(llm, prompt, sampling_params):
+    """
+    1) Streams the text,
+    2) Removes chunk overlaps,
+    3) Returns the merged text.
+    """
+    final_text = ""
+    for chunk in llm.generate(prompt, sampling_params, stream=True):
+        chunk_text = chunk["text"]
+        cleaned_chunk = trim_overlap(final_text, chunk_text)
+        final_text += cleaned_chunk
+    return final_text
+
+
+async def async_stream_and_merge(llm, prompt, sampling_params):
+    """
+    Streams tokens asynchronously, removes chunk overlaps,
+    and yields the cleaned chunk in real time for printing.
+    """
+    final_text = ""
+    generator = await llm.async_generate(prompt, sampling_params, stream=True)
+    async for chunk in generator:
+        chunk_text = chunk["text"]
+        cleaned_chunk = trim_overlap(final_text, chunk_text)
+        final_text += cleaned_chunk
+        yield cleaned_chunk  # yield the non-overlapping portion
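To make the behavior of the new helper concrete, here are a few illustrative checks (not part of the commit). They rely only on `trim_overlap` as defined above, importable from `sglang.utils` once this change is installed: the function drops the largest suffix of the accumulated text that the next chunk repeats, so merged streaming output contains each piece of text only once.

```python
# Illustrative checks for trim_overlap; values follow directly from its definition above.
from sglang.utils import trim_overlap

assert trim_overlap("Hello wor", "world!") == "ld!"         # overlap "wor" removed from the chunk
assert trim_overlap("abc", "xyz") == "xyz"                   # no overlap, chunk kept whole
assert trim_overlap("", "first chunk") == "first chunk"      # empty history passes through unchanged
print("trim_overlap behaves as expected")
```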