Unverified Commit acd1a159 authored by simveit, committed by GitHub

Docs: Implemented frontend docs (#3791)


Co-authored-by: Chayenne <zhaochen20@outlook.com>
parent 7c1692aa
...@@ -23,6 +23,17 @@
"Additionally, you can easily build a custom server on top of the SGLang offline engine. A detailed example working in a python script can be found in [custom_server](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/custom_server.py)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Advanced Usage\n",
"\n",
"The engine supports [vlm inference](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/offline_batch_inference_vlm.py) as well as [extracting hidden states](https://github.com/sgl-project/sglang/blob/main/examples/runtime/engine/hidden_states.py). \n",
"\n",
"Please see [the examples](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine) for further use cases."
]
},
{
"cell_type": "markdown",
"metadata": {},
...@@ -39,14 +50,22 @@
"outputs": [],
"source": [
"# launch the offline engine\n",
"from sglang.utils import stream_and_merge, async_stream_and_merge\n",
"import sglang as sgl\n",
"import asyncio\n",
"import io\n",
"import os\n",
"\n",
"from PIL import Image\n",
"import requests\n",
"import sglang as sgl\n",
"\n",
"from sglang.srt.conversation import chat_templates\n",
"from sglang.test.test_utils import is_in_ci\n",
"from sglang.utils import async_stream_and_merge, stream_and_merge\n",
"\n",
"if is_in_ci():\n",
" import patch\n",
"\n",
"\n",
"llm = sgl.Engine(model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\")"
]
},
...@@ -185,57 +204,6 @@
"asyncio.run(main())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm.shutdown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Return Hidden States"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm = sgl.Engine(\n",
" model_path=\"meta-llama/Meta-Llama-3.1-8B-Instruct\", return_hidden_states=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prompts = [\n",
" \"Hello, my name is\",\n",
" \"The president of the United States is\",\n",
" \"The capital of France is\",\n",
" \"The future of AI is\",\n",
"]\n",
"\n",
"sampling_params = {\"temperature\": 0.8, \"top_p\": 0.95, \"max_new_tokens\": 10}\n",
"\n",
"outputs = llm.generate(prompts, sampling_params=sampling_params)\n",
"for prompt, output in zip(prompts, outputs):\n",
" print(\"===============================\")\n",
" print(\n",
" f\"Prompt: {prompt}\\nGenerated text: {output['text']}\\nPrompt_Tokens: {output['meta_info']['prompt_tokens']}\\tCompletion_tokens: {output['meta_info']['completion_tokens']}\\nHidden states: {[i.shape for i in output['meta_info']['hidden_states']]}\"\n",
" )\n",
" print()"
]
},
{
"cell_type": "code",
"execution_count": null,
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SGLang Frontend Language"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"SGLang frontend language can be used to define simple and easy prompts in a convenient, structured way."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Launch A Server\n",
"\n",
"Launch the server in your terminal and wait for it to initialize."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import os\n",
"\n",
"from sglang import assistant_begin, assistant_end\n",
"from sglang import assistant, function, gen, system, user\n",
"from sglang import image\n",
"from sglang import RuntimeEndpoint, set_default_backend\n",
"from sglang.srt.utils import load_image\n",
"from sglang.test.test_utils import is_in_ci\n",
"from sglang.utils import print_highlight, terminate_process, wait_for_server\n",
"\n",
"if is_in_ci():\n",
" from patch import launch_server_cmd\n",
"else:\n",
" from sglang.utils import launch_server_cmd\n",
"\n",
"\n",
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
"print(f\"Server started on http://localhost:{port}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the default backend. Note: Besides the local server, you may use also `OpenAI` or other API endpoints."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Basic Usage\n",
"\n",
"The most simple way of using SGLang frontend language is a simple question answer dialog between a user and an assistant."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def basic_qa(s, question):\n",
" s += system(f\"You are a helpful assistant than can answer questions.\")\n",
" s += user(question)\n",
" s += assistant(gen(\"answer\", max_tokens=512))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"state = basic_qa(\"List 3 countries and their capitals.\")\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Multi-turn Dialog\n",
"\n",
"SGLang frontend language can also be used to define multi-turn dialogs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def multi_turn_qa(s):\n",
" s += system(f\"You are a helpful assistant than can answer questions.\")\n",
" s += user(\"Please give me a list of 3 countries and their capitals.\")\n",
" s += assistant(gen(\"first_answer\", max_tokens=512))\n",
" s += user(\"Please give me another list of 3 countries and their capitals.\")\n",
" s += assistant(gen(\"second_answer\", max_tokens=512))\n",
" return s\n",
"\n",
"\n",
"state = multi_turn_qa()\n",
"print_highlight(state[\"first_answer\"])\n",
"print_highlight(state[\"second_answer\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Control flow\n",
"\n",
"You may use any Python code within the function to define more complex control flows."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def tool_use(s, question):\n",
" s += assistant(\n",
" \"To answer this question: \"\n",
" + question\n",
" + \". I need to use a \"\n",
" + gen(\"tool\", choices=[\"calculator\", \"search engine\"])\n",
" + \". \"\n",
" )\n",
"\n",
" if s[\"tool\"] == \"calculator\":\n",
" s += assistant(\"The math expression is: \" + gen(\"expression\"))\n",
" elif s[\"tool\"] == \"search engine\":\n",
" s += assistant(\"The key word to search is: \" + gen(\"word\"))\n",
"\n",
"\n",
"state = tool_use(\"What is 2 * 2?\")\n",
"print_highlight(state[\"tool\"])\n",
"print_highlight(state[\"expression\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Parallelism\n",
"\n",
"Use `fork` to launch parallel prompts. Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def tip_suggestion(s):\n",
" s += assistant(\n",
" \"Here are two tips for staying healthy: \"\n",
" \"1. Balanced Diet. 2. Regular Exercise.\\n\\n\"\n",
" )\n",
"\n",
" forks = s.fork(2)\n",
" for i, f in enumerate(forks):\n",
" f += assistant(\n",
" f\"Now, expand tip {i+1} into a paragraph:\\n\"\n",
" + gen(\"detailed_tip\", max_tokens=256, stop=\"\\n\\n\")\n",
" )\n",
"\n",
" s += assistant(\"Tip 1:\" + forks[0][\"detailed_tip\"] + \"\\n\")\n",
" s += assistant(\"Tip 2:\" + forks[1][\"detailed_tip\"] + \"\\n\")\n",
" s += assistant(\n",
" \"To summarize the above two tips, I can say:\\n\" + gen(\"summary\", max_tokens=512)\n",
" )\n",
"\n",
"\n",
"state = tip_suggestion()\n",
"print_highlight(state[\"summary\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Constrained Decoding\n",
"\n",
"Use `regex` to specify a regular expression as a decoding constraint. This is only supported for local models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def regular_expression_gen(s):\n",
" s += user(\"What is the IP address of the Google DNS servers?\")\n",
" s += assistant(\n",
" gen(\n",
" \"answer\",\n",
" temperature=0,\n",
" regex=r\"((25[0-5]|2[0-4]\\d|[01]?\\d\\d?).){3}(25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\",\n",
" )\n",
" )\n",
"\n",
"\n",
"state = regular_expression_gen()\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use `regex` to define a `JSON` decoding schema."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"character_regex = (\n",
" r\"\"\"\\{\\n\"\"\"\n",
" + r\"\"\" \"name\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"house\": \"(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)\",\\n\"\"\"\n",
" + r\"\"\" \"blood status\": \"(Pure-blood|Half-blood|Muggle-born)\",\\n\"\"\"\n",
" + r\"\"\" \"occupation\": \"(student|teacher|auror|ministry of magic|death eater|order of the phoenix)\",\\n\"\"\"\n",
" + r\"\"\" \"wand\": \\{\\n\"\"\"\n",
" + r\"\"\" \"wood\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"core\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"length\": [0-9]{1,2}\\.[0-9]{0,2}\\n\"\"\"\n",
" + r\"\"\" \\},\\n\"\"\"\n",
" + r\"\"\" \"alive\": \"(Alive|Deceased)\",\\n\"\"\"\n",
" + r\"\"\" \"patronus\": \"[\\w\\d\\s]{1,16}\",\\n\"\"\"\n",
" + r\"\"\" \"bogart\": \"[\\w\\d\\s]{1,16}\"\\n\"\"\"\n",
" + r\"\"\"\\}\"\"\"\n",
")\n",
"\n",
"\n",
"@function\n",
"def character_gen(s, name):\n",
" s += user(\n",
" f\"{name} is a character in Harry Potter. Please fill in the following information about this character.\"\n",
" )\n",
" s += assistant(gen(\"json_output\", max_tokens=256, regex=character_regex))\n",
"\n",
"\n",
"state = character_gen(\"Harry Potter\")\n",
"print_highlight(state[\"json_output\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Batching \n",
"\n",
"Use `run_batch` to run a batch of prompts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def text_qa(s, question):\n",
" s += user(question)\n",
" s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
"\n",
"\n",
"states = text_qa.run_batch(\n",
" [\n",
" {\"question\": \"What is the capital of the United Kingdom?\"},\n",
" {\"question\": \"What is the capital of France?\"},\n",
" {\"question\": \"What is the capital of Japan?\"},\n",
" ],\n",
" progress_bar=True,\n",
")\n",
"\n",
"for i, state in enumerate(states):\n",
" print_highlight(f\"Answer {i+1}: {states[i]['answer']}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Streaming \n",
"\n",
"Use `stream` to stream the output to the user."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def text_qa(s, question):\n",
" s += user(question)\n",
" s += assistant(gen(\"answer\", stop=\"\\n\"))\n",
"\n",
"\n",
"state = text_qa.run(\n",
" question=\"What is the capital of France?\", temperature=0.1, stream=True\n",
")\n",
"\n",
"for out in state.text_iter():\n",
" print(out, end=\"\", flush=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Complex Prompts\n",
"\n",
"You may use `{system|user|assistant}_{begin|end}` to define complex prompts."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def chat_example(s):\n",
" s += system(\"You are a helpful assistant.\")\n",
" # Same as: s += s.system(\"You are a helpful assistant.\")\n",
"\n",
" with s.user():\n",
" s += \"Question: What is the capital of France?\"\n",
"\n",
" s += assistant_begin()\n",
" s += \"Answer: \" + gen(\"answer\", max_tokens=100, stop=\"\\n\")\n",
" s += assistant_end()\n",
"\n",
"\n",
"state = chat_example()\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Multi-modal Generation\n",
"\n",
"You may use SGLang frontend language to define multi-modal prompts.\n",
"See [here](https://docs.sglang.ai/references/supported_models.html) for supported models."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"server_process, port = launch_server_cmd(\n",
" \"python -m sglang.launch_server --model-path Qwen/Qwen2.5-VL-7B-Instruct --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(f\"http://localhost:{port}\")\n",
"print(f\"Server started on http://localhost:{port}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"set_default_backend(RuntimeEndpoint(f\"http://localhost:{port}\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ask a question about an image."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"@function\n",
"def image_qa(s, image_file, question):\n",
" s += user(image(image_file) + question)\n",
" s += assistant(gen(\"answer\", max_tokens=256))\n",
"\n",
"\n",
"image_url = \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
"image_bytes, _ = load_image(image_url)\n",
"state = image_qa(image_bytes, \"What is in the image?\")\n",
"print_highlight(state[\"answer\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# Structured Generation Language
The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.
## Quick Start
The example below shows how to use SGLang to answer a multi-turn question.
### Using Local Models
First, launch a server with
```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
```
Then, connect to the server and answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, RuntimeEndpoint


@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


set_default_backend(RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```
### Using OpenAI Models
Set the OpenAI API Key
```
export OPENAI_API_KEY=sk-******
```
Then, answer a multi-turn question.
```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI


@function
def multi_turn_question(s, question_1, question_2):
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))


set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])

print(state["answer_1"])
```
### More Examples
Anthropic and VertexAI (Gemini) models are also supported.
You can find more examples at [examples/quick_start](https://github.com/sgl-project/sglang/tree/main/examples/frontend_language/quick_start).
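Below is a minimal sketch of switching to one of these backends. The model names are only illustrative, and the corresponding API keys or cloud credentials are assumed to be configured in your environment.
```python
from sglang import Anthropic, VertexAI, set_default_backend

# Pick one backend; the model names here are examples, not recommendations.
set_default_backend(Anthropic("claude-3-haiku-20240307"))
# set_default_backend(VertexAI("gemini-pro"))
```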
## Language Feature
To begin with, import sglang.
```python
import sglang as sgl
```
`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
You can implement your prompt flow in a function decorated by `sgl.function`.
You can then invoke the function with `run` or `run_batch`.
The system will manage the state, chat template, parallelism and batching for you.
The complete code for the examples below can be found at [readme_examples.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/usage/readme_examples.py).
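As a quick illustration of these primitives, the sketch below uses `select` to restrict the answer to a fixed label set (assuming `select` takes a variable name and a `choices` list, mirroring the `choices` argument of `gen`). The function name and review text are made up for the example, and a default backend is assumed to be set already.
```python
import sglang as sgl


@sgl.function
def classify_review(s, review):
    # Ask the model for a label, restricting the output to two choices.
    s += "Review: " + review + "\n"
    s += "Sentiment: " + sgl.select("label", choices=["positive", "negative"])


# Assumes set_default_backend(...) has already been called.
state = classify_review.run(review="The food was great and the service was fast.")
print(state["label"])
```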
### Control Flow
You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
```python
@sgl.function
def tool_use(s, question):
s += "To answer this question: " + question + ". "
s += "I need to use a " + sgl.gen("tool", choices=["calculator", "search engine"]) + ". "
if s["tool"] == "calculator":
s += "The math expression is" + sgl.gen("expression")
elif s["tool"] == "search engine":
s += "The key word to search is" + sgl.gen("word")
```
### Parallelism
Use `fork` to launch parallel prompts.
Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
```python
@sgl.function
def tip_suggestion(s):
    s += (
        "Here are two tips for staying healthy: "
        "1. Balanced Diet. 2. Regular Exercise.\n\n"
    )

    forks = s.fork(2)
    for i, f in enumerate(forks):
        f += f"Now, expand tip {i+1} into a paragraph:\n"
        f += sgl.gen("detailed_tip", max_tokens=256, stop="\n\n")

    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
    s += "In summary" + sgl.gen("summary")
```
### Multi-Modality
Use `sgl.image` to pass an image as input.
```python
@sgl.function
def image_qa(s, image_file, question):
    s += sgl.user(sgl.image(image_file) + question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=256))
```
See also [local_example_llava_next.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/quick_start/local_example_llava_next.py).
### Constrained Decoding
Use `regex` to specify a regular expression as a decoding constraint.
This is only supported for local models.
```python
@sgl.function
def regular_expression_gen(s):
    s += "Q: What is the IP address of the Google DNS servers?\n"
    s += "A: " + sgl.gen(
        "answer",
        temperature=0,
        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
    )
```
### JSON Decoding
Use `regex` to specify a JSON schema with a regular expression.
```python
character_regex = (
r"""\{\n"""
+ r""" "name": "[\w\d\s]{1,16}",\n"""
+ r""" "house": "(Gryffindor|Slytherin|Ravenclaw|Hufflepuff)",\n"""
+ r""" "blood status": "(Pure-blood|Half-blood|Muggle-born)",\n"""
+ r""" "occupation": "(student|teacher|auror|ministry of magic|death eater|order of the phoenix)",\n"""
+ r""" "wand": \{\n"""
+ r""" "wood": "[\w\d\s]{1,16}",\n"""
+ r""" "core": "[\w\d\s]{1,16}",\n"""
+ r""" "length": [0-9]{1,2}\.[0-9]{0,2}\n"""
+ r""" \},\n"""
+ r""" "alive": "(Alive|Deceased)",\n"""
+ r""" "patronus": "[\w\d\s]{1,16}",\n"""
+ r""" "bogart": "[\w\d\s]{1,16}"\n"""
+ r"""\}"""
)
@sgl.function
def character_gen(s, name):
s += name + " is a character in Harry Potter. Please fill in the following information about this character.\n"
s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
```
See also [json_decode.py](https://github.com/sgl-project/sglang/blob/main/examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
### Batching
Use `run_batch` to run a batch of requests with continuous batching.
```python
@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")


states = text_qa.run_batch(
    [
        {"question": "What is the capital of the United Kingdom?"},
        {"question": "What is the capital of France?"},
        {"question": "What is the capital of Japan?"},
    ],
    progress_bar=True
)
```
### Streaming
Add `stream=True` to enable streaming.
```python
@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A:" + sgl.gen("answer", stop="\n")


state = text_qa.run(
    question="What is the capital of France?",
    temperature=0.1,
    stream=True
)

for out in state.text_iter():
    print(out, end="", flush=True)
```
### Roles
Use `sgl.system`, `sgl.user`, and `sgl.assistant` to set roles when using chat models. You can also define more complex role prompts using begin and end tokens.
```python
@sgl.function
def chat_example(s):
    s += sgl.system("You are a helpful assistant.")
    # Same as: s += s.system("You are a helpful assistant.")

    with s.user():
        s += "Question: What is the capital of France?"

    s += sgl.assistant_begin()
    s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
    s += sgl.assistant_end()
```
### Tips and Implementation Details
- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability (see the sketch after this list).
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
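The snippet below is a toy illustration of the token-length normalization idea behind `choices`, not SGLang's actual implementation; the log-probability numbers are made up.
```python
def pick_choice(choice_token_logprobs):
    """Pick the choice with the highest average (length-normalized) log probability."""

    def normalized(logprobs):
        # Average log probability per token, so longer choices are not penalized
        # simply for containing more tokens.
        return sum(logprobs) / len(logprobs)

    return max(choice_token_logprobs, key=lambda c: normalized(choice_token_logprobs[c]))


# Hypothetical per-token log probabilities for two candidate continuations.
scores = {
    "calculator": [-1.2, -0.7],           # sum = -1.9, mean = -0.95
    "search engine": [-0.9, -0.8, -0.6],  # sum = -2.3, mean ~ -0.77
}

# A raw sum would favor "calculator"; the normalized mean favors "search engine".
print(pick_choice(scores))
```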
import os
import weakref

from sglang.utils import execute_shell_command, reserve_port

DEFAULT_MAX_RUNNING_REQUESTS = 200
DEFAULT_MAX_TOTAL_TOKENS = 20480

import sglang.srt.server_args as server_args_mod

# Patch ServerArgs so servers launched from the docs use smaller limits and skip CUDA graph capture.
_original_post_init = server_args_mod.ServerArgs.__post_init__


def patched_post_init(self):
    _original_post_init(self)
    if self.max_running_requests is None:
        self.max_running_requests = DEFAULT_MAX_RUNNING_REQUESTS
    if self.max_total_tokens is None:
        self.max_total_tokens = DEFAULT_MAX_TOTAL_TOKENS
    self.disable_cuda_graph = True


server_args_mod.ServerArgs.__post_init__ = patched_post_init

# Keep each launched server process associated with the socket that reserves its port.
process_socket_map = weakref.WeakKeyDictionary()


def launch_server_cmd(command: str, host: str = "0.0.0.0", port: int = None):
    """
    Launch the server using the given command.
    If no port is specified, a free port is reserved.
    """
    if port is None:
        port, lock_socket = reserve_port(host)
    else:
        lock_socket = None

    extra_flags = (
        f"--max-running-requests {DEFAULT_MAX_RUNNING_REQUESTS} "
        f"--max-total-tokens {DEFAULT_MAX_TOTAL_TOKENS} "
        f"--disable-cuda-graph"
    )

    full_command = f"{command} --port {port} {extra_flags}"
    process = execute_shell_command(full_command)

    if lock_socket is not None:
        process_socket_map[process] = lock_socket

    return process, port
...@@ -30,7 +30,6 @@ The core features include:
backend/sampling_params.md
backend/hyperparameter_tuning.md
.. toctree::
:maxdepth: 1
:caption: Advanced Features
...@@ -45,7 +44,7 @@ The core features include:
:maxdepth: 1
:caption: Frontend Tutorial
frontend/frontend.md
frontend/frontend.ipynb
frontend/choices_methods.md
.. toctree::
......
"""
Usage:
python hidden_states.py
Note that we are actively working on moving return_hidden_states to the sampling_params.
"""
import sglang as sgl


def main():
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create an LLM.
    llm = sgl.Engine(
        model_path="Alibaba-NLP/gte-Qwen2-1.5B-instruct",
        return_hidden_states=True,
    )

    sampling_params = {"temperature": 0.8, "top_p": 0.95, "max_new_tokens": 10}

    outputs = llm.generate(prompts, sampling_params=sampling_params)
    for prompt, output in zip(prompts, outputs):
        print("===============================")
        print(
            f"Prompt: {prompt}\nGenerated text: {output['text']}\nPrompt_Tokens: {output['meta_info']['prompt_tokens']}\tCompletion_tokens: {output['meta_info']['completion_tokens']}\nHidden states: {[i.shape for i in output['meta_info']['hidden_states']]}"
        )
        print()


# The __main__ condition is necessary here because we use "spawn" to create subprocesses.
# Spawn starts a fresh program every time; without the __main__ guard, sgl.Engine would
# keep spawning processes in an infinite loop.
if __name__ == "__main__":
    main()
...@@ -5,56 +5,45 @@ python offline_batch_inference_vlm.py --model-path Qwen/Qwen2-VL-7B-Instruct --c
import argparse
import dataclasses
from transformers import AutoProcessor
import io
import os
import requests
from PIL import Image
import sglang as sgl
from sglang.srt.openai_api.adapter import v1_chat_generate_request
from sglang.srt.openai_api.protocol import ChatCompletionRequest
from sglang.srt.conversation import chat_templates
from sglang.srt.server_args import ServerArgs


def main(
    server_args: ServerArgs,
):
    # Create an LLM.
    vlm = sgl.Engine(**dataclasses.asdict(server_args))

    # prepare prompts.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What’s in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true",
                    },
                },
            ],
        }
    ]
    chat_request = ChatCompletionRequest(
        messages=messages,
        model=server_args.model_path,
        temperature=0.8,
        top_p=0.95,
    )
    gen_request, _ = v1_chat_generate_request(
        [chat_request],
        vlm.tokenizer_manager,
    )
    outputs = vlm.generate(
        input_ids=gen_request.input_ids,
        image_data=gen_request.image_data,
        sampling_params=gen_request.sampling_params,
    conv = chat_templates[server_args.chat_template].copy()
    image_token = conv.image_token
    image_url = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
    prompt = f"What's in this image?\n{image_token}"
    sampling_params = {
        "temperature": 0.001,
        "max_new_tokens": 30,
    }
    output = vlm.generate(
        prompt=prompt,
        image_data=image_url,
        sampling_params=sampling_params,
    )
    print("===============================")
    print(f"Prompt: {messages[0]['content'][0]['text']}")
    print(f"Generated text: {outputs['text']}")
    print(f"Prompt: {prompt}")
    print(f"Generated text: {output['text']}")
    vlm.shutdown()
# The __main__ condition is necessary here because we use "spawn" to create subprocesses
...@@ -63,5 +52,6 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)
    args = parser.parse_args()
    server_args = ServerArgs.from_cli_args(args)
    main(server_args)