Unverified Commit 72e979bf authored by Chayenne, committed by GitHub

add native api docs (#1883)


Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
parent 146f6134
......@@ -30,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-01T02:47:32.337369Z",
......@@ -39,59 +39,7 @@
"shell.execute_reply": "2024-11-01T02:47:59.539861Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"[2024-10-31 22:40:37] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=309155486, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"[2024-10-31 22:40:42 TP0] Init torch distributed begin.\n",
"[2024-10-31 22:40:43 TP0] Load weight begin. avail mem=47.27 GB\n",
"[2024-10-31 22:40:43 TP0] lm_eval is not installed, GPTQ may not be usable\n",
"INFO 10-31 22:40:44 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:00<00:03, 1.97it/s]\n",
"Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:01<00:03, 1.40it/s]\n",
"Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:02<00:03, 1.11it/s]\n",
"Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:03<00:03, 1.00s/it]\n",
"Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:04<00:02, 1.07s/it]\n",
"Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:05<00:01, 1.10s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:07<00:00, 1.12s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:07<00:00, 1.02s/it]\n",
"\n",
"[2024-10-31 22:40:51 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=32.91 GB\n",
"[2024-10-31 22:40:51 TP0] Memory pool end. avail mem=4.56 GB\n",
"[2024-10-31 22:40:52 TP0] max_total_num_tokens=509971, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-10-31 22:40:52] INFO: Started server process [1752367]\n",
"[2024-10-31 22:40:52] INFO: Waiting for application startup.\n",
"[2024-10-31 22:40:52] INFO: Application startup complete.\n",
"[2024-10-31 22:40:52] INFO: Uvicorn running on http://0.0.0.0:30010 (Press CTRL+C to quit)\n",
"[2024-10-31 22:40:52] INFO: 127.0.0.1:41676 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
"[2024-10-31 22:40:53] INFO: 127.0.0.1:41678 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-10-31 22:40:53 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-31 22:40:54] INFO: 127.0.0.1:41684 - \"POST /encode HTTP/1.1\" 200 OK\n",
"[2024-10-31 22:40:54] The server is fired up and ready to roll!\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><br><br> NOTE: Typically, the server runs in a separate terminal.<br> In this notebook, we run the server and notebook code together, so their outputs are combined.<br> To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br> </strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
......@@ -119,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-01T02:47:59.543958Z",
......@@ -128,28 +76,7 @@
"shell.execute_reply": "2024-11-01T02:47:59.590809Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 22:40:57 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-31 22:40:57] INFO: 127.0.0.1:51746 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import subprocess, json\n",
"\n",
......@@ -176,7 +103,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-01T02:47:59.594229Z",
......@@ -185,28 +112,7 @@
"shell.execute_reply": "2024-11-01T02:48:00.005255Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 22:40:58 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-31 22:40:58] INFO: 127.0.0.1:51750 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import openai\n",
"\n",
......@@ -233,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2024-11-01T02:48:00.008858Z",
......@@ -242,36 +148,7 @@
"shell.execute_reply": "2024-11-01T02:48:01.871573Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 22:41:00 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-31 22:41:00] INFO: 127.0.0.1:51762 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import json\n",
"import os\n",
......
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Native Server API\n",
"\n",
"Apart from the OpenAI-compatible API, the SGLang Runtime also provides its native server API. We introduce the following APIs:\n",
"\n",
"- `/generate`\n",
"- `/update_weights`\n",
"- `/get_server_args`\n",
"- `/get_model_info`\n",
"- `/health`\n",
"- `/health_generate`\n",
"- `/flush_cache`\n",
"- `/get_memory_pool_size`\n",
"\n",
"We mainly use `requests` to test these APIs in the following examples. You can also use `curl`; a small curl-based sketch appears in the health-check section below."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Launch A Server"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"import subprocess, json\n",
"\n",
"server_process = execute_shell_command(\n",
"\"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-1B-Instruct --port=30010\n",
"\"\"\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30010\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate\n",
"\n",
"Used to generate completions from the model, similar to the `/v1/completions` API in OpenAI. Detailed parameters can be found in the [sampling parameters](https://sgl-project.github.io/references/sampling_params.html) documentation; a variant with explicit sampling parameters is sketched after the basic example below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"url = \"http://localhost:30010/generate\"\n",
"data = {\"text\": \"List 3 countries and their capitals.\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"print_highlight(response.text)"
]
},
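{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is a minimal sketch of the same request with an explicit `sampling_params` object. The field names used here (`max_new_tokens`, `temperature`) are taken from the sampling parameters documentation linked above; adjust them as needed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a minimal sketch: /generate with explicit sampling parameters\n",
"# max_new_tokens and temperature follow the sampling parameters docs linked above\n",
"\n",
"url = \"http://localhost:30010/generate\"\n",
"data = {\n",
"    \"text\": \"List 3 countries and their capitals.\",\n",
"    \"sampling_params\": {\"max_new_tokens\": 64, \"temperature\": 0},\n",
"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"print_highlight(response.text)"
]
},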
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Server Args\n",
"\n",
"Used to get the arguments the server was launched with."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/get_server_args\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.json())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Model Info\n",
"\n",
"Used to get the model info.\n",
"\n",
"- `model_path`: The path/name of the model.\n",
"- `is_generation`: Whether the model is used as a generation model or an embedding model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/get_model_info\"\n",
"\n",
"response = requests.get(url)\n",
"response_json = response.json()\n",
"print_highlight(response_json)\n",
"assert response_json[\"model_path\"] == \"meta-llama/Llama-3.2-1B-Instruct\"\n",
"assert response_json[\"is_generation\"] == True\n",
"assert response_json.keys() == {\"model_path\", \"is_generation\"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Health and Health Generate\n",
"\n",
"- `/health`: Check the health of the server.\n",
"- `/health_generate`: Check the health of the server by generating one token."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/health_generate\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:30010/health\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
]
},
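{
"cell_type": "markdown",
"metadata": {},
"source": [
"As noted in the introduction, the native endpoints can also be exercised with `curl`. Below is a small sketch that checks `/health` by shelling out to `curl` through `subprocess`; it assumes `curl` is installed and the server launched above is still running on port 30010. The `-i` flag includes the status line in the output, since `/health` returns an empty body."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a small sketch: check /health with curl instead of requests\n",
"# -i includes the status line, since /health returns an empty body\n",
"\n",
"result = subprocess.run(\n",
"    [\"curl\", \"-s\", \"-i\", \"http://localhost:30010/health\"],\n",
"    capture_output=True,\n",
"    text=True,\n",
")\n",
"print_highlight(result.stdout)"
]
},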
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Flush Cache\n",
"\n",
"Used to flush the radix cache. It will be automatically triggered when the model weights are updated by the `/update_weights` API."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# flush cache\n",
"\n",
"url = \"http://localhost:30010/flush_cache\"\n",
"\n",
"response = requests.post(url)\n",
"print_highlight(response.text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get Memory Pool Size\n",
"\n",
"Get the memory pool size in number of tokens.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get_memory_pool_size\n",
"\n",
"url = \"http://localhost:30010/get_memory_pool_size\"\n",
"\n",
"response = requests.get(url)\n",
"print_highlight(response.text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Update Weights\n",
"\n",
"Update model weights without restarting the server. This is useful for continuous evaluation during training. It only applies to models with the same architecture and parameter size."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# successful update with same architecture and size\n",
"\n",
"url = \"http://localhost:30010/update_weights\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"print_highlight(response.text)\n",
"assert response.json()[\"success\"] == True\n",
"assert response.json()[\"message\"] == \"Succeeded to update model weights.\"\n",
"assert response.json().keys() == {\"success\", \"message\"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# failed update with different parameter size\n",
"\n",
"url = \"http://localhost:30010/update_weights\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-3B\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"response_json = response.json()\n",
"print_highlight(response_json)\n",
"assert response_json[\"success\"] == False\n",
"assert response_json[\"message\"] == (\n",
" \"Failed to update weights: The size of tensor a (2048) must match \"\n",
" \"the size of tensor b (3072) at non-singleton dimension 1.\\n\"\n",
" \"Rolling back to original weights.\"\n",
")"
]
},
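{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hypothetical follow-up sketch: the failure above rolls back to the previously loaded weights, so we can issue another `/update_weights` call with the original checkpoint to return to the starting state. This assumes `meta-llama/Llama-3.2-1B-Instruct` is still available to the server."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# hypothetical sketch: re-load the original checkpoint after the failed update\n",
"\n",
"url = \"http://localhost:30010/update_weights\"\n",
"data = {\"model_path\": \"meta-llama/Llama-3.2-1B-Instruct\"}\n",
"\n",
"response = requests.post(url, json=data)\n",
"print_highlight(response.text)\n",
"assert response.json()[\"success\"] == True"
]
},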
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(server_process)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "AlphaMeemory",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
......@@ -38,55 +38,7 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-11-02 00:06:33.051950: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2024-11-02 00:06:33.063961: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2024-11-02 00:06:33.063983: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2024-11-02 00:06:33.581526: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"[2024-11-02 00:06:41] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=73322355, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
"[2024-11-02 00:06:51 TP0] Init torch distributed begin.\n",
"[2024-11-02 00:06:54 TP0] Load weight begin. avail mem=76.83 GB\n",
"[2024-11-02 00:06:54 TP0] lm_eval is not installed, GPTQ may not be usable\n",
"INFO 11-02 00:06:54 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:03, 1.22s/it]\n",
"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:02<00:02, 1.24s/it]\n",
"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:03<00:01, 1.29s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00, 1.06it/s]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:04<00:00, 1.06s/it]\n",
"\n",
"[2024-11-02 00:06:59 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
"[2024-11-02 00:06:59 TP0] Memory pool end. avail mem=8.19 GB\n",
"[2024-11-02 00:07:00 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
"[2024-11-02 00:07:09 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-11-02 00:07:09] INFO: Started server process [102482]\n",
"[2024-11-02 00:07:09] INFO: Waiting for application startup.\n",
"[2024-11-02 00:07:09] INFO: Application startup complete.\n",
"[2024-11-02 00:07:09] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
"[2024-11-02 00:07:09] INFO: 127.0.0.1:33692 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:07:10] INFO: 127.0.0.1:33702 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:07:10 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:07:10] INFO: 127.0.0.1:33710 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:07:10] The server is fired up and ready to roll!\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><br><br> NOTE: Typically, the server runs in a separate terminal.<br> In this notebook, we run the server and notebook code together, so their outputs are combined.<br> To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br> </strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
......@@ -96,7 +48,7 @@
")\n",
"\n",
"server_process = execute_shell_command(\n",
"\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
" \"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")"
......@@ -126,29 +78,7 @@
"shell.execute_reply": "2024-11-01T02:45:18.086450Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:08:04 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:08:04 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 0.72, #queue-req: 0\n",
"[2024-11-02 00:08:04] INFO: 127.0.0.1:51178 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: ChatCompletion(id='bb74a7e9fcae4df7af2ee59e25aa75a5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730506084, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import openai\n",
"\n",
......@@ -189,31 +119,7 @@
"shell.execute_reply": "2024-11-01T02:45:21.192539Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:08:08 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:08:08 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 9.89, #queue-req: 0\n",
"[2024-11-02 00:08:09 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 132.64, #queue-req: 0\n",
"[2024-11-02 00:08:09 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 132.28, #queue-req: 0\n",
"[2024-11-02 00:08:09] INFO: 127.0.0.1:51178 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Ancient Rome's major achievements include:<br><br>1. **Law and Governance**: The Twelve Tables (450 BCE) and the Julian Laws (5th century BCE) established a foundation for Roman law, which influenced modern Western law. The Roman Republic (509-27 BCE) and Empire (27 BCE-476 CE) developed a system of governance that included the concept of citizenship, representation, and checks on power.<br><br>2. **Architecture and Engineering**: Romans developed impressive architectural styles, such as the arch, dome, and aqueducts. Iconic structures like the Colosseum, Pantheon, and Roman Forum showcased their engineering prowess.<br><br></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
......@@ -259,17 +165,7 @@
"shell.execute_reply": "2024-11-01T02:45:21.675050Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:08:19] INFO: 127.0.0.1:44218 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:08:19 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"This is only a test."
]
}
],
"outputs": [],
"source": [
"stream = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
......@@ -303,30 +199,7 @@
"shell.execute_reply": "2024-11-01T02:45:23.181695Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:08:25 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 24, token usage: 0.00, gen throughput (token/s): 2.45, #queue-req: 0\n",
"[2024-11-02 00:08:25 TP0] Decode batch. #running-req: 1, #token: 64, token usage: 0.00, gen throughput (token/s): 142.10, #queue-req: 0\n",
"[2024-11-02 00:08:26] INFO: 127.0.0.1:37290 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: Completion(id='25412696fce14364b40430b5671fc11e', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730506106, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"response = client.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
......@@ -362,62 +235,7 @@
"shell.execute_reply": "2024-11-01T02:45:26.769299Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:23 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:23 TP0] Decode batch. #running-req: 1, #token: 29, token usage: 0.00, gen throughput (token/s): 40.76, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:24 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:25 TP0] Decode batch. #running-req: 1, #token: 109, token usage: 0.00, gen throughput (token/s): 42.01, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:26 TP0] Decode batch. #running-req: 1, #token: 149, token usage: 0.00, gen throughput (token/s): 41.87, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:26] INFO: 127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: Completion(id='fe384c17aece4a5ca5fb5238dcd1adec', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" This can be a sci-fi story, and you have the ability to create a unique and imaginative universe.\\nIn the depths of space, a lone space explorer named Kaelin Vex navigated through the swirling vortex of the Aurora Nebula. Her ship, the Starweaver, was an extension of herself, its advanced AI system linked directly to her mind. Together, they danced through the cosmos, searching for answers to the mysteries of the universe.\\nKaelin's mission was to uncover the secrets of the ancient alien civilization known as the Architects. Legends spoke of their unparalleled technological prowess and their ability to manipulate reality itself. Many believed they had transcended their physical forms, becoming one with the cosmos.\\nAs Kaelin delved deeper into\", matched_stop=None)], created=1730429126, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"response = client.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
......@@ -463,29 +281,7 @@
"shell.execute_reply": "2024-11-01T02:45:26.793811Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
"[2024-10-31 19:45:26 TP0] Prefill batch. #new-seq: 2, #new-token: 20, #cached-token: 60, cache hit rate: 42.80%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job created with ID: batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import json\n",
"import time\n",
......@@ -547,93 +343,7 @@
"shell.execute_reply": "2024-11-01T02:45:29.810041Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:27 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 51.72, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Batch job status: validating...trying again in 3 seconds...\n",
"[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/batches/batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4 HTTP/1.1\" 200 OK\n",
"Batch job completed successfully!\n",
"Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
"[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b/content HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Request request-1:</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Request request-2:</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Cleaning up files...</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"DELETE /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b HTTP/1.1\" 200 OK\n"
]
}
],
"outputs": [],
"source": [
"while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n",
" time.sleep(3)\n",
......@@ -688,287 +398,7 @@
"shell.execute_reply": "2024-11-01T02:45:54.850668Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Created batch job with ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Initial status: validating</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 27, #new-token: 810, #cached-token: 675, cache hit rate: 45.05%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 73, #new-token: 2190, #cached-token: 1825, cache hit rate: 45.33%, token usage: 0.00, #running-req: 27, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:30 TP0] Decode batch. #running-req: 100, #token: 5125, token usage: 0.02, gen throughput (token/s): 636.38, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:31 TP0] Decode batch. #running-req: 100, #token: 9125, token usage: 0.04, gen throughput (token/s): 3507.97, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:33 TP0] Decode batch. #running-req: 100, #token: 13125, token usage: 0.06, gen throughput (token/s): 3417.06, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:34 TP0] Decode batch. #running-req: 100, #token: 17125, token usage: 0.08, gen throughput (token/s): 3332.03, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:35 TP0] Decode batch. #running-req: 100, #token: 21125, token usage: 0.10, gen throughput (token/s): 3252.29, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:36 TP0] Decode batch. #running-req: 100, #token: 25125, token usage: 0.12, gen throughput (token/s): 3173.87, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:38 TP0] Decode batch. #running-req: 100, #token: 29125, token usage: 0.13, gen throughput (token/s): 3101.31, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:39 TP0] Decode batch. #running-req: 100, #token: 33125, token usage: 0.15, gen throughput (token/s): 3030.90, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:39] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 1 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:40 TP0] Decode batch. #running-req: 100, #token: 37125, token usage: 0.17, gen throughput (token/s): 2961.37, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:42 TP0] Decode batch. #running-req: 100, #token: 41125, token usage: 0.19, gen throughput (token/s): 2899.29, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:42] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 2 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:43 TP0] Decode batch. #running-req: 100, #token: 45125, token usage: 0.21, gen throughput (token/s): 2836.50, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:45 TP0] Decode batch. #running-req: 100, #token: 49125, token usage: 0.23, gen throughput (token/s): 2777.80, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:45] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 3 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:48] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 4 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:51] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job details (check 5 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import json\n",
"import time\n",
......@@ -1051,163 +481,7 @@
"shell.execute_reply": "2024-11-01T02:46:07.892310Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Created batch job with ID: batch_c30756c3-8c09-4142-9630-9590d6124986</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Initial status: validating</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:54 TP0] Prefill batch. #new-seq: 135, #new-token: 1150, #cached-token: 6275, cache hit rate: 67.38%, token usage: 0.01, #running-req: 0, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:55 TP0] Prefill batch. #new-seq: 274, #new-token: 8192, #cached-token: 6850, cache hit rate: 55.74%, token usage: 0.02, #running-req: 135, #queue-req: 91\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:56 TP0] Prefill batch. #new-seq: 92, #new-token: 2758, #cached-token: 2302, cache hit rate: 54.19%, token usage: 0.06, #running-req: 408, #queue-req: 1\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:56 TP0] Decode batch. #running-req: 500, #token: 16025, token usage: 0.07, gen throughput (token/s): 409.21, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:00 TP0] Decode batch. #running-req: 500, #token: 36025, token usage: 0.17, gen throughput (token/s): 5777.09, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:03 TP0] Decode batch. #running-req: 500, #token: 56025, token usage: 0.26, gen throughput (token/s): 5530.76, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:04] INFO: 127.0.0.1:57728 - \"POST /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986/cancel HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Cancellation initiated. Status: cancelling</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"GET /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Current status: cancelled</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Batch job successfully cancelled</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"DELETE /v1/files/backend_input_file-0fbf83a7-301c-488e-a221-b702e24df6a5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Successfully cleaned up input file</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Successfully deleted local batch_requests.jsonl file</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import json\n",
"import time\n",
......
......@@ -38,58 +38,7 @@
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-11-02 00:24:10.542705: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2024-11-02 00:24:10.554725: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2024-11-02 00:24:10.554758: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2024-11-02 00:24:11.063662: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"[2024-11-02 00:24:19] server_args=ServerArgs(model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_path='meta-llama/Llama-3.2-11B-Vision-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Llama-3.2-11B-Vision-Instruct', chat_template='llama_3_vision', is_embedding=False, host='127.0.0.1', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=553831757, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
"[2024-11-02 00:24:20] Use chat template for the OpenAI-compatible API server: llama_3_vision\n",
"[2024-11-02 00:24:29 TP0] Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models.\n",
"[2024-11-02 00:24:29 TP0] Init torch distributed begin.\n",
"[2024-11-02 00:24:32 TP0] Load weight begin. avail mem=76.83 GB\n",
"[2024-11-02 00:24:32 TP0] lm_eval is not installed, GPTQ may not be usable\n",
"INFO 11-02 00:24:32 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 20% Completed | 1/5 [00:00<00:02, 1.61it/s]\n",
"Loading safetensors checkpoint shards: 40% Completed | 2/5 [00:02<00:04, 1.35s/it]\n",
"Loading safetensors checkpoint shards: 60% Completed | 3/5 [00:04<00:03, 1.58s/it]\n",
"Loading safetensors checkpoint shards: 80% Completed | 4/5 [00:06<00:01, 1.70s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:08<00:00, 1.76s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 5/5 [00:08<00:00, 1.62s/it]\n",
"\n",
"[2024-11-02 00:24:41 TP0] Load weight end. type=MllamaForConditionalGeneration, dtype=torch.bfloat16, avail mem=56.75 GB\n",
"[2024-11-02 00:24:41 TP0] Memory pool end. avail mem=11.53 GB\n",
"[2024-11-02 00:24:42 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
"[2024-11-02 00:24:52 TP0] max_total_num_tokens=289349, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-11-02 00:24:52] INFO: Started server process [108249]\n",
"[2024-11-02 00:24:52] INFO: Waiting for application startup.\n",
"[2024-11-02 00:24:52] INFO: Application startup complete.\n",
"[2024-11-02 00:24:52] INFO: Uvicorn running on http://127.0.0.1:30010 (Press CTRL+C to quit)\n",
"[2024-11-02 00:24:53] INFO: 127.0.0.1:43056 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:24:53] INFO: 127.0.0.1:43072 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:24:53 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:24:53] INFO: 127.0.0.1:43086 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:24:53] The server is fired up and ready to roll!\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><br><br> NOTE: Typically, the server runs in a separate terminal.<br> In this notebook, we run the server and notebook code together, so their outputs are combined.<br> To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br> </strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
......@@ -99,7 +48,7 @@
")\n",
"\n",
"embedding_process = execute_shell_command(\n",
"\"\"\"\n",
" \"\"\"\n",
"python3 -m sglang.launch_server --model-path meta-llama/Llama-3.2-11B-Vision-Instruct \\\n",
" --port=30010 --chat-template=llama_3_vision\n",
"\"\"\"\n",
......@@ -121,44 +70,7 @@
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 485 0 0 100 485 0 2420 --:--:-- --:--:-- --:--:-- 2412"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:26:23 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 6462, cache hit rate: 49.97%, token usage: 0.02, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:26:24] INFO: 127.0.0.1:39828 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100 965 100 480 100 485 789 797 --:--:-- --:--:-- --:--:-- 1584\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>{\"id\":\"5e9e1c80809f492a926a2634c3d162d0\",\"object\":\"chat.completion\",\"created\":1730507184,\"model\":\"meta-llama/Llama-3.2-11B-Vision-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"The image depicts a man ironing clothes on an ironing board that is placed on the back of a yellow taxi cab.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":6463,\"total_tokens\":6489,\"completion_tokens\":26,\"prompt_tokens_details\":null}}</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import subprocess\n",
"\n",
......@@ -206,29 +118,7 @@
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:26:33 TP0] Prefill batch. #new-seq: 1, #new-token: 11, #cached-token: 6452, cache hit rate: 66.58%, token usage: 0.02, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:26:34 TP0] Decode batch. #running-req: 1, #token: 6477, token usage: 0.02, gen throughput (token/s): 0.77, #queue-req: 0\n",
"[2024-11-02 00:26:34] INFO: 127.0.0.1:43258 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>The image shows a man ironing clothes on the back of a yellow taxi cab.</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from openai import OpenAI\n",
"\n",
......@@ -246,7 +136,9 @@
" },\n",
" {\n",
" \"type\": \"image_url\",\n",
" \"image_url\": {\"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"},\n",
" \"image_url\": {\n",
" \"url\": \"https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true\"\n",
" },\n",
" },\n",
" ],\n",
" }\n",
......@@ -270,30 +162,7 @@
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:20:30 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 12894, cache hit rate: 83.27%, token usage: 0.04, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:20:30 TP0] Decode batch. #running-req: 1, #token: 12903, token usage: 0.04, gen throughput (token/s): 2.02, #queue-req: 0\n",
"[2024-11-02 00:20:30 TP0] Decode batch. #running-req: 1, #token: 12943, token usage: 0.04, gen throughput (token/s): 105.52, #queue-req: 0\n",
"[2024-11-02 00:20:30] INFO: 127.0.0.1:41386 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>The first image shows a man in a yellow shirt ironing a shirt on the back of a yellow taxi cab, with a red line connecting the two objects. The second image shows a large orange \"S\" and \"G\" on a white background, with a red line connecting them.</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from openai import OpenAI\n",
"\n",
......@@ -320,7 +189,7 @@
" {\n",
" \"type\": \"text\",\n",
" \"text\": \"I have two very different images. They are not related at all. \"\n",
" \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n",
" \"Please describe the first image in one sentence, and then describe the second image in another sentence.\",\n",
" },\n",
" ],\n",
" }\n",
......
......@@ -25,6 +25,7 @@ The core features include:
backend/openai_api_completions.ipynb
backend/openai_api_vision.ipynb
backend/native_api.ipynb
backend/backend.md
......
......@@ -36,55 +36,7 @@
"shell.execute_reply": "2024-11-01T02:46:42.809147Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024-11-02 00:27:25.383621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"2024-11-02 00:27:25.396224: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"2024-11-02 00:27:25.396257: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2024-11-02 00:27:25.922262: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
"[2024-11-02 00:27:34] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=259802610, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
"[2024-11-02 00:27:43 TP0] Init torch distributed begin.\n",
"[2024-11-02 00:27:48 TP0] Load weight begin. avail mem=76.83 GB\n",
"[2024-11-02 00:27:48 TP0] lm_eval is not installed, GPTQ may not be usable\n",
"INFO 11-02 00:27:49 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:05, 1.77s/it]\n",
"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.78s/it]\n",
"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:05<00:01, 1.77s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.27s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.46s/it]\n",
"\n",
"[2024-11-02 00:27:55 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
"[2024-11-02 00:27:55 TP0] Memory pool end. avail mem=8.19 GB\n",
"[2024-11-02 00:27:56 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
"[2024-11-02 00:28:05 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-11-02 00:28:05] INFO: Started server process [109933]\n",
"[2024-11-02 00:28:05] INFO: Waiting for application startup.\n",
"[2024-11-02 00:28:05] INFO: Application startup complete.\n",
"[2024-11-02 00:28:05] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
"[2024-11-02 00:28:06] INFO: 127.0.0.1:59502 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:28:06] INFO: 127.0.0.1:59516 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:28:06 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:28:06] INFO: 127.0.0.1:59522 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-11-02 00:28:06] The server is fired up and ready to roll!\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><br><br> NOTE: Typically, the server runs in a separate terminal.<br> In this notebook, we run the server and notebook code together, so their outputs are combined.<br> To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br> </strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
......@@ -123,45 +75,7 @@
"shell.execute_reply": "2024-11-01T02:46:51.435965Z"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 278 0 0 100 278 0 1387 --:--:-- --:--:-- --:--:-- 1383"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:28:48 TP0] Prefill batch. #new-seq: 1, #new-token: 11, #cached-token: 42, cache hit rate: 40.19%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:28:48 TP0] Decode batch. #running-req: 1, #token: 75, token usage: 0.00, gen throughput (token/s): 1.46, #queue-req: 0\n",
"[2024-11-02 00:28:49] INFO: 127.0.0.1:53714 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100 871 100 593 100 278 1788 838 --:--:-- --:--:-- --:--:-- 2623\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>{\"id\":\"a0714277fab546c5b6d91724aa3e27a3\",\"object\":\"chat.completion\",\"created\":1730507329,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"An LLM, or Large Language Model, is a type of artificial intelligence (AI) designed to process and generate human-like language, often used in applications such as chatbots, virtual assistants, and language translation software.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":53,\"total_tokens\":98,\"completion_tokens\":45,\"prompt_tokens_details\":null}}</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import subprocess\n",
"\n",
......@@ -209,29 +123,7 @@
"shell.execute_reply": "2024-11-01T02:46:52.895318Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:03:52 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:03:52 TP0] Decode batch. #running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 11.33, #queue-req: 0\n",
"[2024-11-02 00:03:53] INFO: 127.0.0.1:57008 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>ChatCompletion(id='a6590143c40f4732a5c57d4c91b43f05', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730505833, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import openai\n",
"\n",
......@@ -264,29 +156,7 @@
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-11-02 00:05:04 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 33.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-11-02 00:05:04 TP0] Decode batch. #running-req: 1, #token: 26, token usage: 0.00, gen throughput (token/s): 3.10, #queue-req: 0\n",
"[2024-11-02 00:05:04] INFO: 127.0.0.1:60536 - \"POST /generate HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>{'text': ' a city of romance, art, fashion, and history. Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 5, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'd882513c180d4c5981488257ccab4b9f'}, 'index': 0}</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"import requests\n",
"\n",
......
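The final cell uses requests against SGLang's native /generate endpoint instead of the OpenAI-compatible route. The logged result (32 completion tokens continuing a prompt about the capital of France) suggests a payload like the sketch below; the prompt text and sampling parameters are inferred rather than copied from the truncated cell.

    import requests

    # Native generation endpoint: a raw text prompt plus sampling_params.
    response = requests.post(
        "http://localhost:30000/generate",
        json={
            "text": "The capital of France is",
            "sampling_params": {"temperature": 0, "max_new_tokens": 32},
        },
    )
    print(response.json())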