Imporve openai api documents (#1827)

Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>

Imporve openai api documents (#1827)
Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
539df95d · Chayenne · GitHub · 5e00ddeb · 539df95d · 539df95d
Unverified Commit 539df95d authored Oct 30, 2024 by Chayenne Committed by GitHub Oct 30, 2024
8 changed files
--- a/docs/README.md
+++ b/docs/README.md
@@ -20,6 +20,7 @@ make clean
 ### Serve (preview)
 Run an HTTP server and visit http://localhost:8000 in your browser.
 ```
 python3 -m http.server --d _build/html
 ```

--- a/docs/_static/css/custom_log.css
+++ b/docs/_static/css/custom_log.css
+.output_area {
+    color: #615656;
+}
+table.autosummary td {
+    width: 50%
+  }
+  img.align-center {
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+}
+.output_area.stderr {
+    color: #d3d3d3 !important;
+}
+.output_area.stdout {
+    color: #d3d3d3 !important;
+}
+div.output_area.stderr {
+    color: #d3d3d3 !important;
+}
+div.output_area.stdout {
+    color: #d3d3d3 !important;
+}
\ No newline at end of file
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -70,7 +70,10 @@ html_theme_options = {
 }
 html_static_path = ["_static"]
-html_css_files = ["css/readthedocs.css"]
+html_css_files = ["css/custom_log.css"]
+def setup(app):
+    app.add_css_file('css/custom_log.css')
 myst_enable_extensions = [
    "dollarmath",
@@ -127,3 +130,14 @@ intersphinx_mapping = {
 }
 html_theme = "sphinx_book_theme"
+nbsphinx_prolog = """
+.. raw:: html
+    <style>
+        .output_area.stderr, .output_area.stdout {
+            color: #d3d3d3 !important; /* light gray */
+        }
+    </style>
+"""
\ No newline at end of file
--- a/docs/embedding_model.ipynb
+++ b/docs/embedding_model.ipynb
@@ -21,7 +21,7 @@
    "The following code is equivalent to running this in the shell:\n",
    "```bash\n",
    "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
-    "    --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
+    "    --port 30010 --host 0.0.0.0 --is-embedding\n",
    "```\n",
    "\n",
    "Remember to add `--is-embedding` to the command."
@@ -29,30 +29,83 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Embedding server is ready. Proceeding with the next steps.\n"
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "[2024-10-29 21:07:15] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=568040040, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "[2024-10-29 21:07:20 TP0] Init torch distributed begin.\n",
+      "[2024-10-29 21:07:20 TP0] Load weight begin. avail mem=47.27 GB\n",
+      "[2024-10-29 21:07:21 TP0] lm_eval is not installed, GPTQ may not be usable\n",
+      "INFO 10-29 21:07:22 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
+      "Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]\n",
+      "Loading safetensors checkpoint shards:  14% Completed | 1/7 [00:00<00:03,  1.65it/s]\n",
+      "Loading safetensors checkpoint shards:  29% Completed | 2/7 [00:01<00:04,  1.02it/s]\n",
+      "Loading safetensors checkpoint shards:  43% Completed | 3/7 [00:03<00:04,  1.24s/it]\n",
+      "Loading safetensors checkpoint shards:  57% Completed | 4/7 [00:05<00:04,  1.47s/it]\n",
+      "Loading safetensors checkpoint shards:  71% Completed | 5/7 [00:07<00:03,  1.62s/it]\n",
+      "Loading safetensors checkpoint shards:  86% Completed | 6/7 [00:08<00:01,  1.64s/it]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:10<00:00,  1.63s/it]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:10<00:00,  1.49s/it]\n",
+      "\n",
+      "[2024-10-29 21:07:32 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=32.91 GB\n",
+      "[2024-10-29 21:07:33 TP0] Memory pool end. avail mem=4.56 GB\n",
+      "[2024-10-29 21:07:33 TP0] max_total_num_tokens=509971, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
+      "[2024-10-29 21:07:33] INFO:     Started server process [2650986]\n",
+      "[2024-10-29 21:07:33] INFO:     Waiting for application startup.\n",
+      "[2024-10-29 21:07:33] INFO:     Application startup complete.\n",
+      "[2024-10-29 21:07:33] INFO:     Uvicorn running on http://0.0.0.0:30010 (Press CTRL+C to quit)\n",
+      "[2024-10-29 21:07:34] INFO:     127.0.0.1:47812 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><br>                            This cell combines server and notebook output. <br>                            <br>                            Typically, the server runs in a separate terminal, <br>                            but we combine the output of server and notebook to demonstrate the usage better.<br>                            <br>                            In our documentation, server output is in gray, notebook output is highlighted.<br>                            </strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-29 21:07:34] INFO:     127.0.0.1:41780 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+      "[2024-10-29 21:07:34 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-29 21:07:35] INFO:     127.0.0.1:41792 - \"POST /encode HTTP/1.1\" 200 OK\n",
+      "[2024-10-29 21:07:35] The server is fired up and ready to roll!\n"
     ]
    }
   ],
   "source": [
-    "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
+    "from sglang.utils import (\n",
+    "    execute_shell_command,\n",
+    "    wait_for_server,\n",
+    "    terminate_process,\n",
+    "    print_highlight,\n",
+    ")\n",
    "\n",
    "embedding_process = execute_shell_command(\n",
    "    \"\"\"\n",
    "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
-    "    --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
+    "    --port 30010 --host 0.0.0.0 --is-embedding\n",
    "\"\"\"\n",
    ")\n",
    "\n",
-    "wait_for_server(\"http://localhost:30010\")\n",
+    "wait_for_server(\"http://localhost:30010\")"
-    "\n",
-    "print(\"Embedding server is ready. Proceeding with the next steps.\")"
   ]
  },
  {
@@ -64,15 +117,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]\n"
+      "[2024-10-28 02:10:30 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:10:31] INFO:     127.0.0.1:48094 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Text embedding (first 10): [0.0083160400390625, 0.0006804466247558594, -0.00809478759765625, -0.0006995201110839844, 0.0143890380859375, -0.0090179443359375, 0.01238250732421875, 0.00209808349609375, 0.0062103271484375, -0.003047943115234375]</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -89,7 +161,7 @@
    "    \"embedding\"\n",
    "]\n",
    "\n",
-    "print(f\"Text embedding (first 10): {text_embedding[:10]}\")"
+    "print_highlight(f\"Text embedding (first 10): {text_embedding[:10]}\")"
   ]
  },
  {
@@ -101,15 +173,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n"
+      "[2024-10-28 02:10:31] INFO:     127.0.0.1:48110 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+      "[2024-10-28 02:10:31 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:10:31] INFO:     127.0.0.1:48114 - \"POST /encode HTTP/1.1\" 200 OK\n",
+      "[2024-10-28 02:10:31] The server is fired up and ready to roll!\n",
+      "[2024-10-28 02:10:31 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:10:31] INFO:     127.0.0.1:48118 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Text embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -124,7 +213,7 @@
    ")\n",
    "\n",
    "embedding = response.data[0].embedding[:10]\n",
-    "print(f\"Text embedding (first 10): {embedding}\")"
+    "print_highlight(f\"Text embedding (first 10): {embedding}\")"
   ]
  },
  {
@@ -138,15 +227,36 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n"
+     ]
+    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]\n"
+      "[2024-10-28 02:10:32 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:10:32] INFO:     127.0.0.1:48124 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Input IDs embedding (first 10): [0.00829315185546875, 0.0007004737854003906, -0.00809478759765625, -0.0006799697875976562, 0.01438140869140625, -0.00897979736328125, 0.0123748779296875, 0.0020923614501953125, 0.006195068359375, -0.0030498504638671875]</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -168,14 +278,26 @@
    "    0\n",
    "][\"embedding\"]\n",
    "\n",
-    "print(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
+    "print_highlight(f\"Input IDs embedding (first 10): {input_ids_embedding[:10]}\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:10:32] INFO:     Shutting down\n",
+      "[2024-10-28 02:10:32] INFO:     Waiting for application shutdown.\n",
+      "[2024-10-28 02:10:32] INFO:     Application shutdown complete.\n",
+      "[2024-10-28 02:10:32] INFO:     Finished server process [1188896]\n",
+      "W1028 02:10:32.490000 140389363193408 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
+     ]
+    }
+   ],
   "source": [
    "terminate_process(embedding_process)"
   ]

--- a/docs/openai_api.ipynb
+++ b/docs/openai_api.ipynb
@@ -6,7 +6,9 @@
   "source": [
    "# OpenAI Compatible API\n",
    "\n",
-    "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services.\n",
+    "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
+    "\n",
+    "This tutorial aims at these popular APIs:\n",
    "\n",
    "- `chat/completions`\n",
    "- `completions`\n",
@@ -27,42 +29,99 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Server is ready. Proceeding with the next steps.\n"
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "[2024-10-28 02:02:31] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=800169736, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "[2024-10-28 02:02:36 TP0] Init torch distributed begin.\n",
+      "[2024-10-28 02:02:37 TP0] Load weight begin. avail mem=47.27 GB\n",
+      "[2024-10-28 02:02:37 TP0] Ignore import error when loading sglang.srt.models.mllama. No module named 'transformers.models.mllama'\n",
+      "INFO 10-28 02:02:38 weight_utils.py:236] Using model weights format ['*.safetensors']\n",
+      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
+      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.57it/s]\n",
+      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.45it/s]\n",
+      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:00<00:00,  3.53it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.98it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.94it/s]\n",
+      "\n",
+      "[2024-10-28 02:02:40 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
+      "[2024-10-28 02:02:40 TP0] Memory pool end. avail mem=4.60 GB\n",
+      "[2024-10-28 02:02:40 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
+      "[2024-10-28 02:02:48 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
+      "[2024-10-28 02:02:48] INFO:     Started server process [1185529]\n",
+      "[2024-10-28 02:02:48] INFO:     Waiting for application startup.\n",
+      "[2024-10-28 02:02:48] INFO:     Application startup complete.\n",
+      "[2024-10-28 02:02:48] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
+      "[2024-10-28 02:02:48] INFO:     127.0.0.1:47904 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Server is ready. Proceeding with the next steps.</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
-    "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
+    "from sglang.utils import (\n",
+    "    execute_shell_command,\n",
+    "    wait_for_server,\n",
+    "    terminate_process,\n",
+    "    print_highlight,\n",
+    ")\n",
    "\n",
    "server_process = execute_shell_command(\n",
-    "    \"\"\"\n",
+    "    command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
-    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
-    "--port 30000 --host 0.0.0.0 --log-level warning\n",
-    "\"\"\"\n",
    ")\n",
    "\n",
-    "wait_for_server(\"http://localhost:30000\")\n",
+    "wait_for_server(\"http://localhost:30000\")"
-    "print(\"Server is ready. Proceeding with the next steps.\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "ChatCompletion(id='e854540ec7914b2d8c712f16fd9ed2ca', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730012326, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
+      "[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 49, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:02:49] INFO:     127.0.0.1:47912 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+      "[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 1, #queue-req: 0\n",
+      "[2024-10-28 02:02:49] INFO:     127.0.0.1:47926 - \"POST /generate HTTP/1.1\" 200 OK\n",
+      "[2024-10-28 02:02:49] The server is fired up and ready to roll!\n",
+      "[2024-10-28 02:02:50 TP0] Decode batch. #running-req: 1, #token: 89, token usage: 0.00, gen throughput (token/s): 24.12, #queue-req: 0\n",
+      "[2024-10-28 02:02:50] INFO:     127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Response: ChatCompletion(id='692899ebd3ea464dbb456008a7d60bf3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -84,7 +143,8 @@
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
-    "print(response)"
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
   ]
  },
  {
@@ -93,40 +153,35 @@
   "source": [
    "### Parameters\n",
    "\n",
-    "The chat completions API accepts the following parameters (refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details):\n",
+    "The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
-    "\n",
-    "- `messages`: List of messages in the conversation, each containing `role` and `content`\n",
-    "- `model`: The model identifier to use for completion\n",
-    "- `max_tokens`: Maximum number of tokens to generate in the response\n",
-    "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
-    "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
-    "- `n`: Number of chat completion choices to generate\n",
-    "- `stream`: If true, partial message deltas will be sent as they become available\n",
-    "- `stop`: Sequences where the API will stop generating further tokens\n",
-    "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
-    "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
-    "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
-    "- `logprobs`: Include log probabilities of tokens in the response\n",
-    "- `top_logprobs`: Number of most likely tokens to return probabilities for\n",
-    "- `seed`: Random seed for deterministic results\n",
-    "- `response_format`: Specify the format of the response (e.g., JSON)\n",
-    "- `stream_options`: Additional options for streaming responses\n",
-    "- `user`: A unique identifier representing your end-user\n",
    "\n",
    "Here is an example of a detailed chat completion request:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Ancient Rome's major achievements include:"
+      "[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:02:50] INFO:     127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Response: ChatCompletion(id='bffa083869484c78ab89d334514d5af3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, prompt_tokens_details=None))</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -152,11 +207,9 @@
    "    frequency_penalty=0.2,  # Mild penalty for more natural language\n",
    "    n=1,  # Single response is usually more stable\n",
    "    seed=42,  # Keep for reproducibility\n",
-    "    stream=True,  # Keep streaming for real-time output\n",
    ")\n",
    "\n",
-    "for chunk in response:\n",
+    "print_highlight(f\"Response: {response}\")"
-    "    print(chunk.choices[0].delta.content or \"\", end=\"\")"
   ]
  },
  {
@@ -167,20 +220,34 @@
    "\n",
    "### Usage\n",
    "\n",
-    "Completions API is similar to Chat Completions API, but without the `messages` parameter. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details."
+    "Completions API is similar to Chat Completions API, but without the `messages` parameter."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Completion(id='a6e07198f4b445baa0fb08a2178ceb59', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1.  United States - Washington D.C. 2.  Japan - Tokyo 3.  Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1.  China - Beijing 2.  Brazil - Bras', matched_stop=None)], created=1730012328, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))\n"
+      "[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:02:51 TP0] Decode batch. #running-req: 1, #token: 37, token usage: 0.00, gen throughput (token/s): 38.07, #queue-req: 0\n",
+      "[2024-10-28 02:02:52] INFO:     127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Response: Completion(id='eb486d0a32fd4384baba923f3bc17e8b', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1.  United States - Washington D.C. 2.  Japan - Tokyo 3.  Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1.  China - Beijing 2.  Brazil - Bras', matched_stop=None)], created=1730106172, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -192,7 +259,8 @@
    "    n=1,\n",
    "    stop=None,\n",
    ")\n",
-    "print(response)"
+    "\n",
+    "print_highlight(f\"Response: {response}\")"
   ]
  },
  {
@@ -201,44 +269,39 @@
   "source": [
    "### Parameters\n",
    "\n",
-    "The completions API accepts the following parameters:\n",
+    "The completions API accepts OpenAI Completions API's parameters.  Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
-    "\n",
-    "- `model`: The model identifier to use for completion\n",
-    "- `prompt`: Input text to generate completions for. Can be a string, array of strings, or token arrays\n",
-    "- `best_of`: Number of completions to generate server-side and return the best one\n",
-    "- `echo`: If true, the prompt will be included in the response\n",
-    "- `frequency_penalty`: Penalizes new tokens based on their frequency in the text so far (-2.0 to 2.0)\n",
-    "- `logit_bias`: Modify the likelihood of specified tokens appearing in the completion\n",
-    "- `logprobs`: Include log probabilities of tokens in the response\n",
-    "- `max_tokens`: Maximum number of tokens to generate in the response (default: 16)\n",
-    "- `n`: Number of completion choices to generate\n",
-    "- `presence_penalty`: Penalizes new tokens based on their presence in the text so far (-2.0 to 2.0)\n",
-    "- `seed`: Random seed for deterministic results\n",
-    "- `stop`: Sequences where the API will stop generating further tokens\n",
-    "- `stream`: If true, partial completion deltas will be sent as they become available\n",
-    "- `stream_options`: Additional options for streaming responses\n",
-    "- `suffix`: Text to append to the completion\n",
-    "- `temperature`: Controls randomness (0-2). Lower values make output more focused and deterministic\n",
-    "- `top_p`: Alternative to temperature. Controls diversity via nucleus sampling\n",
-    "- `user`: A unique identifier representing your end-user\n",
    "\n",
    "Here is an example of a detailed completions request:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      " Space explorer, Captain Orion Blackwood, had been traveling through the galaxy for 12 years, searching for a new home for humanity. His ship, the Aurora, had been his home for so long that he barely remembered what it was like to walk on solid ground.\n",
+      "[2024-10-28 02:02:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-      "As he navigated through the dense asteroid field, the ship's computer, S.A.R.A. (Self-Aware Reasoning Algorithm), alerted him to a strange reading on one of the asteroids. Captain Blackwood's curiosity was piqued, and he decided to investigate further.\n",
+      "[2024-10-28 02:02:52 TP0] Decode batch. #running-req: 1, #token: 15, token usage: 0.00, gen throughput (token/s): 40.91, #queue-req: 0\n",
-      "\"Captain, I'm detecting unusual energy signatures emanating from the asteroid,\" S.A.R.A. said. \"It's unlike anything I've seen before.\"\n",
+      "[2024-10-28 02:02:53 TP0] Decode batch. #running-req: 1, #token: 55, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n",
-      "Captain Blackwood's eyes narrowed as"
+      "[2024-10-28 02:02:54 TP0] Decode batch. #running-req: 1, #token: 95, token usage: 0.00, gen throughput (token/s): 42.10, #queue-req: 0\n",
+      "[2024-10-28 02:02:55 TP0] Decode batch. #running-req: 1, #token: 135, token usage: 0.00, gen throughput (token/s): 41.94, #queue-req: 0\n",
+      "[2024-10-28 02:02:55] INFO:     127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Response: Completion(id='fb23a12a15bc4137815b91d63b6fd976', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" Here is a short story about a space explorer named Astrid.\\nAstrid had always been fascinated by the stars. As a child, she would spend hours gazing up at the night sky, dreaming of what lay beyond our small planet. Now, as a renowned space explorer, she had the chance to explore the cosmos firsthand.\\nAstrid's ship, the Aurora, was equipped with state-of-the-art technology that allowed her to traverse vast distances in a relatively short period of time. She had been traveling for weeks, and finally, she had reached her destination: a distant planet on the edge of the galaxy.\\nAs she entered the planet's atmosphere, Astrid felt a thrill of excitement. She had never seen anything like this before.\", matched_stop=None)], created=1730106175, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -253,11 +316,9 @@
    "    frequency_penalty=0.3,  # Reduce repetitive phrases\n",
    "    n=1,  # Generate one completion\n",
    "    seed=123,  # For reproducible results\n",
-    "    stream=True,  # Stream the response\n",
    ")\n",
    "\n",
-    "for chunk in response:\n",
+    "print_highlight(f\"Response: {response}\")"
-    "    print(chunk.choices[0].text or \"\", end=\"\")"
   ]
  },
  {
@@ -279,15 +340,29 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Batch job created with ID: batch_03d7f74f-dffe-4c26-b5e7-bb9fb5cb89ff\n"
+      "[2024-10-28 02:02:55] INFO:     127.0.0.1:43330 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
+      "[2024-10-28 02:02:55] INFO:     127.0.0.1:43330 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
+      "[2024-10-28 02:02:55 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Batch job created with ID: batch_56fefd2e-0187-4c14-aa2d-110917723dde</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -337,29 +412,91 @@
    "    completion_window=\"24h\",\n",
    ")\n",
    "\n",
-    "print(f\"Batch job created with ID: {batch_response.id}\")"
+    "print_highlight(f\"Batch job created with ID: {batch_response.id}\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
+      "[2024-10-28 02:02:56 TP0] Decode batch. #running-req: 2, #token: 82, token usage: 0.00, gen throughput (token/s): 55.10, #queue-req: 0\n",
      "Batch job status: validating...trying again in 3 seconds...\n",
+      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43330 - \"GET /v1/batches/batch_56fefd2e-0187-4c14-aa2d-110917723dde HTTP/1.1\" 200 OK\n",
      "Batch job completed successfully!\n",
      "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
-      "\n",
+      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43330 - \"GET /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4/content HTTP/1.1\" 200 OK\n"
-      "Request request-1:\n",
+     ]
-      "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}\n",
+    },
-      "\n",
+    {
-      "Request request-2:\n",
+     "data": {
-      "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730012333, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n*   **Web Development**: Building web applications, web services, and web scraping.\\n*   **Data Science**: Data analysis'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}\n",
+      "text/html": [
-      "\n",
+       "<strong style='color: #00008B;'>Request request-1:</strong>"
-      "Cleaning up files...\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'A programmer walks into a library and asks the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replies, \"It rings a bell, but I\\'m not sure if it\\'s here'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 50, 'total_tokens': 91}, 'system_fingerprint': None}}</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Request request-2:</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n1. **Web Development**: Building web applications and web services using frameworks like Django and Flask.\\n2. **Data Analysis and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Cleaning up files...</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43330 - \"DELETE /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4 HTTP/1.1\" 200 OK\n"
     ]
    }
   ],
@@ -382,16 +519,16 @@
    "    ]\n",
    "\n",
    "    for result in results:\n",
-    "        print(f\"\\nRequest {result['custom_id']}:\")\n",
+    "        print_highlight(f\"Request {result['custom_id']}:\")\n",
-    "        print(f\"Response: {result['response']}\")\n",
+    "        print_highlight(f\"Response: {result['response']}\")\n",
    "\n",
-    "    print(\"\\nCleaning up files...\")\n",
+    "    print_highlight(\"Cleaning up files...\")\n",
    "    # Only delete the result file ID since file_response is just content\n",
    "    client.files.delete(result_file_id)\n",
    "else:\n",
-    "    print(f\"Batch job failed with status: {batch_response.status}\")\n",
+    "    print_highlight(f\"Batch job failed with status: {batch_response.status}\")\n",
    "    if hasattr(batch_response, \"errors\"):\n",
-    "        print(f\"Errors: {batch_response.errors}\")"
+    "        print_highlight(f\"Errors: {batch_response.errors}\")"
   ]
  },
  {
@@ -408,66 +545,210 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Created batch job with ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
+      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43336 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
-      "Initial status: validating\n",
+      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43336 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
-      "Batch job details (check 1/5):\n",
-      "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
-      "Status: in_progress\n",
-      "Created at: 1730012334\n",
-      "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
-      "Output file ID: None\n",
-      "Request counts:\n",
-      "Total: 0\n",
-      "Completed: 0\n",
-      "Failed: 0\n",
-      "Batch job details (check 2/5):\n",
-      "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
-      "Status: in_progress\n",
-      "Created at: 1730012334\n",
-      "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
-      "Output file ID: None\n",
-      "Request counts:\n",
-      "Total: 0\n",
-      "Completed: 0\n",
-      "Failed: 0\n",
-      "Batch job details (check 3/5):\n",
-      "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
-      "Status: in_progress\n",
-      "Created at: 1730012334\n",
-      "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
-      "Output file ID: None\n",
-      "Request counts:\n",
-      "Total: 0\n",
-      "Completed: 0\n",
-      "Failed: 0\n",
-      "Batch job details (check 4/5):\n",
-      "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
-      "Status: completed\n",
-      "Created at: 1730012334\n",
-      "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
-      "Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
-      "Request counts:\n",
-      "Total: 100\n",
-      "Completed: 100\n",
-      "Failed: 0\n",
-      "Batch job details (check 5/5):\n",
-      "ID: batch_6b9625ac-9ebc-4c4f-bfd5-f84f88b0100d\n",
-      "Status: completed\n",
-      "Created at: 1730012334\n",
-      "Input file ID: backend_input_file-8203d42a-109c-4573-9663-13b5d9cb6a2b\n",
-      "Output file ID: backend_result_file-d32f441d-e737-4da3-b07a-c39349425b3a\n",
-      "Request counts:\n",
-      "Total: 100\n",
-      "Completed: 100\n",
-      "Failed: 0\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Created batch job with ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Initial status: validating</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 17, #new-token: 510, #cached-token: 425, cache hit rate: 43.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 83, #new-token: 2490, #cached-token: 2075, cache hit rate: 45.04%, token usage: 0.00, #running-req: 17, #queue-req: 0\n",
+      "[2024-10-28 02:02:59 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 234.43, #queue-req: 0\n",
+      "[2024-10-28 02:03:00 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 3545.41, #queue-req: 0\n",
+      "[2024-10-28 02:03:01 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.05, gen throughput (token/s): 3448.10, #queue-req: 0\n",
+      "[2024-10-28 02:03:02 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 3362.62, #queue-req: 0\n",
+      "[2024-10-28 02:03:04 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 3279.58, #queue-req: 0\n",
+      "[2024-10-28 02:03:05 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 3200.86, #queue-req: 0\n",
+      "[2024-10-28 02:03:06 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 3126.52, #queue-req: 0\n",
+      "[2024-10-28 02:03:07 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 3053.16, #queue-req: 0\n",
+      "[2024-10-28 02:03:08] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Batch job details (check 1 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:09 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.16, gen throughput (token/s): 2980.26, #queue-req: 0\n",
+      "[2024-10-28 02:03:10 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.18, gen throughput (token/s): 2919.09, #queue-req: 0\n",
+      "[2024-10-28 02:03:11] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Batch job details (check 2 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:11 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.20, gen throughput (token/s): 2854.92, #queue-req: 0\n",
+      "[2024-10-28 02:03:13 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.22, gen throughput (token/s): 2794.62, #queue-req: 0\n",
+      "[2024-10-28 02:03:14] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Batch job details (check 3 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:14 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.24, gen throughput (token/s): 2737.84, #queue-req: 0\n",
+      "[2024-10-28 02:03:17] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Batch job details (check 4 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:20] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Batch job details (check 5 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -515,25 +796,21 @@
    "    completion_window=\"24h\",\n",
    ")\n",
    "\n",
-    "print(f\"Created batch job with ID: {batch_job.id}\")\n",
+    "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
-    "print(f\"Initial status: {batch_job.status}\")\n",
+    "print_highlight(f\"Initial status: {batch_job.status}\")\n",
    "\n",
    "time.sleep(10)\n",
    "\n",
    "max_checks = 5\n",
    "for i in range(max_checks):\n",
    "    batch_details = client.batches.retrieve(batch_id=batch_job.id)\n",
-    "    print(f\"Batch job details (check {i+1}/{max_checks}):\")\n",
+    "\n",
-    "    print(f\"ID: {batch_details.id}\")\n",
+    "    print_highlight(\n",
-    "    print(f\"Status: {batch_details.status}\")\n",
+    "        f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n",
-    "    print(f\"Created at: {batch_details.created_at}\")\n",
+    "    )\n",
-    "    print(f\"Input file ID: {batch_details.input_file_id}\")\n",
+    "    print_highlight(\n",
-    "    print(f\"Output file ID: {batch_details.output_file_id}\")\n",
+    "        f\"<strong>Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}</strong>\"\n",
-    "\n",
+    "    )\n",
-    "    print(\"Request counts:\")\n",
-    "    print(f\"Total: {batch_details.request_counts.total}\")\n",
-    "    print(f\"Completed: {batch_details.request_counts.completed}\")\n",
-    "    print(f\"Failed: {batch_details.request_counts.failed}\")\n",
    "\n",
    "    time.sleep(3)"
   ]
@@ -547,20 +824,114 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Created batch job with ID: batch_3d2dd881-ad84-465a-85ee-6d5991794e5e\n",
+      "[2024-10-28 02:03:23] INFO:     127.0.0.1:47360 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
-      "Initial status: validating\n",
+      "[2024-10-28 02:03:23] INFO:     127.0.0.1:47360 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
-      "Cancellation initiated. Status: cancelling\n",
+     ]
-      "Current status: cancelled\n",
+    },
-      "Batch job successfully cancelled\n",
+    {
-      "Successfully cleaned up input file\n"
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Created batch job with ID: batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Initial status: validating</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 44, #new-token: 44, #cached-token: 2376, cache hit rate: 60.81%, token usage: 0.01, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 328, #new-token: 8192, #cached-token: 9824, cache hit rate: 56.49%, token usage: 0.01, #running-req: 44, #queue-req: 128\n",
+      "[2024-10-28 02:03:24 TP0] Prefill batch. #new-seq: 129, #new-token: 3864, #cached-token: 3231, cache hit rate: 54.15%, token usage: 0.05, #running-req: 371, #queue-req: 1\n",
+      "[2024-10-28 02:03:27 TP0] Decode batch. #running-req: 500, #token: 29025, token usage: 0.13, gen throughput (token/s): 1162.55, #queue-req: 0\n",
+      "[2024-10-28 02:03:31 TP0] Decode batch. #running-req: 500, #token: 49025, token usage: 0.23, gen throughput (token/s): 5606.35, #queue-req: 0\n",
+      "[2024-10-28 02:03:33] INFO:     127.0.0.1:40110 - \"POST /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62/cancel HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Cancellation initiated. Status: cancelling</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:36] INFO:     127.0.0.1:40110 - \"GET /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62 HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Current status: cancelled</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Batch job successfully cancelled</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:36] INFO:     127.0.0.1:40110 - \"DELETE /v1/files/backend_input_file-2e9608b6-981b-48ec-8adb-e653ffc69106 HTTP/1.1\" 200 OK\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>Successfully cleaned up input file</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -608,45 +979,57 @@
    "    completion_window=\"24h\",\n",
    ")\n",
    "\n",
-    "print(f\"Created batch job with ID: {batch_job.id}\")\n",
+    "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
-    "print(f\"Initial status: {batch_job.status}\")\n",
+    "print_highlight(f\"Initial status: {batch_job.status}\")\n",
    "\n",
    "time.sleep(10)\n",
    "\n",
    "try:\n",
    "    cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n",
-    "    print(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
+    "    print_highlight(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
    "    assert cancelled_job.status == \"cancelling\"\n",
    "\n",
    "    # Monitor the cancellation process\n",
    "    while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n",
    "        time.sleep(3)\n",
    "        cancelled_job = client.batches.retrieve(batch_job.id)\n",
-    "        print(f\"Current status: {cancelled_job.status}\")\n",
+    "        print_highlight(f\"Current status: {cancelled_job.status}\")\n",
    "\n",
    "    # Verify final status\n",
    "    assert cancelled_job.status == \"cancelled\"\n",
-    "    print(\"Batch job successfully cancelled\")\n",
+    "    print_highlight(\"Batch job successfully cancelled\")\n",
    "\n",
    "except Exception as e:\n",
-    "    print(f\"Error during cancellation: {e}\")\n",
+    "    print_highlight(f\"Error during cancellation: {e}\")\n",
    "    raise e\n",
    "\n",
    "finally:\n",
    "    try:\n",
    "        del_response = client.files.delete(uploaded_file.id)\n",
    "        if del_response.deleted:\n",
-    "            print(\"Successfully cleaned up input file\")\n",
+    "            print_highlight(\"Successfully cleaned up input file\")\n",
    "    except Exception as e:\n",
-    "        print(f\"Error cleaning up: {e}\")\n",
+    "        print_highlight(f\"Error cleaning up: {e}\")\n",
    "        raise e"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 10,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-28 02:03:36] INFO:     Shutting down\n",
+      "[2024-10-28 02:03:36] INFO:     Waiting for application shutdown.\n",
+      "[2024-10-28 02:03:36] INFO:     Application shutdown complete.\n",
+      "[2024-10-28 02:03:36] INFO:     Finished server process [1185529]\n",
+      "W1028 02:03:37.084000 140231994889792 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
+     ]
+    }
+   ],
   "source": [
    "terminate_process(server_process)"
   ]

--- a/docs/send_request.ipynb
+++ b/docs/send_request.ipynb
@@ -19,7 +19,7 @@
    "\n",
    "```bash\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
-    "--port 30000 --host 0.0.0.0 --log-level warning\n",
+    "--port 30000 --host 0.0.0.0\n",
    "```\n",
    "\n",
    "in your command line and wait for the server to be ready."
@@ -34,23 +34,65 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Server is ready. Proceeding with the next steps.\n"
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "[2024-10-29 21:14:13] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=518055348, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+      "  warnings.warn(\n",
+      "[2024-10-29 21:14:19 TP0] Init torch distributed begin.\n",
+      "[2024-10-29 21:14:20 TP0] Load weight begin. avail mem=47.27 GB\n",
+      "[2024-10-29 21:14:21 TP0] lm_eval is not installed, GPTQ may not be usable\n",
+      "INFO 10-29 21:14:21 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
+      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
+      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.32it/s]\n",
+      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.28it/s]\n",
+      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  3.27it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.87it/s]\n",
+      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.78it/s]\n",
+      "\n",
+      "[2024-10-29 21:14:24 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
+      "[2024-10-29 21:14:24 TP0] Memory pool end. avail mem=4.60 GB\n",
+      "[2024-10-29 21:14:24 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
+      "[2024-10-29 21:14:32 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
+      "[2024-10-29 21:14:32] INFO:     Started server process [2661188]\n",
+      "[2024-10-29 21:14:32] INFO:     Waiting for application startup.\n",
+      "[2024-10-29 21:14:32] INFO:     Application startup complete.\n",
+      "[2024-10-29 21:14:32] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
+      "[2024-10-29 21:14:32] INFO:     127.0.0.1:49888 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'><br>                            Server and notebook outputs are combined for clarity.<br>                            <br>                            Typically, the server runs in a separate terminal.<br>                            <br>                            Server output is gray; notebook output is highlighted.<br>                            </strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
-    "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
+    "from sglang.utils import (\n",
+    "    execute_shell_command,\n",
+    "    wait_for_server,\n",
+    "    terminate_process,\n",
+    "    print_highlight,\n",
+    ")\n",
    "\n",
    "\n",
    "server_process = execute_shell_command(\n",
    "    \"\"\"\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
-    "--port 30000 --host 0.0.0.0 --log-level warning\n",
+    "--port 30000 --host 0.0.0.0\n",
    "\"\"\"\n",
    ")\n",
    "\n",
-    "wait_for_server(\"http://localhost:30000\")\n",
+    "wait_for_server(\"http://localhost:30000\")"
-    "print(\"Server is ready. Proceeding with the next steps.\")"
   ]
  },
  {
@@ -71,7 +113,30 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "{\"id\":\"449710eb827c49c99b82ce187e912c2a\",\"object\":\"chat.completion\",\"created\":1729962606,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. These models are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models use various techniques, such as deep learning and natural language processing, to analyze and understand the input text. They can then use this understanding to generate coherent and context-specific text, such as:\\n\\n1. Responses to questions or prompts\\n2. Summaries of long pieces of text\\n3. Creative writing, like stories or poetry\\n4. Translation of text from one language to another\\n\\nSome popular examples of LLMs include:\\n\\n1. Chatbots: Virtual assistants that can understand and respond to user input\\n2. Virtual assistants: Like Siri, Alexa, or Google Assistant\\n3. Language translation tools: Such as Google Translate\\n4. Writing assistants: Like Grammarly or Language Tool\\n\\nThe key characteristics of LLMs include:\\n\\n1. **Scalability**: They can process large amounts of text data\\n2. **Flexibility**: They can be fine-tuned for specific tasks or domains\\n3. **Contextual understanding**: They can recognize context and nuances in language\\n4. **Creativity**: They can generate original text or responses\\n\\nHowever, LLMs also have limitations and potential drawbacks:\\n\\n1. **Bias**: They can perpetuate existing biases in the training data\\n2. **Misinformation**: They can spread misinformation or false information\\n3. **Dependence on data quality**: The quality of the training data directly affects the model's performance\\n\\nOverall, LLMs are powerful tools that can be used in various applications, from language translation and writing assistance to chatbots and virtual assistants.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":408,\"completion_tokens\":361,\"prompt_tokens_details\":null}}"
+      "[2024-10-29 21:14:32 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-29 21:14:33] INFO:     127.0.0.1:49914 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+      "[2024-10-29 21:14:33 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n",
+      "[2024-10-29 21:14:33] INFO:     127.0.0.1:49916 - \"POST /generate HTTP/1.1\" 200 OK\n",
+      "[2024-10-29 21:14:33] The server is fired up and ready to roll!\n",
+      "[2024-10-29 21:14:33 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 27.00, #queue-req: 0\n",
+      "[2024-10-29 21:14:34 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 42.50, #queue-req: 0\n",
+      "[2024-10-29 21:14:35 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 42.31, #queue-req: 0\n",
+      "[2024-10-29 21:14:36 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 42.29, #queue-req: 0\n",
+      "[2024-10-29 21:14:37 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
+      "[2024-10-29 21:14:38 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
+      "[2024-10-29 21:14:39 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 42.30, #queue-req: 0\n",
+      "[2024-10-29 21:14:40 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 42.32, #queue-req: 0\n",
+      "[2024-10-29 21:14:41 TP0] Decode batch. #running-req: 1, #token: 407, token usage: 0.00, gen throughput (token/s): 42.23, #queue-req: 0\n",
+      "[2024-10-29 21:14:42 TP0] Decode batch. #running-req: 1, #token: 447, token usage: 0.00, gen throughput (token/s): 42.25, #queue-req: 0\n",
+      "[2024-10-29 21:14:43 TP0] Decode batch. #running-req: 1, #token: 487, token usage: 0.00, gen throughput (token/s): 42.22, #queue-req: 0\n",
+      "[2024-10-29 21:14:43] INFO:     127.0.0.1:49902 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
+      "{\"id\":\"0635a1c4d1d940f597b11482bed9595f\",\"object\":\"chat.completion\",\"created\":1730261683,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and understand human language. LLMs are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models like myself use natural language processing (NLP) and machine learning algorithms to analyze and generate human-like text. This enables us to:\\n\\n1. **Answer questions**: Provide information on a wide range of topics, from general knowledge to specialized domains.\\n2. **Generate text**: Create coherent and contextually relevant text, such as articles, essays, or even entire stories.\\n3. **Translate languages**: Translate text from one language to another, helping to break language barriers.\\n4. **Summarize content**: Condense long pieces of text into shorter, more digestible summaries.\\n5. **Chat and converse**: Engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLarge language models are typically trained on massive datasets, often consisting of billions of parameters and petabytes of text data. This training enables us to learn complex language patterns, nuances, and context, allowing us to provide helpful and informative responses.\\n\\nSome popular examples of large language models include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a foundational model for many language understanding tasks.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: A variant of BERT, developed by Facebook AI, which improved upon the original model's performance.\\n3. **Transformers**: A family of models developed by Google, which includes BERT and other related architectures.\\n\\nThese models have revolutionized the field of natural language processing and have many exciting applications in areas like:\\n\\n1. **Virtual assistants**: Like Siri, Alexa, or myself, which can understand and respond to voice commands.\\n2. **Language translation**: Enabling real-time translation of languages.\\n3. **Content generation**: Creating original text, such as articles, stories, or even entire books.\\n4. **Customer service**: Providing 24/7 support and answering common customer queries.\\n\\nI hope this helps you understand what a Large Language Model is and its capabilities!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":504,\"completion_tokens\":457,\"prompt_tokens_details\":null}}"
     ]
    }
   ],
@@ -100,8 +165,22 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "ChatCompletion(id='6bbf20fed17940739eb5cd5d685fa29a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1729962608, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
+      "[2024-10-29 21:14:44 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+      "[2024-10-29 21:14:44 TP0] Decode batch. #running-req: 1, #token: 73, token usage: 0.00, gen throughput (token/s): 26.00, #queue-req: 0\n",
+      "[2024-10-29 21:14:45] INFO:     127.0.0.1:52764 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<strong style='color: #00008B;'>ChatCompletion(id='994dd35133d34f57951a102c7470464f', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730261685, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
    }
   ],
   "source": [
@@ -123,14 +202,26 @@
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
-    "print(response)"
+    "print_highlight(response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[2024-10-29 21:14:45] INFO:     Shutting down\n",
+      "[2024-10-29 21:14:45] INFO:     Waiting for application shutdown.\n",
+      "[2024-10-29 21:14:45] INFO:     Application shutdown complete.\n",
+      "[2024-10-29 21:14:45] INFO:     Finished server process [2661188]\n",
+      "W1029 21:14:45.740000 139643311699520 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
+     ]
+    }
+   ],
   "source": [
    "terminate_process(server_process)"
   ]

--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -13,7 +13,7 @@ classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: Apache Software License",
 ]
-dependencies = ["requests", "tqdm", "numpy"]
+dependencies = ["requests", "tqdm", "numpy", "IPython"]
 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",

--- a/python/sglang/utils.py
+++ b/python/sglang/utils.py
@@ -20,6 +20,7 @@ from typing import Optional, Union
 import numpy as np
 import requests
 import torch
+from IPython.display import HTML, display
 from tqdm import tqdm
 logger = logging.getLogger(__name__)
@@ -313,12 +314,7 @@ def execute_shell_command(command: str) -> subprocess.Popen:
    command = command.replace("\\\n", " ").replace("\\", " ")
    parts = command.split()
-    return subprocess.Popen(
+    return subprocess.Popen(parts, text=True, stderr=subprocess.STDOUT)
-        parts,
-        text=True,
-        stdout=subprocess.DEVNULL,
-        stderr=subprocess.DEVNULL,
-    )
 def wait_for_server(base_url: str, timeout: int = None) -> None:
@@ -336,6 +332,15 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
                headers={"Authorization": "Bearer None"},
            )
            if response.status_code == 200:
+                print_highlight(
+                    """
+                            Server and notebook outputs are combined for clarity.
+                            Typically, the server runs in a separate terminal.
+                            Server output is gray; notebook output is highlighted.
+                            """
+                )
                break
            if timeout and time.time() - start_time > timeout:
@@ -375,3 +380,8 @@ def terminate_process(process):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        time.sleep(2)
+def print_highlight(html_content: str):
+    html_content = str(html_content).replace("\n", "<br>")
+    display(HTML(f"<strong style='color: #00008B;'>{html_content}</strong>"))
\ No newline at end of file