"Remember to add `--is-embedding` to the command."
...
...
@@ -29,30 +29,83 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embedding server is ready. Proceeding with the next steps.\n"
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
"<strong style='color: #00008B;'><br> This cell combines server and notebook output. <br> <br> Typically, the server runs in a separate terminal, <br> but we combine the output of server and notebook to demonstrate the usage better.<br> <br> In our documentation, server output is in gray, notebook output is highlighted.<br> </strong>"
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
"in your command line and wait for the server to be ready."
...
...
@@ -34,23 +34,65 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Server is ready. Proceeding with the next steps.\n"
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
"<strong style='color: #00008B;'><br> Server and notebook outputs are combined for clarity.<br> <br> Typically, the server runs in a separate terminal.<br> <br> Server output is gray; notebook output is highlighted.<br> </strong>"
"print(\"Server is ready. Proceeding with the next steps.\")"
"wait_for_server(\"http://localhost:30000\")"
]
},
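The `wait_for_server` helper called above is not shown in this hunk. A plausible sketch, polling an OpenAI-compatible route until it responds or a timeout elapses, is given below; the exact endpoint, timeout, and backoff used by the real helper are assumptions.

```python
import time
import requests

def wait_for_server(base_url: str, timeout: int = 300) -> None:
    """Block until the server at base_url answers, or raise after `timeout` seconds.

    Hedged sketch: the actual helper may probe a different endpoint.
    """
    start = time.perf_counter()
    while True:
        try:
            # /v1/models is a cheap OpenAI-compatible route to probe readiness.
            if requests.get(f"{base_url}/v1/models", timeout=5).status_code == 200:
                return
        except requests.exceptions.RequestException:
            pass
        if time.perf_counter() - start > timeout:
            raise TimeoutError(f"Server at {base_url} did not become ready")
        time.sleep(1)
```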
{
...
...
@@ -71,7 +113,30 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{\"id\":\"449710eb827c49c99b82ce187e912c2a\",\"object\":\"chat.completion\",\"created\":1729962606,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. These models are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models use various techniques, such as deep learning and natural language processing, to analyze and understand the input text. They can then use this understanding to generate coherent and context-specific text, such as:\\n\\n1. Responses to questions or prompts\\n2. Summaries of long pieces of text\\n3. Creative writing, like stories or poetry\\n4. Translation of text from one language to another\\n\\nSome popular examples of LLMs include:\\n\\n1. Chatbots: Virtual assistants that can understand and respond to user input\\n2. Virtual assistants: Like Siri, Alexa, or Google Assistant\\n3. Language translation tools: Such as Google Translate\\n4. Writing assistants: Like Grammarly or Language Tool\\n\\nThe key characteristics of LLMs include:\\n\\n1. **Scalability**: They can process large amounts of text data\\n2. **Flexibility**: They can be fine-tuned for specific tasks or domains\\n3. **Contextual understanding**: They can recognize context and nuances in language\\n4. **Creativity**: They can generate original text or responses\\n\\nHowever, LLMs also have limitations and potential drawbacks:\\n\\n1. **Bias**: They can perpetuate existing biases in the training data\\n2. **Misinformation**: They can spread misinformation or false information\\n3. **Dependence on data quality**: The quality of the training data directly affects the model's performance\\n\\nOverall, LLMs are powerful tools that can be used in various applications, from language translation and writing assistance to chatbots and virtual assistants.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":408,\"completion_tokens\":361,\"prompt_tokens_details\":null}}"
"{\"id\":\"0635a1c4d1d940f597b11482bed9595f\",\"object\":\"chat.completion\",\"created\":1730261683,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and understand human language. LLMs are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models like myself use natural language processing (NLP) and machine learning algorithms to analyze and generate human-like text. This enables us to:\\n\\n1. **Answer questions**: Provide information on a wide range of topics, from general knowledge to specialized domains.\\n2. **Generate text**: Create coherent and contextually relevant text, such as articles, essays, or even entire stories.\\n3. **Translate languages**: Translate text from one language to another, helping to break language barriers.\\n4. **Summarize content**: Condense long pieces of text into shorter, more digestible summaries.\\n5. **Chat and converse**: Engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLarge language models are typically trained on massive datasets, often consisting of billions of parameters and petabytes of text data. This training enables us to learn complex language patterns, nuances, and context, allowing us to provide helpful and informative responses.\\n\\nSome popular examples of large language models include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a foundational model for many language understanding tasks.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: A variant of BERT, developed by Facebook AI, which improved upon the original model's performance.\\n3. **Transformers**: A family of models developed by Google, which includes BERT and other related architectures.\\n\\nThese models have revolutionized the field of natural language processing and have many exciting applications in areas like:\\n\\n1. **Virtual assistants**: Like Siri, Alexa, or myself, which can understand and respond to voice commands.\\n2. **Language translation**: Enabling real-time translation of languages.\\n3. **Content generation**: Creating original text, such as articles, stories, or even entire books.\\n4. **Customer service**: Providing 24/7 support and answering common customer queries.\\n\\nI hope this helps you understand what a Large Language Model is and its capabilities!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":504,\"completion_tokens\":457,\"prompt_tokens_details\":null}}"
]
}
],
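The JSON body printed in this output is the kind of response an OpenAI-compatible `/v1/chat/completions` request returns. A minimal sketch of such a request follows; the URL, prompt, and sampling parameters are illustrative, not taken from the notebook source.

```python
import requests

# Hedged sketch: send a chat request to the local server and print the raw
# JSON body, which is what the notebook output above shows.
response = requests.post(
    "http://localhost:30000/v1/chat/completions",
    json={
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "messages": [{"role": "user", "content": "What is an LLM?"}],
        "temperature": 0,
        "max_tokens": 512,
    },
)
print(response.text)
```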
...
...
@@ -100,8 +165,22 @@
"name": "stdout",
"output_type": "stream",
"text": [
"ChatCompletion(id='6bbf20fed17940739eb5cd5d685fa29a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1729962608, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"