Unverified Commit b548801d authored by Lianmin Zheng, committed by GitHub

Update docs (#1839)

parent 539df95d
#!/usr/bin/python3 # Deploy the documents
import os
from datetime import datetime
......
...@@ -8,7 +8,7 @@
"\n",
"SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
"\n",
"This tutorial aims at these popular APIs:\n", "This tutorial covers these popular APIs:\n",
"\n",
"- `chat/completions`\n",
"- `completions`\n",
...@@ -36,39 +36,41 @@
"name": "stdout",
"output_type": "stream",
"text": [
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", "2024-10-30 09:44:20.477109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
" warnings.warn(\n", "2024-10-30 09:44:20.489679: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"[2024-10-28 02:02:31] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=800169736, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", "2024-10-30 09:44:20.489712: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", "2024-10-30 09:44:21.010067: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
" warnings.warn(\n", "[2024-10-30 09:44:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=134920821, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", "[2024-10-30 09:44:39 TP0] Init torch distributed begin.\n",
" warnings.warn(\n", "[2024-10-30 09:44:41 TP0] Load weight begin. avail mem=76.83 GB\n",
"[2024-10-28 02:02:36 TP0] Init torch distributed begin.\n", "[2024-10-30 09:44:42 TP0] lm_eval is not installed, GPTQ may not be usable\n",
"[2024-10-28 02:02:37 TP0] Load weight begin. avail mem=47.27 GB\n", "INFO 10-30 09:44:42 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"[2024-10-28 02:02:37 TP0] Ignore import error when loading sglang.srt.models.mllama. No module named 'transformers.models.mllama'\n",
"INFO 10-28 02:02:38 weight_utils.py:236] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n", "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.57it/s]\n", "Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:05, 1.77s/it]\n",
"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.45it/s]\n", "Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.77s/it]\n",
"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:00<00:00, 3.53it/s]\n", "Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:05<00:01, 1.77s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.98it/s]\n", "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.27s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.94it/s]\n", "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.45s/it]\n",
"\n", "\n",
"[2024-10-28 02:02:40 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n", "[2024-10-30 09:44:48 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
"[2024-10-28 02:02:40 TP0] Memory pool end. avail mem=4.60 GB\n", "[2024-10-30 09:44:48 TP0] Memory pool end. avail mem=8.19 GB\n",
"[2024-10-28 02:02:40 TP0] Capture cuda graph begin. This can take up to several minutes.\n", "[2024-10-30 09:44:49 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
"[2024-10-28 02:02:48 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n", "[2024-10-30 09:44:58 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-10-28 02:02:48] INFO: Started server process [1185529]\n", "[2024-10-30 09:44:58] INFO: Started server process [231459]\n",
"[2024-10-28 02:02:48] INFO: Waiting for application startup.\n", "[2024-10-30 09:44:58] INFO: Waiting for application startup.\n",
"[2024-10-28 02:02:48] INFO: Application startup complete.\n", "[2024-10-30 09:44:58] INFO: Application startup complete.\n",
"[2024-10-28 02:02:48] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n", "[2024-10-30 09:44:58] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
"[2024-10-28 02:02:48] INFO: 127.0.0.1:47904 - \"GET /v1/models HTTP/1.1\" 200 OK\n" "[2024-10-30 09:44:59] INFO: 127.0.0.1:54650 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
"[2024-10-30 09:44:59] INFO: 127.0.0.1:54666 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-10-30 09:44:59 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-30 09:44:59] INFO: 127.0.0.1:54672 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-10-30 09:44:59] The server is fired up and ready to roll!\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Server is ready. Proceeding with the next steps.</strong>" "<strong style='color: #00008B;'><br><br> NOTE: Typically, the server runs in a separate terminal.<br> In this notebook, we run the server and notebook code together, so their outputs are combined.<br> To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br> </strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
...@@ -102,19 +104,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 49, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-30 09:45:52 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:49] INFO: 127.0.0.1:47912 - \"GET /get_model_info HTTP/1.1\" 200 OK\n", "[2024-10-30 09:45:53 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 0.73, #queue-req: 0\n",
"[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 1, #queue-req: 0\n", "[2024-10-30 09:45:53] INFO: 127.0.0.1:55594 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
"[2024-10-28 02:02:49] INFO: 127.0.0.1:47926 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:49] The server is fired up and ready to roll!\n",
"[2024-10-28 02:02:50 TP0] Decode batch. #running-req: 1, #token: 89, token usage: 0.00, gen throughput (token/s): 24.12, #queue-req: 0\n",
"[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: ChatCompletion(id='692899ebd3ea464dbb456008a7d60bf3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>" "<strong style='color: #00008B;'>Response: ChatCompletion(id='876500c402ae452ea17e4dde415c108a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730281553, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
...@@ -127,13 +125,8 @@
"source": [
"import openai\n",
"\n",
"# Always assign an api_key, even if not specified during server initialization.\n",
"# Setting an API key during server initialization is strongly recommended.\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n",
"# Chat completion example\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[\n",
...@@ -167,14 +160,17 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-30 09:45:57 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" "[2024-10-30 09:45:57 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 8.70, #queue-req: 0\n",
"[2024-10-30 09:45:58 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 132.75, #queue-req: 0\n",
"[2024-10-30 09:45:58 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 132.30, #queue-req: 0\n",
"[2024-10-30 09:45:58] INFO: 127.0.0.1:55594 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: ChatCompletion(id='bffa083869484c78ab89d334514d5af3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, prompt_tokens_details=None))</strong>" "<strong style='color: #00008B;'>Ancient Rome's major achievements include:<br><br>1. **Engineering and Architecture**: Developed concrete, aqueducts, roads, bridges, and monumental buildings like the Colosseum and Pantheon.<br>2. **Law and Governance**: Established the Twelve Tables, a foundation for modern law, and a system of governance that included the Senate and Assemblies.<br>3. **Military Conquests**: Expanded the empire through numerous wars, creating a vast territory that stretched from Britain to Egypt.<br>4. **Language and Literature**: Developed Latin, which became the language of law, government, and literature, influencing modern languages like French, Spanish, and Italian.<br></strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
...@@ -200,16 +196,49 @@
" {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
" ],\n",
" temperature=0.3, # Lower temperature for more focused responses\n",
" max_tokens=100, # Reasonable length for a concise response\n", " max_tokens=128, # Reasonable length for a concise response\n",
" top_p=0.95, # Slightly higher for better fluency\n",
" stop=[\"\\n\\n\"], # Simple stop sequence\n",
" presence_penalty=0.2, # Mild penalty to avoid repetition\n",
" frequency_penalty=0.2, # Mild penalty for more natural language\n",
" n=1, # Single response is usually more stable\n",
" seed=42, # Keep for reproducibility\n",
")\n",
"\n",
"print_highlight(f\"Response: {response}\")" "print_highlight(response.choices[0].message.content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Streaming mode is also supported"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-30 09:46:06] INFO: 127.0.0.1:45834 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
"[2024-10-30 09:46:06 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"It looks like you're getting started with our conversation. I'm happy to chat with you and see how[2024-10-30 09:46:06 TP0] Decode batch. #running-req: 1, #token: 61, token usage: 0.00, gen throughput (token/s): 4.78, #queue-req: 0\n",
" things go. What would you like to talk about?"
]
}
],
"source": [
"stream = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[{\"role\": \"user\", \"content\": \"Say this is a test\"}],\n",
" stream=True,\n",
")\n",
"for chunk in stream:\n",
" if chunk.choices[0].delta.content is not None:\n",
" print(chunk.choices[0].delta.content, end=\"\")"
] ]
},
{
...@@ -225,22 +254,22 @@
},
{
"cell_type": "code",
"execution_count": 4, "execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-30 09:46:11 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:51 TP0] Decode batch. #running-req: 1, #token: 37, token usage: 0.00, gen throughput (token/s): 38.07, #queue-req: 0\n", "[2024-10-30 09:46:12 TP0] Decode batch. #running-req: 1, #token: 38, token usage: 0.00, gen throughput (token/s): 7.66, #queue-req: 0\n",
"[2024-10-28 02:02:52] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" "[2024-10-30 09:46:12] INFO: 127.0.0.1:45834 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: Completion(id='eb486d0a32fd4384baba923f3bc17e8b', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730106172, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))</strong>" "<strong style='color: #00008B;'>Response: Completion(id='1c988750627649f8872965d00cc008d9', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730281572, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
...@@ -276,25 +305,25 @@
},
{
"cell_type": "code",
"execution_count": 5, "execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-30 09:46:15 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:52 TP0] Decode batch. #running-req: 1, #token: 15, token usage: 0.00, gen throughput (token/s): 40.91, #queue-req: 0\n", "[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 16, token usage: 0.00, gen throughput (token/s): 12.28, #queue-req: 0\n",
"[2024-10-28 02:02:53 TP0] Decode batch. #running-req: 1, #token: 55, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n", "[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 56, token usage: 0.00, gen throughput (token/s): 135.70, #queue-req: 0\n",
"[2024-10-28 02:02:54 TP0] Decode batch. #running-req: 1, #token: 95, token usage: 0.00, gen throughput (token/s): 42.10, #queue-req: 0\n", "[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 96, token usage: 0.00, gen throughput (token/s): 134.45, #queue-req: 0\n",
"[2024-10-28 02:02:55 TP0] Decode batch. #running-req: 1, #token: 135, token usage: 0.00, gen throughput (token/s): 41.94, #queue-req: 0\n", "[2024-10-30 09:46:16 TP0] Decode batch. #running-req: 1, #token: 136, token usage: 0.00, gen throughput (token/s): 133.34, #queue-req: 0\n",
"[2024-10-28 02:02:55] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" "[2024-10-30 09:46:16] INFO: 127.0.0.1:45834 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Response: Completion(id='fb23a12a15bc4137815b91d63b6fd976', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" Here is a short story about a space explorer named Astrid.\\nAstrid had always been fascinated by the stars. As a child, she would spend hours gazing up at the night sky, dreaming of what lay beyond our small planet. Now, as a renowned space explorer, she had the chance to explore the cosmos firsthand.\\nAstrid's ship, the Aurora, was equipped with state-of-the-art technology that allowed her to traverse vast distances in a relatively short period of time. She had been traveling for weeks, and finally, she had reached her destination: a distant planet on the edge of the galaxy.\\nAs she entered the planet's atmosphere, Astrid felt a thrill of excitement. She had never seen anything like this before.\", matched_stop=None)], created=1730106175, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))</strong>" "<strong style='color: #00008B;'>Response: Completion(id='784041b9af634537a7960a0ba6152ba2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\"\\xa0\\nOnce upon a time, in a distant corner of the universe, there was a brave space explorer named Captain Orion. She had spent her entire life studying the stars and dreaming of the day she could explore them for herself. Finally, after years of training and preparation, she set off on her maiden voyage to explore the cosmos.\\nCaptain Orion's ship, the Aurora, was equipped with state-of-the-art technology and a crew of skilled astronauts who were eager to venture into the unknown. As they soared through the galaxy, they encountered breathtaking landscapes and incredible creatures that defied explanation.\\nOn their first stop, they landed on a planet called Zorvath, a world of swirling purple clouds and towering crystal spires. Captain Orion and her crew mar\", matched_stop=None)], created=1730281576, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
...@@ -1015,21 +1044,9 @@
},
{
"cell_type": "code",
"execution_count": 10, "execution_count": 7,
"metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:36] INFO: Shutting down\n",
"[2024-10-28 02:03:36] INFO: Waiting for application shutdown.\n",
"[2024-10-28 02:03:36] INFO: Application shutdown complete.\n",
"[2024-10-28 02:03:36] INFO: Finished server process [1185529]\n",
"W1028 02:03:37.084000 140231994889792 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
]
}
],
"source": [ "source": [
"terminate_process(server_process)" "terminate_process(server_process)"
] ]
...@@ -1037,7 +1054,7 @@ ...@@ -1037,7 +1054,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "AlphaMeemory", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
...@@ -1051,7 +1068,7 @@ ...@@ -1051,7 +1068,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.7" "version": "3.10.12"
} }
}, },
"nbformat": 4, "nbformat": 4,
......
...@@ -6,7 +6,7 @@
"source": [
"# Quick Start: Launch A Server and Send Requests\n",
"\n",
"This section provides a quick start guide to using SGLang after installation." "This notebook provides a quick-start guide for using SGLang after installation."
]
},
{
...@@ -34,39 +34,37 @@
"name": "stdout",
"output_type": "stream",
"text": [
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", "[2024-10-30 09:32:30] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=335520337, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
" warnings.warn(\n", "[2024-10-30 09:32:39 TP0] Init torch distributed begin.\n",
"[2024-10-29 21:14:13] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=518055348, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", "[2024-10-30 09:32:43 TP0] Load weight begin. avail mem=76.83 GB\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", "[2024-10-30 09:32:43 TP0] lm_eval is not installed, GPTQ may not be usable\n",
" warnings.warn(\n", "INFO 10-30 09:32:43 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"[2024-10-29 21:14:19 TP0] Init torch distributed begin.\n",
"[2024-10-29 21:14:20 TP0] Load weight begin. avail mem=47.27 GB\n",
"[2024-10-29 21:14:21 TP0] lm_eval is not installed, GPTQ may not be usable\n",
"INFO 10-29 21:14:21 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n", "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n",
"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.32it/s]\n", "Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:05, 1.78s/it]\n",
"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.28it/s]\n", "Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.78s/it]\n",
"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:01<00:00, 3.27it/s]\n", "Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:05<00:01, 1.80s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.87it/s]\n", "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.30s/it]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.78it/s]\n", "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.48s/it]\n",
"\n", "\n",
"[2024-10-29 21:14:24 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n", "[2024-10-30 09:32:49 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
"[2024-10-29 21:14:24 TP0] Memory pool end. avail mem=4.60 GB\n", "[2024-10-30 09:32:49 TP0] Memory pool end. avail mem=8.19 GB\n",
"[2024-10-29 21:14:24 TP0] Capture cuda graph begin. This can take up to several minutes.\n", "[2024-10-30 09:32:51 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
"[2024-10-29 21:14:32 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n", "[2024-10-30 09:32:59 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-10-29 21:14:32] INFO: Started server process [2661188]\n", "[2024-10-30 09:33:00] INFO: Started server process [227758]\n",
"[2024-10-29 21:14:32] INFO: Waiting for application startup.\n", "[2024-10-30 09:33:00] INFO: Waiting for application startup.\n",
"[2024-10-29 21:14:32] INFO: Application startup complete.\n", "[2024-10-30 09:33:00] INFO: Application startup complete.\n",
"[2024-10-29 21:14:32] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n", "[2024-10-30 09:33:00] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
"[2024-10-29 21:14:32] INFO: 127.0.0.1:49888 - \"GET /v1/models HTTP/1.1\" 200 OK\n" "[2024-10-30 09:33:01] INFO: 127.0.0.1:49220 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
"[2024-10-30 09:33:01] INFO: 127.0.0.1:49236 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-10-30 09:33:01 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-30 09:33:01] INFO: 127.0.0.1:49240 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-10-30 09:33:01] The server is fired up and ready to roll!\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'><br> Server and notebook outputs are combined for clarity.<br> <br> Typically, the server runs in a separate terminal.<br> <br> Server output is gray; notebook output is highlighted.<br> </strong>" "<strong style='color: #00008B;'><br><br> NOTE: Typically, the server runs in a separate terminal.<br> In this notebook, we run the server and notebook code together, so their outputs are combined.<br> To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br> </strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
...@@ -84,9 +82,8 @@
" print_highlight,\n",
")\n",
"\n",
"\n",
"server_process = execute_shell_command(\n",
" \"\"\"\n", "\"\"\"\n",
"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
"--port 30000 --host 0.0.0.0\n",
"\"\"\"\n",
...@@ -101,7 +98,7 @@
"source": [
"## Send a Request\n",
"\n",
"Once the server is running, you can send test requests using curl." "Once the server is running, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/chat)."
]
},
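The exact curl command issued in this cell is collapsed out of the hunk above. As a rough equivalent of that request against the OpenAI-compatible endpoint, the same call can be sketched with Python's requests library; the question text and payload fields below are illustrative assumptions, not the notebook's actual command:

```python
# Hedged sketch: POST a chat completion to the locally launched server.
# The prompt and payload contents are assumptions for illustration only.
import requests

url = "http://localhost:30000/v1/chat/completions"  # server started above on port 30000
payload = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [{"role": "user", "content": "What is an LLM?"}],
}
response = requests.post(url, json=payload)
print(response.json()["choices"][0]["message"]["content"])
```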
{
...@@ -113,30 +110,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-29 21:14:32 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" "[2024-10-30 09:34:00 TP0] Prefill batch. #new-seq: 1, #new-token: 46, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
] "[2024-10-30 09:34:00 TP0] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, gen throughput (token/s): 0.65, #queue-req: 0\n",
}, "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 120, token usage: 0.00, gen throughput (token/s): 139.05, #queue-req: 0\n",
{ "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 160, token usage: 0.00, gen throughput (token/s): 137.75, #queue-req: 0\n",
"name": "stdout", "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 200, token usage: 0.00, gen throughput (token/s): 137.59, #queue-req: 0\n",
"output_type": "stream", "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 240, token usage: 0.00, gen throughput (token/s): 137.62, #queue-req: 0\n",
"text": [ "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 280, token usage: 0.00, gen throughput (token/s): 137.61, #queue-req: 0\n",
"[2024-10-29 21:14:33] INFO: 127.0.0.1:49914 - \"GET /get_model_info HTTP/1.1\" 200 OK\n", "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 320, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n",
"[2024-10-29 21:14:33 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n", "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 360, token usage: 0.00, gen throughput (token/s): 137.51, #queue-req: 0\n",
"[2024-10-29 21:14:33] INFO: 127.0.0.1:49916 - \"POST /generate HTTP/1.1\" 200 OK\n", "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 400, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
"[2024-10-29 21:14:33] The server is fired up and ready to roll!\n", "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 440, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n",
"[2024-10-29 21:14:33 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 27.00, #queue-req: 0\n", "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 480, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
"[2024-10-29 21:14:34 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 42.50, #queue-req: 0\n", "[2024-10-30 09:34:04 TP0] Decode batch. #running-req: 1, #token: 520, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
"[2024-10-29 21:14:35 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 42.31, #queue-req: 0\n", "[2024-10-30 09:34:04] INFO: 127.0.0.1:54110 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
"[2024-10-29 21:14:36 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 42.29, #queue-req: 0\n", "{\"id\":\"a53e18ead1314ab0a2cec76cef484c11\",\"object\":\"chat.completion\",\"created\":1730280844,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) model that is designed to process and understand human language in a way that's similar to how humans do. \\n\\nLLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and context within language. This training enables them to generate human-like responses to a wide range of questions, prompts, and topics.\\n\\nSome common characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend the meaning and context of language, including nuances like idioms, sarcasm, and figurative language.\\n2. **Language generation**: LLMs can generate text that's coherent, contextually relevant, and often engaging.\\n3. **Knowledge retrieval**: LLMs can access and retrieve information from their vast training datasets, allowing them to answer questions and provide information on a wide range of topics.\\n4. **Conversational dialogue**: LLMs can engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLLMs have many applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Language translation**: LLMs can translate languages in real-time, with high accuracy.\\n3. **Content generation**: LLMs can generate text, such as articles, emails, and social media posts.\\n4. **Chatbots**: LLMs can power chatbots that provide customer support, answer questions, and engage in conversations.\\n\\nSome popular examples of LLMs include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a widely used LLM that's been trained on a massive dataset of text.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: Developed by Facebook AI, RoBERTa is another popular LLM that's been trained on a large dataset of text.\\n3. **Language models from OpenAI**: OpenAI has developed a range of LLMs, including GPT-3 (Generative Pre-trained Transformer 3), which is one of the most advanced LLMs available today.\\n\\nOverall, LLMs have the potential to revolutionize the way we interact with language and information, making it easier to access and understand complex topics, and opening up new possibilities for language-based applications.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":539,\"completion_tokens\":492,\"prompt_tokens_details\":null}}"
"[2024-10-29 21:14:37 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
"[2024-10-29 21:14:38 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
"[2024-10-29 21:14:39 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 42.30, #queue-req: 0\n",
"[2024-10-29 21:14:40 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 42.32, #queue-req: 0\n",
"[2024-10-29 21:14:41 TP0] Decode batch. #running-req: 1, #token: 407, token usage: 0.00, gen throughput (token/s): 42.23, #queue-req: 0\n",
"[2024-10-29 21:14:42 TP0] Decode batch. #running-req: 1, #token: 447, token usage: 0.00, gen throughput (token/s): 42.25, #queue-req: 0\n",
"[2024-10-29 21:14:43 TP0] Decode batch. #running-req: 1, #token: 487, token usage: 0.00, gen throughput (token/s): 42.22, #queue-req: 0\n",
"[2024-10-29 21:14:43] INFO: 127.0.0.1:49902 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
"{\"id\":\"0635a1c4d1d940f597b11482bed9595f\",\"object\":\"chat.completion\",\"created\":1730261683,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and understand human language. LLMs are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models like myself use natural language processing (NLP) and machine learning algorithms to analyze and generate human-like text. This enables us to:\\n\\n1. **Answer questions**: Provide information on a wide range of topics, from general knowledge to specialized domains.\\n2. **Generate text**: Create coherent and contextually relevant text, such as articles, essays, or even entire stories.\\n3. **Translate languages**: Translate text from one language to another, helping to break language barriers.\\n4. **Summarize content**: Condense long pieces of text into shorter, more digestible summaries.\\n5. **Chat and converse**: Engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLarge language models are typically trained on massive datasets, often consisting of billions of parameters and petabytes of text data. This training enables us to learn complex language patterns, nuances, and context, allowing us to provide helpful and informative responses.\\n\\nSome popular examples of large language models include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a foundational model for many language understanding tasks.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: A variant of BERT, developed by Facebook AI, which improved upon the original model's performance.\\n3. **Transformers**: A family of models developed by Google, which includes BERT and other related architectures.\\n\\nThese models have revolutionized the field of natural language processing and have many exciting applications in areas like:\\n\\n1. **Virtual assistants**: Like Siri, Alexa, or myself, which can understand and respond to voice commands.\\n2. **Language translation**: Enabling real-time translation of languages.\\n3. **Content generation**: Creating original text, such as articles, stories, or even entire books.\\n4. **Customer service**: Providing 24/7 support and answering common customer queries.\\n\\nI hope this helps you understand what a Large Language Model is and its capabilities!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":504,\"completion_tokens\":457,\"prompt_tokens_details\":null}}"
]
}
],
...@@ -151,9 +139,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using OpenAI Compatible API\n", "## Using OpenAI Python Client\n",
"\n",
"SGLang supports OpenAI-compatible APIs. Here are Python examples:" "You can also use the OpenAI Python API library to send requests."
]
},
{
...@@ -165,15 +153,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-29 21:14:44 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-30 09:34:06 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-29 21:14:44 TP0] Decode batch. #running-req: 1, #token: 73, token usage: 0.00, gen throughput (token/s): 26.00, #queue-req: 0\n", "[2024-10-30 09:34:07 TP0] Decode batch. #running-req: 1, #token: 71, token usage: 0.00, gen throughput (token/s): 13.51, #queue-req: 0\n",
"[2024-10-29 21:14:45] INFO: 127.0.0.1:52764 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" "[2024-10-30 09:34:07] INFO: 127.0.0.1:42068 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>ChatCompletion(id='994dd35133d34f57951a102c7470464f', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730261685, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>" "<strong style='color: #00008B;'>ChatCompletion(id='0708a0196e524456a1316359f6189e48', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730280847, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
...@@ -186,13 +174,8 @@
"source": [
"import openai\n",
"\n",
"# Always assign an api_key, even if not specified during server initialization.\n",
"# Setting an API key during server initialization is strongly recommended.\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n",
"# Chat completion example\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[\n",
...@@ -209,19 +192,7 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [ "outputs": [],
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-29 21:14:45] INFO: Shutting down\n",
"[2024-10-29 21:14:45] INFO: Waiting for application shutdown.\n",
"[2024-10-29 21:14:45] INFO: Application shutdown complete.\n",
"[2024-10-29 21:14:45] INFO: Finished server process [2661188]\n",
"W1029 21:14:45.740000 139643311699520 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
]
}
],
"source": [ "source": [
"terminate_process(server_process)" "terminate_process(server_process)"
] ]
...@@ -229,7 +200,7 @@ ...@@ -229,7 +200,7 @@
], ],
"metadata": { "metadata": {
"kernelspec": { "kernelspec": {
"display_name": "AlphaMeemory", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
...@@ -243,7 +214,7 @@ ...@@ -243,7 +214,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.7" "version": "3.10.12"
} }
}, },
"nbformat": 4, "nbformat": 4,
......
...@@ -29,5 +29,5 @@ if __name__ == "__main__":
parser.add_argument("--url", type=str, default="http://localhost:30000")
args = parser.parse_args()
response = requests.get(args.url + "/flush_cache") response = requests.post(args.url + "/flush_cache")
assert response.status_code == 200
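With this change the cache-flush script issues a POST instead of a GET, matching the updated server route further down in this diff. A minimal, self-contained sketch of the call, assuming the server is reachable at the script's default --url value:

```python
# Minimal sketch: flush the radix cache over HTTP. After this change the
# endpoint is registered for POST, so a GET request would no longer match it.
import requests

response = requests.post("http://localhost:30000/flush_cache")
assert response.status_code == 200, response.text
```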
...@@ -124,7 +124,7 @@ class ModelRunner:
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
)
server_args.chunked_prefill_size = None
server_args.mem_fraction_static *= 0.95 self.mem_fraction_static *= 0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if self.model_config.hf_config.architectures == [
"Qwen2VLForConditionalGeneration"
......
...@@ -139,7 +139,7 @@ async def get_server_args():
return dataclasses.asdict(tokenizer_manager.server_args)
@app.get("/flush_cache") @app.post("/flush_cache")
async def flush_cache():
"""Flush the radix cache."""
tokenizer_manager.flush_cache()
...@@ -180,7 +180,7 @@ async def get_memory_pool_size():
return ret
except Exception as e:
return JSONResponse( return ORJSONResponse(
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
) )
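The error path in get_memory_pool_size now returns an ORJSONResponse rather than a JSONResponse. A small sketch of the pattern in isolation; the helper below is not part of the diff and exists only to show the shape of the response:

```python
# Sketch of the updated error response. fastapi.responses.ORJSONResponse
# serializes the payload with orjson (the orjson package must be installed).
from http import HTTPStatus

from fastapi.responses import ORJSONResponse


def error_response(e: Exception) -> ORJSONResponse:
    # Mirrors the error-handling pattern shown in the hunk above.
    return ORJSONResponse(
        {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
    )
```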
......
...@@ -19,7 +19,6 @@ from typing import Optional, Union
import numpy as np
import requests
import torch
from IPython.display import HTML, display
from tqdm import tqdm
...@@ -332,14 +331,13 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
headers={"Authorization": "Bearer None"},
)
if response.status_code == 200:
time.sleep(5)
print_highlight(
"""\n
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
""" """
Server and notebook outputs are combined for clarity.
Typically, the server runs in a separate terminal.
Server output is gray; notebook output is highlighted.
"""
)
break
...@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None: ...@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
def terminate_process(process): def terminate_process(process):
"""Safely terminate a process and clean up GPU memory. from sglang.srt.utils import kill_child_process
kill_child_process(process.pid, include_self=True)
Args:
process: subprocess.Popen object to terminate
"""
try:
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
if os.name != "nt":
try:
pgid = os.getpgid(process.pid)
os.killpg(pgid, signal.SIGTERM)
time.sleep(1)
if process.poll() is None:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
pass
else:
process.kill()
process.wait()
except Exception as e:
print(f"Warning: {e}")
finally:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
time.sleep(2)
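After this change, terminate_process delegates the manual cleanup above to kill_child_process from sglang.srt.utils. A minimal sketch of the resulting helper, with the call signature taken from the diff and the docstring wording added here:

```python
# Sketch of the simplified helper after this change; it relies on
# sglang.srt.utils.kill_child_process exactly as shown in the diff.
def terminate_process(process):
    """Safely terminate the launched server process and its children."""
    from sglang.srt.utils import kill_child_process

    kill_child_process(process.pid, include_self=True)
```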
def print_highlight(html_content: str):
......
"""
Install the dependency in CI.
"""
pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.45.2
......
"""
Kill all SGLang processes and free the GPU memory.
"""
kill -9 $(ps aux | grep 'multiprocessing.spawn' | grep -v 'grep' | awk '{print $2}')
kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}')
#!/bin/bash
# This script tags all remote branches starting with 'v' with the same name as the branch, # This script is used for release.
# It tags all remote branches starting with 'v' with the same name as the branch,
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
git fetch origin --prune
......