Unverified commit 61cf00e1, authored by Chayenne, committed by GitHub

change file tree (#1859)


Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
parent b9fd178f
@@ -38,14 +38,8 @@ jobs:
         GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
       run: |
         cd docs
-        for nb in *.ipynb; do
-          if [ -f "$nb" ]; then
-            echo "Executing $nb"
-            jupyter nbconvert --to notebook --execute --inplace "$nb" \
-              --ExecutePreprocessor.timeout=600 \
-              --ExecutePreprocessor.kernel_name=python3
-          fi
-        done
+        make clean
+        make compile
         make html
         cd _build/html
......
@@ -44,11 +44,5 @@ jobs:
     - name: Execute notebooks
       run: |
         cd docs
-        for nb in *.ipynb; do
-          if [ -f "$nb" ]; then
-            echo "Executing $nb"
-            jupyter nbconvert --to notebook --execute --inplace "$nb" \
-              --ExecutePreprocessor.timeout=600 \
-              --ExecutePreprocessor.kernel_name=python3
-          fi
-        done
\ No newline at end of file
+        make clean
+        make compile
\ No newline at end of file
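For reference, the command the two workflows previously inlined (and which the new `make compile` target below wraps) can be run by hand against a single notebook. A minimal sketch; `example.ipynb` is a placeholder name, not a file from this repo:

```bash
# Execute one notebook in place, mirroring the removed workflow step above.
# "example.ipynb" is a hypothetical notebook path.
jupyter nbconvert --to notebook --execute --inplace "example.ipynb" \
  --ExecutePreprocessor.timeout=600 \
  --ExecutePreprocessor.kernel_name=python3
```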
@@ -40,13 +40,13 @@ The core features include:
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

 ## Install
-See [https://sgl-project.github.io/install.html](https://sgl-project.github.io/install.html)
+See [https://sgl-project.github.io/starts/install.html](https://sgl-project.github.io/starts/install.html)

 ## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend.html](https://sgl-project.github.io/backend.html)
+See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)

 ## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend.html](https://sgl-project.github.io/frontend.html)
+See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)

 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
......
@@ -12,7 +12,18 @@ BUILDDIR = _build
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

+# New target to compile Markdown and Jupyter Notebook files
+compile:
+	find $(SOURCEDIR) -name '*.ipynb' | while read nb; do \
+		if [ -f "$$nb" ]; then \
+			echo "Executing $$nb"; \
+			jupyter nbconvert --to notebook --execute --inplace "$$nb" \
+				--ExecutePreprocessor.timeout=600 \
+				--ExecutePreprocessor.kernel_name=python3; \
+		fi; \
+	done
+
-.PHONY: help Makefile
+.PHONY: help Makefile compile

 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
......
@@ -30,47 +30,181 @@
 {
 "cell_type": "code",
 "execution_count": 1,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:47:32.337369Z",
+"iopub.status.busy": "2024-11-01T02:47:32.337032Z",
+"iopub.status.idle": "2024-11-01T02:47:59.540926Z",
+"shell.execute_reply": "2024-11-01T02:47:59.539861Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
 "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
-" warnings.warn(\n",
-"[2024-10-29 21:07:15] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=568040040, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
+" warnings.warn(\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:37] server_args=ServerArgs(model_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_path='Alibaba-NLP/gte-Qwen2-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='Alibaba-NLP/gte-Qwen2-7B-instruct', chat_template=None, is_embedding=True, host='0.0.0.0', port=30010, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=314021918, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
 "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
 " warnings.warn(\n",
 "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
-" warnings.warn(\n",
-"[2024-10-29 21:07:20 TP0] Init torch distributed begin.\n",
-"[2024-10-29 21:07:20 TP0] Load weight begin. avail mem=47.27 GB\n",
-"[2024-10-29 21:07:21 TP0] lm_eval is not installed, GPTQ may not be usable\n",
-"INFO 10-29 21:07:22 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
-"Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00<?, ?it/s]\n",
-"Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:00<00:03, 1.65it/s]\n",
-"Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:01<00:04, 1.02it/s]\n",
-"Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:03<00:04, 1.24s/it]\n",
-"Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:05<00:04, 1.47s/it]\n",
-"Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:07<00:03, 1.62s/it]\n",
-"Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:08<00:01, 1.64s/it]\n",
-"Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:10<00:00, 1.63s/it]\n",
-"Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:10<00:00, 1.49s/it]\n",
+" warnings.warn(\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:43 TP0] Init torch distributed begin.\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:44 TP0] Load weight begin. avail mem=47.27 GB\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:44 TP0] lm_eval is not installed, GPTQ may not be usable\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"INFO 10-31 19:47:45 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00<?, ?it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 14% Completed | 1/7 [00:00<00:03, 1.96it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 29% Completed | 2/7 [00:01<00:03, 1.39it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 43% Completed | 3/7 [00:02<00:03, 1.13it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 57% Completed | 4/7 [00:03<00:02, 1.00it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 71% Completed | 5/7 [00:04<00:02, 1.05s/it]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 86% Completed | 6/7 [00:05<00:01, 1.09s/it]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:07<00:00, 1.11s/it]\n",
+"\r",
+"Loading safetensors checkpoint shards: 100% Completed | 7/7 [00:07<00:00, 1.01s/it]\n",
 "\n",
-"[2024-10-29 21:07:32 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=32.91 GB\n",
-"[2024-10-29 21:07:33 TP0] Memory pool end. avail mem=4.56 GB\n",
-"[2024-10-29 21:07:33 TP0] max_total_num_tokens=509971, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
-"[2024-10-29 21:07:33] INFO: Started server process [2650986]\n",
-"[2024-10-29 21:07:33] INFO: Waiting for application startup.\n",
-"[2024-10-29 21:07:33] INFO: Application startup complete.\n",
-"[2024-10-29 21:07:33] INFO: Uvicorn running on http://0.0.0.0:30010 (Press CTRL+C to quit)\n",
-"[2024-10-29 21:07:34] INFO: 127.0.0.1:47812 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:47:53 TP0] Load weight end. type=Qwen2ForCausalLM, dtype=torch.float16, avail mem=32.91 GB\n",
+"[2024-10-31 19:47:53 TP0] Memory pool end. avail mem=4.56 GB\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:53 TP0] max_total_num_tokens=509971, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:54] INFO: Started server process [1552642]\n",
+"[2024-10-31 19:47:54] INFO: Waiting for application startup.\n",
+"[2024-10-31 19:47:54] INFO: Application startup complete.\n",
+"[2024-10-31 19:47:54] INFO: Uvicorn running on http://0.0.0.0:30010 (Press CTRL+C to quit)\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:54] INFO: 127.0.0.1:47776 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:55] INFO: 127.0.0.1:50344 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:47:55 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:47:55] INFO: 127.0.0.1:50352 - \"POST /encode HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:47:55] The server is fired up and ready to roll!\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'><br> This cell combines server and notebook output. <br> <br> Typically, the server runs in a separate terminal, <br> but we combine the output of server and notebook to demonstrate the usage better.<br> <br> In our documentation, server output is in gray, notebook output is highlighted.<br> </strong>"
+"<strong style='color: #00008B;'><br><br> NOTE: Typically, the server runs in a separate terminal.<br> In this notebook, we run the server and notebook code together, so their outputs are combined.<br> To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br> </strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -78,16 +212,6 @@
 },
 "metadata": {},
 "output_type": "display_data"
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"[2024-10-29 21:07:34] INFO: 127.0.0.1:41780 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
-"[2024-10-29 21:07:34 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-29 21:07:35] INFO: 127.0.0.1:41792 - \"POST /encode HTTP/1.1\" 200 OK\n",
-"[2024-10-29 21:07:35] The server is fired up and ready to roll!\n"
-]
 }
 ],
 "source": [
@@ -118,20 +242,21 @@
 {
 "cell_type": "code",
 "execution_count": 2,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:47:59.543958Z",
+"iopub.status.busy": "2024-11-01T02:47:59.543670Z",
+"iopub.status.idle": "2024-11-01T02:47:59.591699Z",
+"shell.execute_reply": "2024-11-01T02:47:59.590809Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-28 02:10:30 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
-]
-},
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"[2024-10-28 02:10:31] INFO: 127.0.0.1:48094 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:47:59 TP0] Prefill batch. #new-seq: 1, #new-token: 4, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+"[2024-10-31 19:47:59] INFO: 127.0.0.1:50358 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
 ]
 },
 {
@@ -174,18 +299,21 @@
 {
 "cell_type": "code",
 "execution_count": 3,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:47:59.594229Z",
+"iopub.status.busy": "2024-11-01T02:47:59.594049Z",
+"iopub.status.idle": "2024-11-01T02:48:00.006233Z",
+"shell.execute_reply": "2024-11-01T02:48:00.005255Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-28 02:10:31] INFO: 127.0.0.1:48110 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
-"[2024-10-28 02:10:31 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-28 02:10:31] INFO: 127.0.0.1:48114 - \"POST /encode HTTP/1.1\" 200 OK\n",
-"[2024-10-28 02:10:31] The server is fired up and ready to roll!\n",
-"[2024-10-28 02:10:31 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-28 02:10:31] INFO: 127.0.0.1:48118 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:47:59 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 21.43%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+"[2024-10-31 19:47:59] INFO: 127.0.0.1:50362 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
 ]
 },
 {
@@ -228,13 +356,20 @@
 {
 "cell_type": "code",
 "execution_count": 4,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:48:00.008858Z",
+"iopub.status.busy": "2024-11-01T02:48:00.008689Z",
+"iopub.status.idle": "2024-11-01T02:48:01.872542Z",
+"shell.execute_reply": "2024-11-01T02:48:01.871573Z"
+}
+},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
 " warnings.warn(\n"
 ]
 },
@@ -242,8 +377,8 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-28 02:10:32 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-28 02:10:32] INFO: 127.0.0.1:48124 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:48:01 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 3, cache hit rate: 33.33%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+"[2024-10-31 19:48:01] INFO: 127.0.0.1:50366 - \"POST /v1/embeddings HTTP/1.1\" 200 OK\n"
 ]
 },
 {
@@ -284,20 +419,15 @@
 {
 "cell_type": "code",
 "execution_count": 5,
-"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"[2024-10-28 02:10:32] INFO: Shutting down\n",
-"[2024-10-28 02:10:32] INFO: Waiting for application shutdown.\n",
-"[2024-10-28 02:10:32] INFO: Application shutdown complete.\n",
-"[2024-10-28 02:10:32] INFO: Finished server process [1188896]\n",
-"W1028 02:10:32.490000 140389363193408 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
-]
-}
-],
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:48:01.875204Z",
+"iopub.status.busy": "2024-11-01T02:48:01.874915Z",
+"iopub.status.idle": "2024-11-01T02:48:02.193734Z",
+"shell.execute_reply": "2024-11-01T02:48:02.192158Z"
+}
+},
+"outputs": [],
 "source": [
 "terminate_process(embedding_process)"
 ]
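The embedding notebook above launches an SGLang server with `is_embedding=True` on port 30010 and queries its OpenAI-compatible `/v1/embeddings` route. A hedged smoke test of the same endpoint (model name and port taken from the logs above; the input string is illustrative):

```bash
# Assumes the embedding server from the notebook is still running locally.
curl -s http://localhost:30010/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{"model": "Alibaba-NLP/gte-Qwen2-7B-instruct", "input": "Once upon a time"}'
```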
......
@@ -30,41 +30,140 @@
 {
 "cell_type": "code",
 "execution_count": 1,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:44:46.419815Z",
+"iopub.status.busy": "2024-11-01T02:44:46.419509Z",
+"iopub.status.idle": "2024-11-01T02:45:16.621648Z",
+"shell.execute_reply": "2024-11-01T02:45:16.620659Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"2024-10-30 09:44:20.477109: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
-"2024-10-30 09:44:20.489679: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
-"2024-10-30 09:44:20.489712: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
-"2024-10-30 09:44:21.010067: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
-"[2024-10-30 09:44:29] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=134920821, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
-"[2024-10-30 09:44:39 TP0] Init torch distributed begin.\n",
-"[2024-10-30 09:44:41 TP0] Load weight begin. avail mem=76.83 GB\n",
-"[2024-10-30 09:44:42 TP0] lm_eval is not installed, GPTQ may not be usable\n",
-"INFO 10-30 09:44:42 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
-"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n",
-"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:05, 1.77s/it]\n",
-"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.77s/it]\n",
-"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:05<00:01, 1.77s/it]\n",
-"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.27s/it]\n",
-"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.45s/it]\n",
+"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+" warnings.warn(\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:44:51] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=357249111, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+" warnings.warn(\n",
+"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+" warnings.warn(\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:44:57 TP0] Init torch distributed begin.\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:44:58 TP0] Load weight begin. avail mem=47.27 GB\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:44:59 TP0] lm_eval is not installed, GPTQ may not be usable\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"INFO 10-31 19:44:59 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.26it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.25it/s]\n",
+"\r",
+"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:01<00:00, 3.24it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.70it/s]\n",
+"\r",
+"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.67it/s]\n",
 "\n",
-"[2024-10-30 09:44:48 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
-"[2024-10-30 09:44:48 TP0] Memory pool end. avail mem=8.19 GB\n",
-"[2024-10-30 09:44:49 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
-"[2024-10-30 09:44:58 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
-"[2024-10-30 09:44:58] INFO: Started server process [231459]\n",
-"[2024-10-30 09:44:58] INFO: Waiting for application startup.\n",
-"[2024-10-30 09:44:58] INFO: Application startup complete.\n",
-"[2024-10-30 09:44:58] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
-"[2024-10-30 09:44:59] INFO: 127.0.0.1:54650 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
-"[2024-10-30 09:44:59] INFO: 127.0.0.1:54666 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
-"[2024-10-30 09:44:59 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:44:59] INFO: 127.0.0.1:54672 - \"POST /generate HTTP/1.1\" 200 OK\n",
-"[2024-10-30 09:44:59] The server is fired up and ready to roll!\n"
+"[2024-10-31 19:45:01 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
+"[2024-10-31 19:45:02 TP0] Memory pool end. avail mem=4.60 GB\n",
+"[2024-10-31 19:45:02 TP0] Capture cuda graph begin. This can take up to several minutes.\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:10 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:10] INFO: Started server process [1543025]\n",
+"[2024-10-31 19:45:10] INFO: Waiting for application startup.\n",
+"[2024-10-31 19:45:10] INFO: Application startup complete.\n",
+"[2024-10-31 19:45:10] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:11] INFO: 127.0.0.1:35048 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:11] INFO: 127.0.0.1:35056 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:11 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:11] INFO: 127.0.0.1:35066 - \"POST /generate HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:11] The server is fired up and ready to roll!\n"
 ]
 },
 {
@@ -98,21 +197,40 @@
 {
 "cell_type": "code",
 "execution_count": 2,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:16.624550Z",
+"iopub.status.busy": "2024-11-01T02:45:16.624258Z",
+"iopub.status.idle": "2024-11-01T02:45:18.087455Z",
+"shell.execute_reply": "2024-11-01T02:45:18.086450Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:45:52 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:45:53 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 0.73, #queue-req: 0\n",
-"[2024-10-30 09:45:53] INFO: 127.0.0.1:55594 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:45:16 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:17 TP0] Decode batch. #running-req: 1, #token: 82, token usage: 0.00, gen throughput (token/s): 5.21, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:18] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Response: ChatCompletion(id='876500c402ae452ea17e4dde415c108a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730281553, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
+"<strong style='color: #00008B;'>Response: ChatCompletion(id='e04fce6c460d4764af68007fc82763e1', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730429118, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -154,23 +272,54 @@
 {
 "cell_type": "code",
 "execution_count": 3,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:18.090228Z",
+"iopub.status.busy": "2024-11-01T02:45:18.090071Z",
+"iopub.status.idle": "2024-11-01T02:45:21.193221Z",
+"shell.execute_reply": "2024-11-01T02:45:21.192539Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:45:57 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:45:57 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 8.70, #queue-req: 0\n",
-"[2024-10-30 09:45:58 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 132.75, #queue-req: 0\n",
-"[2024-10-30 09:45:58 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 132.30, #queue-req: 0\n",
-"[2024-10-30 09:45:58] INFO: 127.0.0.1:55594 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:45:18 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:18 TP0] Decode batch. #running-req: 1, #token: 104, token usage: 0.00, gen throughput (token/s): 39.15, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:19 TP0] Decode batch. #running-req: 1, #token: 144, token usage: 0.00, gen throughput (token/s): 41.80, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:20 TP0] Decode batch. #running-req: 1, #token: 184, token usage: 0.00, gen throughput (token/s): 41.81, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:21] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Ancient Rome's major achievements include:<br><br>1. **Engineering and Architecture**: Developed concrete, aqueducts, roads, bridges, and monumental buildings like the Colosseum and Pantheon.<br>2. **Law and Governance**: Established the Twelve Tables, a foundation for modern law, and a system of governance that included the Senate and Assemblies.<br>3. **Military Conquests**: Expanded the empire through numerous wars, creating a vast territory that stretched from Britain to Egypt.<br>4. **Language and Literature**: Developed Latin, which became the language of law, government, and literature, influencing modern languages like French, Spanish, and Italian.<br></strong>"
+"<strong style='color: #00008B;'>Ancient Rome's major achievements include:<br><br>1. **Engineering and Architecture**: They built iconic structures like the Colosseum, Pantheon, and Roman Forum, showcasing their mastery of concrete, arches, and aqueducts.<br>2. **Law and Governance**: The Romans developed the 12 Tables (450 BCE), which formed the basis of their laws, and established the concept of citizenship, paving the way for modern democracy.<br>3. **Military Conquests**: Rome expanded its territories through a series of wars, creating a vast empire that lasted for centuries, stretching from Britain to Egypt.<br>4. **Language and Literature**: Latin became</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -217,16 +366,50 @@
 {
 "cell_type": "code",
 "execution_count": 4,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:21.195226Z",
+"iopub.status.busy": "2024-11-01T02:45:21.194680Z",
+"iopub.status.idle": "2024-11-01T02:45:21.675473Z",
+"shell.execute_reply": "2024-11-01T02:45:21.675050Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:46:06] INFO: 127.0.0.1:45834 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
-"[2024-10-30 09:46:06 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"It looks like you're getting started with our conversation. I'm happy to chat with you and see how[2024-10-30 09:46:06 TP0] Decode batch. #running-req: 1, #token: 61, token usage: 0.00, gen throughput (token/s): 4.78, #queue-req: 0\n",
-" things go. What would you like to talk about?"
+"[2024-10-31 19:45:21] INFO: 127.0.0.1:37738 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:21 TP0] Prefill batch. #new-seq: 1, #new-token: 15, #cached-token: 25, cache hit rate: 31.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+"It looks like you're ready to"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" begin"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+". What kind of test would you like"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" to"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+" conduct?"
 ]
 }
 ],
@@ -255,21 +438,41 @@
 {
 "cell_type": "code",
 "execution_count": 5,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:21.676813Z",
+"iopub.status.busy": "2024-11-01T02:45:21.676665Z",
+"iopub.status.idle": "2024-11-01T02:45:23.182104Z",
+"shell.execute_reply": "2024-11-01T02:45:23.181695Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:46:11 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:46:12 TP0] Decode batch. #running-req: 1, #token: 38, token usage: 0.00, gen throughput (token/s): 7.66, #queue-req: 0\n",
-"[2024-10-30 09:46:12] INFO: 127.0.0.1:45834 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:45:21 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 30.39%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+"[2024-10-31 19:45:21 TP0] Decode batch. #running-req: 1, #token: 11, token usage: 0.00, gen throughput (token/s): 39.18, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:22 TP0] Decode batch. #running-req: 1, #token: 51, token usage: 0.00, gen throughput (token/s): 42.85, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:23] INFO: 127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Response: Completion(id='1c988750627649f8872965d00cc008d9', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730281572, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
+"<strong style='color: #00008B;'>Response: Completion(id='84ca7b4df182449697c4b38a454b8834', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States Washington D.C. 2. Japan Tokyo 3. Australia Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China Beijing 2. Brazil Bras', matched_stop=None)], created=1730429123, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -306,24 +509,61 @@
 {
 "cell_type": "code",
 "execution_count": 6,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:23.186337Z",
+"iopub.status.busy": "2024-11-01T02:45:23.186189Z",
+"iopub.status.idle": "2024-11-01T02:45:26.769744Z",
+"shell.execute_reply": "2024-11-01T02:45:26.769299Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:46:15 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 16, token usage: 0.00, gen throughput (token/s): 12.28, #queue-req: 0\n",
-"[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 56, token usage: 0.00, gen throughput (token/s): 135.70, #queue-req: 0\n",
-"[2024-10-30 09:46:15 TP0] Decode batch. #running-req: 1, #token: 96, token usage: 0.00, gen throughput (token/s): 134.45, #queue-req: 0\n",
-"[2024-10-30 09:46:16 TP0] Decode batch. #running-req: 1, #token: 136, token usage: 0.00, gen throughput (token/s): 133.34, #queue-req: 0\n",
-"[2024-10-30 09:46:16] INFO: 127.0.0.1:45834 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:45:23 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 29.32%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:23 TP0] Decode batch. #running-req: 1, #token: 29, token usage: 0.00, gen throughput (token/s): 40.76, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:24 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:25 TP0] Decode batch. #running-req: 1, #token: 109, token usage: 0.00, gen throughput (token/s): 42.01, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:26 TP0] Decode batch. #running-req: 1, #token: 149, token usage: 0.00, gen throughput (token/s): 41.87, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:45:26] INFO: 127.0.0.1:37738 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Response: Completion(id='784041b9af634537a7960a0ba6152ba2', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\"\\xa0\\nOnce upon a time, in a distant corner of the universe, there was a brave space explorer named Captain Orion. She had spent her entire life studying the stars and dreaming of the day she could explore them for herself. Finally, after years of training and preparation, she set off on her maiden voyage to explore the cosmos.\\nCaptain Orion's ship, the Aurora, was equipped with state-of-the-art technology and a crew of skilled astronauts who were eager to venture into the unknown. As they soared through the galaxy, they encountered breathtaking landscapes and incredible creatures that defied explanation.\\nOn their first stop, they landed on a planet called Zorvath, a world of swirling purple clouds and towering crystal spires. Captain Orion and her crew mar\", matched_stop=None)], created=1730281576, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
+"<strong style='color: #00008B;'>Response: Completion(id='fe384c17aece4a5ca5fb5238dcd1adec', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" This can be a sci-fi story, and you have the ability to create a unique and imaginative universe.\\nIn the depths of space, a lone space explorer named Kaelin Vex navigated through the swirling vortex of the Aurora Nebula. Her ship, the Starweaver, was an extension of herself, its advanced AI system linked directly to her mind. Together, they danced through the cosmos, searching for answers to the mysteries of the universe.\\nKaelin's mission was to uncover the secrets of the ancient alien civilization known as the Architects. Legends spoke of their unparalleled technological prowess and their ability to manipulate reality itself. Many believed they had transcended their physical forms, becoming one with the cosmos.\\nAs Kaelin delved deeper into\", matched_stop=None)], created=1730429126, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -369,22 +609,29 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
-"metadata": {},
+"execution_count": 7,
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:26.772016Z",
+"iopub.status.busy": "2024-11-01T02:45:26.771868Z",
+"iopub.status.idle": "2024-11-01T02:45:26.794225Z",
+"shell.execute_reply": "2024-11-01T02:45:26.793811Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
-"[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
-"[2024-10-28 02:02:55 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+"[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:26] INFO: 127.0.0.1:57182 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:26 TP0] Prefill batch. #new-seq: 2, #new-token: 20, #cached-token: 60, cache hit rate: 42.80%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Batch job created with ID: batch_56fefd2e-0187-4c14-aa2d-110917723dde</strong>"
+"<strong style='color: #00008B;'>Batch job created with ID: batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -446,19 +693,32 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
-"metadata": {},
+"execution_count": 8,
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:26.796422Z",
+"iopub.status.busy": "2024-11-01T02:45:26.796273Z",
+"iopub.status.idle": "2024-11-01T02:45:29.810471Z",
+"shell.execute_reply": "2024-11-01T02:45:29.810041Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-28 02:02:56 TP0] Decode batch. #running-req: 2, #token: 82, token usage: 0.00, gen throughput (token/s): 55.10, #queue-req: 0\n",
+"[2024-10-31 19:45:27 TP0] Decode batch. #running-req: 1, #token: 69, token usage: 0.00, gen throughput (token/s): 51.72, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
 "Batch job status: validating...trying again in 3 seconds...\n",
-"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/batches/batch_56fefd2e-0187-4c14-aa2d-110917723dde HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/batches/batch_d9af5b49-ad3d-423e-8c30-4aaafa5c18c4 HTTP/1.1\" 200 OK\n",
 "Batch job completed successfully!\n",
 "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
-"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4/content HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"GET /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b/content HTTP/1.1\" 200 OK\n"
 ]
 },
 {
@@ -476,7 +736,7 @@
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'A programmer walks into a library and asks the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replies, \"It rings a bell, but I\\'m not sure if it\\'s here'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 50, 'total_tokens': 91}, 'system_fingerprint': None}}</strong>"
+"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'Why do programmers prefer dark mode?\\n\\nBecause light attracts bugs.'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 13, 'total_tokens': 54}, 'system_fingerprint': None}}</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -500,7 +760,7 @@
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n1. **Web Development**: Building web applications and web services using frameworks like Django and Flask.\\n2. **Data Analysis and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}</strong>"
+"<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730429127, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes such as web development, scientific computing, data analysis, artificial intelligence, and more. It was created in the late 1980s by'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -525,7 +785,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"DELETE /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4 HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:45:29] INFO: 127.0.0.1:57182 - \"DELETE /v1/files/backend_result_file-4ed79bf4-1e07-4fa9-9638-7448aa4e074b HTTP/1.1\" 200 OK\n"
 ]
 }
 ],
@@ -574,21 +834,28 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
-"metadata": {},
+"execution_count": 9,
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:45:29.812339Z",
+"iopub.status.busy": "2024-11-01T02:45:29.812198Z",
+"iopub.status.idle": "2024-11-01T02:45:54.851243Z",
+"shell.execute_reply": "2024-11-01T02:45:54.850668Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
-"[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:45:29] INFO: 127.0.0.1:57186 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>Created batch job with ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5</strong>"
+"<strong style='color: #00008B;'>Created batch job with ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
...@@ -613,23 +880,77 @@ ...@@ -613,23 +880,77 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 17, #new-token: 510, #cached-token: 425, cache hit rate: 43.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 27, #new-token: 810, #cached-token: 675, cache hit rate: 45.05%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 83, #new-token: 2490, #cached-token: 2075, cache hit rate: 45.04%, token usage: 0.00, #running-req: 17, #queue-req: 0\n", "[2024-10-31 19:45:29 TP0] Prefill batch. #new-seq: 73, #new-token: 2190, #cached-token: 1825, cache hit rate: 45.33%, token usage: 0.00, #running-req: 27, #queue-req: 0\n"
"[2024-10-28 02:02:59 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 234.43, #queue-req: 0\n", ]
"[2024-10-28 02:03:00 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 3545.41, #queue-req: 0\n", },
"[2024-10-28 02:03:01 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.05, gen throughput (token/s): 3448.10, #queue-req: 0\n", {
"[2024-10-28 02:03:02 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 3362.62, #queue-req: 0\n", "name": "stdout",
"[2024-10-28 02:03:04 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 3279.58, #queue-req: 0\n", "output_type": "stream",
"[2024-10-28 02:03:05 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 3200.86, #queue-req: 0\n", "text": [
"[2024-10-28 02:03:06 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 3126.52, #queue-req: 0\n", "[2024-10-31 19:45:30 TP0] Decode batch. #running-req: 100, #token: 5125, token usage: 0.02, gen throughput (token/s): 636.38, #queue-req: 0\n"
"[2024-10-28 02:03:07 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 3053.16, #queue-req: 0\n", ]
"[2024-10-28 02:03:08] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" },
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:31 TP0] Decode batch. #running-req: 100, #token: 9125, token usage: 0.04, gen throughput (token/s): 3507.97, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:33 TP0] Decode batch. #running-req: 100, #token: 13125, token usage: 0.06, gen throughput (token/s): 3417.06, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:34 TP0] Decode batch. #running-req: 100, #token: 17125, token usage: 0.08, gen throughput (token/s): 3332.03, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:35 TP0] Decode batch. #running-req: 100, #token: 21125, token usage: 0.10, gen throughput (token/s): 3252.29, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:36 TP0] Decode batch. #running-req: 100, #token: 25125, token usage: 0.12, gen throughput (token/s): 3173.87, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:38 TP0] Decode batch. #running-req: 100, #token: 29125, token usage: 0.13, gen throughput (token/s): 3101.31, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:39 TP0] Decode batch. #running-req: 100, #token: 33125, token usage: 0.15, gen throughput (token/s): 3030.90, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:39] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<strong style='color: #00008B;'>Batch job details (check 1 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>" "<strong style='color: #00008B;'>Batch job details (check 1 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.HTML object>" "<IPython.core.display.HTML object>"
...@@ -654,15 +975,27 @@ ...@@ -654,15 +975,27 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:09 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.16, gen throughput (token/s): 2980.26, #queue-req: 0\n", "[2024-10-31 19:45:40 TP0] Decode batch. #running-req: 100, #token: 37125, token usage: 0.17, gen throughput (token/s): 2961.37, #queue-req: 0\n"
"[2024-10-28 02:03:10 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.18, gen throughput (token/s): 2919.09, #queue-req: 0\n", ]
"[2024-10-28 02:03:11] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" },
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:42 TP0] Decode batch. #running-req: 100, #token: 41125, token usage: 0.19, gen throughput (token/s): 2899.29, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:42] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<strong style='color: #00008B;'>Batch job details (check 2 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>" "<strong style='color: #00008B;'>Batch job details (check 2 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.HTML object>" "<IPython.core.display.HTML object>"
...@@ -687,15 +1020,27 @@ ...@@ -687,15 +1020,27 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:11 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.20, gen throughput (token/s): 2854.92, #queue-req: 0\n", "[2024-10-31 19:45:43 TP0] Decode batch. #running-req: 100, #token: 45125, token usage: 0.21, gen throughput (token/s): 2836.50, #queue-req: 0\n"
"[2024-10-28 02:03:13 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.22, gen throughput (token/s): 2794.62, #queue-req: 0\n", ]
"[2024-10-28 02:03:14] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" },
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:45 TP0] Decode batch. #running-req: 100, #token: 49125, token usage: 0.23, gen throughput (token/s): 2777.80, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:45] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<strong style='color: #00008B;'>Batch job details (check 3 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>" "<strong style='color: #00008B;'>Batch job details (check 3 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: in_progress // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: None</strong>"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.HTML object>" "<IPython.core.display.HTML object>"
...@@ -720,14 +1065,13 @@ ...@@ -720,14 +1065,13 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:14 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.24, gen throughput (token/s): 2737.84, #queue-req: 0\n", "[2024-10-31 19:45:48] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
"[2024-10-28 02:03:17] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<strong style='color: #00008B;'>Batch job details (check 4 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>" "<strong style='color: #00008B;'>Batch job details (check 4 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3</strong>"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.HTML object>" "<IPython.core.display.HTML object>"
...@@ -752,13 +1096,13 @@ ...@@ -752,13 +1096,13 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:20] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" "[2024-10-31 19:45:51] INFO: 127.0.0.1:37782 - \"GET /v1/batches/batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<strong style='color: #00008B;'>Batch job details (check 5 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>" "<strong style='color: #00008B;'>Batch job details (check 5 / 5) // ID: batch_3d1a7f8e-af5a-4a14-8391-1001aadfe1b2 // Status: completed // Created at: 1730429129 // Input file ID: backend_input_file-f42b27b5-05ee-4d27-9a37-ff04c3b4a427 // Output file ID: backend_result_file-dc391511-07f2-4f94-90cb-3ed09bc4b8a3</strong>"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.HTML object>" "<IPython.core.display.HTML object>"
...@@ -853,21 +1197,28 @@ ...@@ -853,21 +1197,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 10,
"metadata": {}, "metadata": {
"execution": {
"iopub.execute_input": "2024-11-01T02:45:54.854018Z",
"iopub.status.busy": "2024-11-01T02:45:54.853851Z",
"iopub.status.idle": "2024-11-01T02:46:07.893199Z",
"shell.execute_reply": "2024-11-01T02:46:07.892310Z"
}
},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/files HTTP/1.1\" 200 OK\n", "[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" "[2024-10-31 19:45:54] INFO: 127.0.0.1:33180 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
"data": { "data": {
"text/html": [ "text/html": [
"<strong style='color: #00008B;'>Created batch job with ID: batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62</strong>" "<strong style='color: #00008B;'>Created batch job with ID: batch_c30756c3-8c09-4142-9630-9590d6124986</strong>"
], ],
"text/plain": [ "text/plain": [
"<IPython.core.display.HTML object>" "<IPython.core.display.HTML object>"
...@@ -892,12 +1243,49 @@ ...@@ -892,12 +1243,49 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 44, #new-token: 44, #cached-token: 2376, cache hit rate: 60.81%, token usage: 0.01, #running-req: 0, #queue-req: 0\n", "[2024-10-31 19:45:54 TP0] Prefill batch. #new-seq: 135, #new-token: 1150, #cached-token: 6275, cache hit rate: 67.38%, token usage: 0.01, #running-req: 0, #queue-req: 0\n"
"[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 328, #new-token: 8192, #cached-token: 9824, cache hit rate: 56.49%, token usage: 0.01, #running-req: 44, #queue-req: 128\n", ]
"[2024-10-28 02:03:24 TP0] Prefill batch. #new-seq: 129, #new-token: 3864, #cached-token: 3231, cache hit rate: 54.15%, token usage: 0.05, #running-req: 371, #queue-req: 1\n", },
"[2024-10-28 02:03:27 TP0] Decode batch. #running-req: 500, #token: 29025, token usage: 0.13, gen throughput (token/s): 1162.55, #queue-req: 0\n", {
"[2024-10-28 02:03:31 TP0] Decode batch. #running-req: 500, #token: 49025, token usage: 0.23, gen throughput (token/s): 5606.35, #queue-req: 0\n", "name": "stdout",
"[2024-10-28 02:03:33] INFO: 127.0.0.1:40110 - \"POST /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62/cancel HTTP/1.1\" 200 OK\n" "output_type": "stream",
"text": [
"[2024-10-31 19:45:55 TP0] Prefill batch. #new-seq: 274, #new-token: 8192, #cached-token: 6850, cache hit rate: 55.74%, token usage: 0.02, #running-req: 135, #queue-req: 91\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:56 TP0] Prefill batch. #new-seq: 92, #new-token: 2758, #cached-token: 2302, cache hit rate: 54.19%, token usage: 0.06, #running-req: 408, #queue-req: 1\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:45:56 TP0] Decode batch. #running-req: 500, #token: 16025, token usage: 0.07, gen throughput (token/s): 409.21, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:00 TP0] Decode batch. #running-req: 500, #token: 36025, token usage: 0.17, gen throughput (token/s): 5777.09, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:03 TP0] Decode batch. #running-req: 500, #token: 56025, token usage: 0.26, gen throughput (token/s): 5530.76, #queue-req: 0\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-31 19:46:04] INFO: 127.0.0.1:57728 - \"POST /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986/cancel HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
...@@ -916,7 +1304,7 @@ ...@@ -916,7 +1304,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"GET /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62 HTTP/1.1\" 200 OK\n" "[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"GET /v1/batches/batch_c30756c3-8c09-4142-9630-9590d6124986 HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
...@@ -947,7 +1335,7 @@ ...@@ -947,7 +1335,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"DELETE /v1/files/backend_input_file-2e9608b6-981b-48ec-8adb-e653ffc69106 HTTP/1.1\" 200 OK\n" "[2024-10-31 19:46:07] INFO: 127.0.0.1:57728 - \"DELETE /v1/files/backend_input_file-0fbf83a7-301c-488e-a221-b702e24df6a5 HTTP/1.1\" 200 OK\n"
] ]
}, },
{ {
...@@ -961,12 +1349,25 @@ ...@@ -961,12 +1349,25 @@
}, },
"metadata": {}, "metadata": {},
"output_type": "display_data" "output_type": "display_data"
},
{
"data": {
"text/html": [
"<strong style='color: #00008B;'>Successfully deleted local batch_requests.jsonl file</strong>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
} }
], ],
"source": [ "source": [
"import json\n", "import json\n",
"import time\n", "import time\n",
"from openai import OpenAI\n", "from openai import OpenAI\n",
"import os\n",
"\n", "\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n", "\n",
...@@ -1037,6 +1438,9 @@ ...@@ -1037,6 +1438,9 @@
" del_response = client.files.delete(uploaded_file.id)\n", " del_response = client.files.delete(uploaded_file.id)\n",
" if del_response.deleted:\n", " if del_response.deleted:\n",
" print_highlight(\"Successfully cleaned up input file\")\n", " print_highlight(\"Successfully cleaned up input file\")\n",
" if os.path.exists(input_file_path):\n",
" os.remove(input_file_path)\n",
" print_highlight(\"Successfully deleted local batch_requests.jsonl file\")\n",
" except Exception as e:\n", " except Exception as e:\n",
" print_highlight(f\"Error cleaning up: {e}\")\n", " print_highlight(f\"Error cleaning up: {e}\")\n",
" raise e" " raise e"
...@@ -1044,8 +1448,15 @@ ...@@ -1044,8 +1448,15 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 11,
"metadata": {}, "metadata": {
"execution": {
"iopub.execute_input": "2024-11-01T02:46:07.896114Z",
"iopub.status.busy": "2024-11-01T02:46:07.895820Z",
"iopub.status.idle": "2024-11-01T02:46:09.365287Z",
"shell.execute_reply": "2024-11-01T02:46:09.364705Z"
}
},
"outputs": [], "outputs": [],
"source": [ "source": [
"terminate_process(server_process)" "terminate_process(server_process)"
...@@ -1068,7 +1479,7 @@ ...@@ -1068,7 +1479,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.12" "version": "3.11.7"
} }
}, },
"nbformat": 4, "nbformat": 4,
......
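For readers following this notebook diff, the sequence of logs above (POST /v1/files, POST /v1/batches, repeated GET /v1/batches/{id} status checks, POST /v1/batches/{id}/cancel, DELETE /v1/files/{id}) traces the OpenAI-compatible batch workflow against a local SGLang server. The following is a minimal, self-contained sketch of that workflow using the official `openai` Python client; the prompt text and request payloads are illustrative assumptions, while the endpoints, model name, port, and cleanup steps mirror the logs and source shown above.

# Hedged sketch of the batch workflow exercised in the diff above.
# The prompts and payloads are assumptions for illustration only.
import json
import os
import time

from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="None")

# 1. Write batch requests to a local .jsonl file (one request per line).
requests = [
    {
        "custom_id": f"request-{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "messages": [{"role": "user", "content": "Tell me a programming joke."}],
            "max_tokens": 50,
        },
    }
    for i in range(1, 3)
]
input_file_path = "batch_requests.jsonl"
with open(input_file_path, "w") as f:
    for req in requests:
        f.write(json.dumps(req) + "\n")

# 2. Upload the file and create the batch job
#    (POST /v1/files, then POST /v1/batches, as in the logs above).
with open(input_file_path, "rb") as f:
    uploaded_file = client.files.create(file=f, purpose="batch")
batch = client.batches.create(
    input_file_id=uploaded_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# 3. Poll the job status (GET /v1/batches/{id}); the notebook's
#    "check 1 / 5" ... "check 5 / 5" messages come from a loop like this.
for attempt in range(5):
    batch = client.batches.retrieve(batch.id)
    if batch.status == "completed":
        break
    time.sleep(3)

# 4. Fetch results if completed; a long-running job can instead be
#    cancelled via client.batches.cancel(batch.id)
#    (POST /v1/batches/{id}/cancel in the logs above).
if batch.status == "completed" and batch.output_file_id:
    content = client.files.content(batch.output_file_id)
    for line in content.text.strip().split("\n"):
        print(json.loads(line)["response"])

# 5. Clean up the server-side input file (DELETE /v1/files/{id}) and the
#    local .jsonl, matching the cleanup step this commit adds.
client.files.delete(uploaded_file.id)
if os.path.exists(input_file_path):
    os.remove(input_file_path)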
# Deploy the documents # Deploy the documents
import os import os
from datetime import datetime from datetime import datetime
def run_cmd(cmd): def run_cmd(cmd):
print(cmd) print(cmd)
os.system(cmd) os.system(cmd)
run_cmd("cd $DOC_SITE_PATH; git pull") run_cmd("cd $DOC_SITE_PATH; git pull")
# (Optional) Remove old files # (Optional) Remove old files
# run_cmd("rm -rf $ALPA_SITE_PATH/*") # run_cmd("rm -rf $ALPA_SITE_PATH/*")
run_cmd("cp -r _build/html/* $DOC_SITE_PATH") run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}" cmd_message = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
run_cmd( run_cmd(
f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main" f"cd $DOC_SITE_PATH; git add .; git commit -m '{cmd_message}'; git push origin main"
) )
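Note that the deploy script above shells out via os.system, whose return code is discarded, so a failed git pull or copy does not stop the subsequent commit and push. A minimal hardened variant (an assumption on my part, not part of this commit) would abort on the first failing step:

# Hedged sketch: same deployment steps, but a non-zero exit code aborts
# the run instead of being silently ignored. Assumes DOC_SITE_PATH is set
# in the environment, as the original script does.
import subprocess
from datetime import datetime

def run_cmd(cmd: str) -> None:
    print(cmd)
    # shell=True preserves the $DOC_SITE_PATH expansion;
    # check=True raises CalledProcessError on failure.
    subprocess.run(cmd, shell=True, check=True)

run_cmd("cd $DOC_SITE_PATH; git pull")
run_cmd("cp -r _build/html/* $DOC_SITE_PATH")
msg = f"Update {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
run_cmd(f"cd $DOC_SITE_PATH; git add .; git commit -m '{msg}'; git push origin main")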
...@@ -15,35 +15,35 @@ The core features include: ...@@ -15,35 +15,35 @@ The core features include:
:maxdepth: 1 :maxdepth: 1
:caption: Getting Started :caption: Getting Started
install.md starts/install.md
send_request.ipynb starts/send_request.ipynb
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
:caption: Backend Tutorial :caption: Backend Tutorial
openai_api.ipynb backend/openai_api.ipynb
backend.md backend/backend.md
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
:caption: Frontend Tutorial :caption: Frontend Tutorial
frontend.md frontend/frontend.md
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
:caption: References :caption: References
sampling_params.md references/sampling_params.md
hyperparameter_tuning.md references/hyperparameter_tuning.md
model_support.md references/model_support.md
contributor_guide.md references/contributor_guide.md
choices_methods.md references/choices_methods.md
benchmark_and_profiling.md references/benchmark_and_profiling.md
troubleshooting.md references/troubleshooting.md
embedding_model.ipynb references/embedding_model.ipynb
learn_more.md references/learn_more.md
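Since this commit moves every toctree entry into a subdirectory (starts/, backend/, frontend/, references/), a quick pre-build sanity check can confirm the relocated sources actually exist. The sketch below is illustrative only; the path list is copied from the toctree entries above, and the docs/ root is an assumption about where the script is run.

# Hedged sketch: verify the relocated doc sources exist before building.
from pathlib import Path

DOCS = Path("docs")
entries = [
    "starts/install.md",
    "starts/send_request.ipynb",
    "backend/openai_api.ipynb",
    "backend/backend.md",
    "frontend/frontend.md",
    "references/sampling_params.md",
    "references/hyperparameter_tuning.md",
    "references/model_support.md",
    "references/contributor_guide.md",
    "references/choices_methods.md",
    "references/benchmark_and_profiling.md",
    "references/troubleshooting.md",
    "references/embedding_model.ipynb",
    "references/learn_more.md",
]
missing = [e for e in entries if not (DOCS / e).exists()]
if missing:
    raise SystemExit(f"Missing doc sources: {missing}")
print("All toctree targets present.")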