Unverified Commit 61cf00e1 authored by Chayenne, committed by GitHub

change file tree (#1859)


Co-authored-by: Chayenne <zhaochenyang@g.ucla.edu>
parent b9fd178f
@@ -28,37 +28,140 @@
 {
 "cell_type": "code",
 "execution_count": 1,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:46:13.611212Z",
+"iopub.status.busy": "2024-11-01T02:46:13.611093Z",
+"iopub.status.idle": "2024-11-01T02:46:42.810261Z",
+"shell.execute_reply": "2024-11-01T02:46:42.809147Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:32:30] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=335520337, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
-"[2024-10-30 09:32:39 TP0] Init torch distributed begin.\n",
-"[2024-10-30 09:32:43 TP0] Load weight begin. avail mem=76.83 GB\n",
-"[2024-10-30 09:32:43 TP0] lm_eval is not installed, GPTQ may not be usable\n",
-"INFO 10-30 09:32:43 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
-"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n",
-"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:01<00:05, 1.78s/it]\n",
-"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:03<00:03, 1.78s/it]\n",
-"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:05<00:01, 1.80s/it]\n",
-"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.30s/it]\n",
-"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00, 1.48s/it]\n",
+"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+" warnings.warn(\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:18] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=706578968, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+" warnings.warn(\n",
+"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
+" warnings.warn(\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:24 TP0] Init torch distributed begin.\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:24 TP0] Load weight begin. avail mem=47.27 GB\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:25 TP0] lm_eval is not installed, GPTQ may not be usable\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"INFO 10-31 19:46:26 weight_utils.py:243] Using model weights format ['*.safetensors']\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.50it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.39it/s]\n",
+"\r",
+"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:00<00:00, 3.45it/s]\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"\r",
+"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.95it/s]\n",
+"\r",
+"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.90it/s]\n",
 "\n",
-"[2024-10-30 09:32:49 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
-"[2024-10-30 09:32:49 TP0] Memory pool end. avail mem=8.19 GB\n",
-"[2024-10-30 09:32:51 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
-"[2024-10-30 09:32:59 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
-"[2024-10-30 09:33:00] INFO: Started server process [227758]\n",
-"[2024-10-30 09:33:00] INFO: Waiting for application startup.\n",
-"[2024-10-30 09:33:00] INFO: Application startup complete.\n",
-"[2024-10-30 09:33:00] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
-"[2024-10-30 09:33:01] INFO: 127.0.0.1:49220 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
-"[2024-10-30 09:33:01] INFO: 127.0.0.1:49236 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
-"[2024-10-30 09:33:01 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:33:01] INFO: 127.0.0.1:49240 - \"POST /generate HTTP/1.1\" 200 OK\n",
-"[2024-10-30 09:33:01] The server is fired up and ready to roll!\n"
+"[2024-10-31 19:46:28 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
+"[2024-10-31 19:46:28 TP0] Memory pool end. avail mem=4.60 GB\n",
+"[2024-10-31 19:46:28 TP0] Capture cuda graph begin. This can take up to several minutes.\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:36 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:36] INFO: Started server process [1548791]\n",
+"[2024-10-31 19:46:36] INFO: Waiting for application startup.\n",
+"[2024-10-31 19:46:36] INFO: Application startup complete.\n",
+"[2024-10-31 19:46:36] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:37] INFO: 127.0.0.1:46022 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:46:37] INFO: 127.0.0.1:46028 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:46:37 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:38] INFO: 127.0.0.1:46042 - \"POST /generate HTTP/1.1\" 200 OK\n",
+"[2024-10-31 19:46:38] The server is fired up and ready to roll!\n"
 ]
 },
 {
@@ -104,27 +207,84 @@
 {
 "cell_type": "code",
 "execution_count": 2,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:46:42.813656Z",
+"iopub.status.busy": "2024-11-01T02:46:42.813354Z",
+"iopub.status.idle": "2024-11-01T02:46:51.436613Z",
+"shell.execute_reply": "2024-11-01T02:46:51.435965Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:34:00 TP0] Prefill batch. #new-seq: 1, #new-token: 46, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:34:00 TP0] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, gen throughput (token/s): 0.65, #queue-req: 0\n",
-"[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 120, token usage: 0.00, gen throughput (token/s): 139.05, #queue-req: 0\n",
-"[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 160, token usage: 0.00, gen throughput (token/s): 137.75, #queue-req: 0\n",
-"[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 200, token usage: 0.00, gen throughput (token/s): 137.59, #queue-req: 0\n",
-"[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 240, token usage: 0.00, gen throughput (token/s): 137.62, #queue-req: 0\n",
-"[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 280, token usage: 0.00, gen throughput (token/s): 137.61, #queue-req: 0\n",
-"[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 320, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n",
-"[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 360, token usage: 0.00, gen throughput (token/s): 137.51, #queue-req: 0\n",
-"[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 400, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
-"[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 440, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n",
-"[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 480, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
-"[2024-10-30 09:34:04 TP0] Decode batch. #running-req: 1, #token: 520, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
-"[2024-10-30 09:34:04] INFO: 127.0.0.1:54110 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
-"{\"id\":\"a53e18ead1314ab0a2cec76cef484c11\",\"object\":\"chat.completion\",\"created\":1730280844,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) model that is designed to process and understand human language in a way that's similar to how humans do. \\n\\nLLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and context within language. This training enables them to generate human-like responses to a wide range of questions, prompts, and topics.\\n\\nSome common characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend the meaning and context of language, including nuances like idioms, sarcasm, and figurative language.\\n2. **Language generation**: LLMs can generate text that's coherent, contextually relevant, and often engaging.\\n3. **Knowledge retrieval**: LLMs can access and retrieve information from their vast training datasets, allowing them to answer questions and provide information on a wide range of topics.\\n4. **Conversational dialogue**: LLMs can engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLLMs have many applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Language translation**: LLMs can translate languages in real-time, with high accuracy.\\n3. **Content generation**: LLMs can generate text, such as articles, emails, and social media posts.\\n4. **Chatbots**: LLMs can power chatbots that provide customer support, answer questions, and engage in conversations.\\n\\nSome popular examples of LLMs include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a widely used LLM that's been trained on a massive dataset of text.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: Developed by Facebook AI, RoBERTa is another popular LLM that's been trained on a large dataset of text.\\n3. **Language models from OpenAI**: OpenAI has developed a range of LLMs, including GPT-3 (Generative Pre-trained Transformer 3), which is one of the most advanced LLMs available today.\\n\\nOverall, LLMs have the potential to revolutionize the way we interact with language and information, making it easier to access and understand complex topics, and opening up new possibilities for language-based applications.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":539,\"completion_tokens\":492,\"prompt_tokens_details\":null}}"
+"[2024-10-31 19:46:42 TP0] Prefill batch. #new-seq: 1, #new-token: 46, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:43 TP0] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, gen throughput (token/s): 5.40, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:44 TP0] Decode batch. #running-req: 1, #token: 120, token usage: 0.00, gen throughput (token/s): 42.48, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:45 TP0] Decode batch. #running-req: 1, #token: 160, token usage: 0.00, gen throughput (token/s): 42.37, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:46 TP0] Decode batch. #running-req: 1, #token: 200, token usage: 0.00, gen throughput (token/s): 42.33, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:47 TP0] Decode batch. #running-req: 1, #token: 240, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:48 TP0] Decode batch. #running-req: 1, #token: 280, token usage: 0.00, gen throughput (token/s): 42.28, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:49 TP0] Decode batch. #running-req: 1, #token: 320, token usage: 0.00, gen throughput (token/s): 42.28, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:50 TP0] Decode batch. #running-req: 1, #token: 360, token usage: 0.00, gen throughput (token/s): 42.24, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:51] INFO: 127.0.0.1:46046 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
+"{\"id\":\"f9761ee1b1444bd7a640286884a90842\",\"object\":\"chat.completion\",\"created\":1730429211,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and comprehend human language in a way that's similar to how humans do.\\n\\nLarge Language Models are trained on massive amounts of text data, which allows them to learn patterns and relationships in language. This training enables them to generate text, answer questions, summarize content, and even engage in conversation.\\n\\nSome key characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend the meaning of text, including nuances like idioms, sarcasm, and figurative language.\\n2. **Contextual awareness**: LLMs can understand the context in which a piece of text is written, including the topic, tone, and intent.\\n3. **Generative capabilities**: LLMs can generate text, including entire articles, conversations, or even creative writing like stories or poetry.\\n4. **Continuous learning**: LLMs can learn from new data and update their understanding of language over time.\\n\\nLLMs are used in a wide range of applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Chatbots**: LLMs are used to create chatbots that can engage with customers and provide support.\\n3. **Language translation**: LLMs can translate text from one language to another with high accuracy.\\n4. **Content generation**: LLMs can generate content, such as articles, social media posts, and product descriptions.\\n5. **Research and analysis**: LLMs can help researchers analyze and understand large amounts of text data.\\n\\nIn the context of our conversation, I'm a Large Language Model designed to provide helpful and informative responses to your questions!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":400,\"completion_tokens\":353,\"prompt_tokens_details\":null}}"
 ]
 }
 ],
@@ -147,21 +307,35 @@
 {
 "cell_type": "code",
 "execution_count": 3,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:46:51.439372Z",
+"iopub.status.busy": "2024-11-01T02:46:51.439178Z",
+"iopub.status.idle": "2024-11-01T02:46:52.895776Z",
+"shell.execute_reply": "2024-11-01T02:46:52.895318Z"
+}
+},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"[2024-10-30 09:34:06 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
-"[2024-10-30 09:34:07 TP0] Decode batch. #running-req: 1, #token: 71, token usage: 0.00, gen throughput (token/s): 13.51, #queue-req: 0\n",
-"[2024-10-30 09:34:07] INFO: 127.0.0.1:42068 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
+"[2024-10-31 19:46:51 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
+"[2024-10-31 19:46:51 TP0] Decode batch. #running-req: 1, #token: 50, token usage: 0.00, gen throughput (token/s): 27.57, #queue-req: 0\n"
+]
+},
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[2024-10-31 19:46:52 TP0] Decode batch. #running-req: 1, #token: 90, token usage: 0.00, gen throughput (token/s): 42.69, #queue-req: 0\n",
+"[2024-10-31 19:46:52] INFO: 127.0.0.1:40952 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
 ]
 },
 {
 "data": {
 "text/html": [
-"<strong style='color: #00008B;'>ChatCompletion(id='0708a0196e524456a1316359f6189e48', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730280847, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
+"<strong style='color: #00008B;'>ChatCompletion(id='c563abb8fe74496f83203fe21ec4ff61', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730429212, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
 ],
 "text/plain": [
 "<IPython.core.display.HTML object>"
@@ -191,7 +365,14 @@
 {
 "cell_type": "code",
 "execution_count": 4,
-"metadata": {},
+"metadata": {
+"execution": {
+"iopub.execute_input": "2024-11-01T02:46:52.898411Z",
+"iopub.status.busy": "2024-11-01T02:46:52.898149Z",
+"iopub.status.idle": "2024-11-01T02:46:54.398382Z",
+"shell.execute_reply": "2024-11-01T02:46:54.397564Z"
+}
+},
 "outputs": [],
 "source": [
 "terminate_process(server_process)"
@@ -214,7 +395,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.12"
+"version": "3.11.7"
 }
 },
 "nbformat": 4,
...
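Note: the notebook's source cells are collapsed in this diff; only their outputs changed. For reference, the `/v1/chat/completions` responses shown above are the kind of output produced by an OpenAI-compatible client pointed at the server from the logs. A minimal sketch, assuming the `openai` Python package and the SGLang server already running on localhost:30000 as shown (the exact prompt used in the notebook is not visible here):

# Minimal sketch (assumptions: openai>=1.0 client installed, server from the logs running on port 30000).
from openai import OpenAI

# SGLang exposes an OpenAI-compatible endpoint, so the stock client works; the API key is unused locally.
client = OpenAI(base_url="http://localhost:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=128,
)
# Prints the assistant message, e.g. "Here are 3 countries and their capitals: ..."
print(response.choices[0].message.content)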