{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick Start: Launch A Server and Send Requests\n",
    "\n",
    "This notebook provides a quick-start guide for using SGLang after installation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Launch a server\n",
    "\n",
    "This code block is equivalent to executing \n",
    "\n",
    "```bash\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "```\n",
    "\n",
    "in your command line and wait for the server to be ready."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-30 09:32:30] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=335520337, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
      "[2024-10-30 09:32:39 TP0] Init torch distributed begin.\n",
      "[2024-10-30 09:32:43 TP0] Load weight begin. avail mem=76.83 GB\n",
      "[2024-10-30 09:32:43 TP0] lm_eval is not installed, GPTQ may not be usable\n",
      "INFO 10-30 09:32:43 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:05,  1.78s/it]\n",
      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.78s/it]\n",
      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:05<00:01,  1.80s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.30s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.48s/it]\n",
      "\n",
      "[2024-10-30 09:32:49 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
      "[2024-10-30 09:32:49 TP0] Memory pool end. avail mem=8.19 GB\n",
      "[2024-10-30 09:32:51 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
      "[2024-10-30 09:32:59 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
      "[2024-10-30 09:33:00] INFO:     Started server process [227758]\n",
      "[2024-10-30 09:33:00] INFO:     Waiting for application startup.\n",
      "[2024-10-30 09:33:00] INFO:     Application startup complete.\n",
      "[2024-10-30 09:33:00] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
      "[2024-10-30 09:33:01] INFO:     127.0.0.1:49220 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
      "[2024-10-30 09:33:01] INFO:     127.0.0.1:49236 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
      "[2024-10-30 09:33:01 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-30 09:33:01] INFO:     127.0.0.1:49240 - \"POST /generate HTTP/1.1\" 200 OK\n",
      "[2024-10-30 09:33:01] The server is fired up and ready to roll!\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><br><br>                    NOTE: Typically, the server runs in a separate terminal.<br>                    In this notebook, we run the server and notebook code together, so their outputs are combined.<br>                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br>                    </strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sglang.utils import (\n",
    "    execute_shell_command,\n",
    "    wait_for_server,\n",
    "    terminate_process,\n",
    "    print_highlight,\n",
    ")\n",
    "\n",
    "server_process = execute_shell_command(\n",
    "\"\"\"\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "\"\"\"\n",
    ")\n",
    "\n",
    "wait_for_server(\"http://localhost:30000\")"
   ]
  },
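  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once `wait_for_server` returns, you can optionally confirm the server is healthy by listing its models, the same `GET /v1/models` route that appears in the startup logs above. The cell below is a minimal sketch using the `requests` library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Sketch: query the OpenAI-compatible model listing endpoint to verify\n",
    "# the server is up. The startup logs above show this exact route.\n",
    "models = requests.get(\"http://localhost:30000/v1/models\").json()\n",
    "print_highlight(models)"
   ]
  },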
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Send a Request\n",
    "\n",
    "Once the server is running, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/chat)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-30 09:34:00 TP0] Prefill batch. #new-seq: 1, #new-token: 46, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-30 09:34:00 TP0] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, gen throughput (token/s): 0.65, #queue-req: 0\n",
      "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 120, token usage: 0.00, gen throughput (token/s): 139.05, #queue-req: 0\n",
      "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 160, token usage: 0.00, gen throughput (token/s): 137.75, #queue-req: 0\n",
      "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 200, token usage: 0.00, gen throughput (token/s): 137.59, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 240, token usage: 0.00, gen throughput (token/s): 137.62, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 280, token usage: 0.00, gen throughput (token/s): 137.61, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 320, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 360, token usage: 0.00, gen throughput (token/s): 137.51, #queue-req: 0\n",
      "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 400, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
      "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 440, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n",
      "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 480, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
      "[2024-10-30 09:34:04 TP0] Decode batch. #running-req: 1, #token: 520, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
      "[2024-10-30 09:34:04] INFO:     127.0.0.1:54110 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
      "{\"id\":\"a53e18ead1314ab0a2cec76cef484c11\",\"object\":\"chat.completion\",\"created\":1730280844,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) model that is designed to process and understand human language in a way that's similar to how humans do. \\n\\nLLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and context within language. This training enables them to generate human-like responses to a wide range of questions, prompts, and topics.\\n\\nSome common characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend the meaning and context of language, including nuances like idioms, sarcasm, and figurative language.\\n2. **Language generation**: LLMs can generate text that's coherent, contextually relevant, and often engaging.\\n3. **Knowledge retrieval**: LLMs can access and retrieve information from their vast training datasets, allowing them to answer questions and provide information on a wide range of topics.\\n4. **Conversational dialogue**: LLMs can engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLLMs have many applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Language translation**: LLMs can translate languages in real-time, with high accuracy.\\n3. **Content generation**: LLMs can generate text, such as articles, emails, and social media posts.\\n4. **Chatbots**: LLMs can power chatbots that provide customer support, answer questions, and engage in conversations.\\n\\nSome popular examples of LLMs include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a widely used LLM that's been trained on a massive dataset of text.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: Developed by Facebook AI, RoBERTa is another popular LLM that's been trained on a large dataset of text.\\n3. **Language models from OpenAI**: OpenAI has developed a range of LLMs, including GPT-3 (Generative Pre-trained Transformer 3), which is one of the most advanced LLMs available today.\\n\\nOverall, LLMs have the potential to revolutionize the way we interact with language and information, making it easier to access and understand complex topics, and opening up new possibilities for language-based applications.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":539,\"completion_tokens\":492,\"prompt_tokens_details\":null}}"
     ]
    }
   ],
   "source": [
    "!curl http://localhost:30000/v1/chat/completions \\\n",
    "  -H \"Content-Type: application/json\" \\\n",
    "  -H \"Authorization: Bearer None\" \\\n",
    "  -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'"
   ]
  },
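  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Besides the OpenAI-compatible routes, the server also exposes a native `/generate` endpoint (it appears in the startup logs above). The cell below is a minimal sketch of calling it from Python; the `text` and `sampling_params` fields follow SGLang's native API, but check your installed version's documentation if the request is rejected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Sketch: send a raw completion request to the native /generate endpoint.\n",
    "# Payload field names are assumptions based on SGLang's native API.\n",
    "response = requests.post(\n",
    "    \"http://localhost:30000/generate\",\n",
    "    json={\n",
    "        \"text\": \"The capital of France is\",\n",
    "        \"sampling_params\": {\"temperature\": 0, \"max_new_tokens\": 32},\n",
    "    },\n",
    ")\n",
    "print_highlight(response.json())"
   ]
  },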
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using OpenAI Python Client\n",
    "\n",
    "You can also use the OpenAI Python API library to send requests."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-30 09:34:06 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-30 09:34:07 TP0] Decode batch. #running-req: 1, #token: 71, token usage: 0.00, gen throughput (token/s): 13.51, #queue-req: 0\n",
      "[2024-10-30 09:34:07] INFO:     127.0.0.1:42068 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>ChatCompletion(id='0708a0196e524456a1316359f6189e48', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730280847, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import openai\n",
    "\n",
    "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "response = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
    "    ],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
    "print_highlight(response)"
   ]
  },
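  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The OpenAI client also supports streaming responses. The cell below is a minimal sketch that passes `stream=True` and prints tokens as they arrive; it reuses the `client` created above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: stream a chat completion and print each incremental delta.\n",
    "stream = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[{\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"}],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    "    stream=True,\n",
    ")\n",
    "for chunk in stream:\n",
    "    # Some chunks (e.g., the final one) may carry no content delta.\n",
    "    if chunk.choices[0].delta.content:\n",
    "        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
   ]
  },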
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "terminate_process(server_process)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}