"vscode:/vscode.git/clone" did not exist on "459d822b5188dba051e21dfd15b6552543a4bbcf"
send_request.ipynb 15.7 KB
Newer Older
Chayenne's avatar
Chayenne committed
1
2
3
4
5
6
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick Start: Launch A Server and Send Requests\n",
    "\n",
    "This section provides a quick start guide to using SGLang after installation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Launch a server\n",
    "\n",
    "This code block is equivalent to executing \n",
    "\n",
    "```bash\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "```\n",
    "\n",
    "in your command line and wait for the server to be ready."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "[2024-10-29 21:14:13] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=518055348, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "[2024-10-29 21:14:19 TP0] Init torch distributed begin.\n",
      "[2024-10-29 21:14:20 TP0] Load weight begin. avail mem=47.27 GB\n",
      "[2024-10-29 21:14:21 TP0] lm_eval is not installed, GPTQ may not be usable\n",
      "INFO 10-29 21:14:21 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.32it/s]\n",
      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.28it/s]\n",
      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  3.27it/s]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.87it/s]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.78it/s]\n",
      "\n",
      "[2024-10-29 21:14:24 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
      "[2024-10-29 21:14:24 TP0] Memory pool end. avail mem=4.60 GB\n",
      "[2024-10-29 21:14:24 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
      "[2024-10-29 21:14:32 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
      "[2024-10-29 21:14:32] INFO:     Started server process [2661188]\n",
      "[2024-10-29 21:14:32] INFO:     Waiting for application startup.\n",
      "[2024-10-29 21:14:32] INFO:     Application startup complete.\n",
      "[2024-10-29 21:14:32] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
      "[2024-10-29 21:14:32] INFO:     127.0.0.1:49888 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><br>                            Server and notebook outputs are combined for clarity.<br>                            <br>                            Typically, the server runs in a separate terminal.<br>                            <br>                            Server output is gray; notebook output is highlighted.<br>                            </strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sglang.utils import (\n",
    "    execute_shell_command,\n",
    "    wait_for_server,\n",
    "    terminate_process,\n",
    "    print_highlight,\n",
    ")\n",
    "\n",
    "\n",
    "server_process = execute_shell_command(\n",
    "    \"\"\"\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "\"\"\"\n",
    ")\n",
    "\n",
    "wait_for_server(\"http://localhost:30000\")"
   ]
  },
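  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once `wait_for_server` returns, the server is ready. As an optional sanity check, you can list the served models over HTTP. The short sketch below is an illustration added to this guide, assuming the `requests` package is installed; it queries the `/v1/models` endpoint that the startup log above shows answering with 200 OK."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Optional sanity check (illustrative sketch, assumes `requests` is installed):\n",
    "# list the models served on the same host and port used in the launch command.\n",
    "models = requests.get(\"http://localhost:30000/v1/models\")\n",
    "print_highlight(models.json())"
   ]
  },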
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Send a Request\n",
    "\n",
    "Once the server is running, you can send test requests using curl."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:32 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:33] INFO:     127.0.0.1:49914 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
      "[2024-10-29 21:14:33 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n",
      "[2024-10-29 21:14:33] INFO:     127.0.0.1:49916 - \"POST /generate HTTP/1.1\" 200 OK\n",
      "[2024-10-29 21:14:33] The server is fired up and ready to roll!\n",
      "[2024-10-29 21:14:33 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 27.00, #queue-req: 0\n",
      "[2024-10-29 21:14:34 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 42.50, #queue-req: 0\n",
      "[2024-10-29 21:14:35 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 42.31, #queue-req: 0\n",
      "[2024-10-29 21:14:36 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 42.29, #queue-req: 0\n",
      "[2024-10-29 21:14:37 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
      "[2024-10-29 21:14:38 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
      "[2024-10-29 21:14:39 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 42.30, #queue-req: 0\n",
      "[2024-10-29 21:14:40 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 42.32, #queue-req: 0\n",
      "[2024-10-29 21:14:41 TP0] Decode batch. #running-req: 1, #token: 407, token usage: 0.00, gen throughput (token/s): 42.23, #queue-req: 0\n",
      "[2024-10-29 21:14:42 TP0] Decode batch. #running-req: 1, #token: 447, token usage: 0.00, gen throughput (token/s): 42.25, #queue-req: 0\n",
      "[2024-10-29 21:14:43 TP0] Decode batch. #running-req: 1, #token: 487, token usage: 0.00, gen throughput (token/s): 42.22, #queue-req: 0\n",
      "[2024-10-29 21:14:43] INFO:     127.0.0.1:49902 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
      "{\"id\":\"0635a1c4d1d940f597b11482bed9595f\",\"object\":\"chat.completion\",\"created\":1730261683,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and understand human language. LLMs are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models like myself use natural language processing (NLP) and machine learning algorithms to analyze and generate human-like text. This enables us to:\\n\\n1. **Answer questions**: Provide information on a wide range of topics, from general knowledge to specialized domains.\\n2. **Generate text**: Create coherent and contextually relevant text, such as articles, essays, or even entire stories.\\n3. **Translate languages**: Translate text from one language to another, helping to break language barriers.\\n4. **Summarize content**: Condense long pieces of text into shorter, more digestible summaries.\\n5. **Chat and converse**: Engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLarge language models are typically trained on massive datasets, often consisting of billions of parameters and petabytes of text data. This training enables us to learn complex language patterns, nuances, and context, allowing us to provide helpful and informative responses.\\n\\nSome popular examples of large language models include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a foundational model for many language understanding tasks.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: A variant of BERT, developed by Facebook AI, which improved upon the original model's performance.\\n3. **Transformers**: A family of models developed by Google, which includes BERT and other related architectures.\\n\\nThese models have revolutionized the field of natural language processing and have many exciting applications in areas like:\\n\\n1. **Virtual assistants**: Like Siri, Alexa, or myself, which can understand and respond to voice commands.\\n2. **Language translation**: Enabling real-time translation of languages.\\n3. **Content generation**: Creating original text, such as articles, stories, or even entire books.\\n4. **Customer service**: Providing 24/7 support and answering common customer queries.\\n\\nI hope this helps you understand what a Large Language Model is and its capabilities!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":504,\"completion_tokens\":457,\"prompt_tokens_details\":null}}"
     ]
    }
   ],
   "source": [
    "!curl http://localhost:30000/v1/chat/completions \\\n",
    "  -H \"Content-Type: application/json\" \\\n",
    "  -H \"Authorization: Bearer None\" \\\n",
    "  -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'"
   ]
  },
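  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same request can also be sent from Python. The sketch below is an added illustration, assuming the `requests` package is installed; it posts the exact JSON body from the curl command above to the same `/v1/chat/completions` endpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Python equivalent of the curl command above (illustrative sketch).\n",
    "response = requests.post(\n",
    "    \"http://localhost:30000/v1/chat/completions\",\n",
    "    json={\n",
    "        \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "        \"messages\": [\n",
    "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "            {\"role\": \"user\", \"content\": \"What is a LLM?\"},\n",
    "        ],\n",
    "    },\n",
    ")\n",
    "print_highlight(response.json())"
   ]
  },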
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using OpenAI Compatible API\n",
    "\n",
    "SGLang supports OpenAI-compatible APIs. Here are Python examples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:44 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-29 21:14:44 TP0] Decode batch. #running-req: 1, #token: 73, token usage: 0.00, gen throughput (token/s): 26.00, #queue-req: 0\n",
      "[2024-10-29 21:14:45] INFO:     127.0.0.1:52764 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>ChatCompletion(id='994dd35133d34f57951a102c7470464f', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730261685, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import openai\n",
    "\n",
    "# Always assign an api_key, even if not specified during server initialization.\n",
    "# Setting an API key during server initialization is strongly recommended.\n",
    "\n",
    "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "# Chat completion example\n",
    "\n",
    "response = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
    "    ],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
    "print_highlight(response)"
   ]
  },
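  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The OpenAI client can also stream tokens as they are generated. The sketch below is an added illustration that reuses the `client` created above and assumes the server streams OpenAI-compatible responses when `stream=True` is set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Streaming chat completion example (illustrative sketch).\n",
    "stream = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[{\"role\": \"user\", \"content\": \"Count from 1 to 5.\"}],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    "    stream=True,\n",
    ")\n",
    "for chunk in stream:\n",
    "    # Each chunk carries an incremental delta; content can be None on the final chunk.\n",
    "    if chunk.choices and chunk.choices[0].delta.content:\n",
    "        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
   ]
  },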
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:45] INFO:     Shutting down\n",
      "[2024-10-29 21:14:45] INFO:     Waiting for application shutdown.\n",
      "[2024-10-29 21:14:45] INFO:     Application shutdown complete.\n",
      "[2024-10-29 21:14:45] INFO:     Finished server process [2661188]\n",
      "W1029 21:14:45.740000 139643311699520 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
     ]
    }
   ],
   "source": [
    "terminate_process(server_process)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "AlphaMeemory",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}