{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Quick Start: Sending Requests\n", "\n", "This notebook provides a quick-start guide for using SGLang after installation." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Launch a server\n", "\n", "This code block is equivalent to executing \n", "\n", "```bash\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", "--port 30000 --host 0.0.0.0\n", "```\n", "\n", "in your terminal and wait for the server to be ready." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:13.611212Z", "iopub.status.busy": "2024-11-01T02:46:13.611093Z", "iopub.status.idle": "2024-11-01T02:46:42.810261Z", "shell.execute_reply": "2024-11-01T02:46:42.809147Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-11-02 00:27:25.383621: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "2024-11-02 00:27:25.396224: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "2024-11-02 00:27:25.396257: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2024-11-02 00:27:25.922262: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "[2024-11-02 00:27:34] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=259802610, constrained_json_whitespace_pattern=None, decode_log_interval=40, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, 
triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", "[2024-11-02 00:27:43 TP0] Init torch distributed begin.\n", "[2024-11-02 00:27:48 TP0] Load weight begin. avail mem=76.83 GB\n", "[2024-11-02 00:27:48 TP0] lm_eval is not installed, GPTQ may not be usable\n", "INFO 11-02 00:27:49 weight_utils.py:243] Using model weights format ['*.safetensors']\n", "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]\n" ] }, { "data": { "text/html": [ "NOTE: Typically, the server runs in a separate terminal.\n", "In this notebook, we run the server and notebook code together, so their outputs are combined.\n", "To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", " wait_for_server,\n", " terminate_process,\n", " print_highlight,\n", ")\n", "\n", "server_process = execute_shell_command(\n", "\"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")" ] },
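{ "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, you can sanity-check that the server is reachable before sending requests. The next cell is a minimal sketch that assumes the server exposes a `/get_model_info` endpoint; the exact endpoint may vary across SGLang versions." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "\n", "# Sketch: query basic server info (assumes GET /get_model_info is available).\n", "info = requests.get(\"http://localhost:30000/get_model_info\")\n", "\n", "print_highlight(info.json())" ] },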
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", " wait_for_server,\n", " terminate_process,\n", " print_highlight,\n", ")\n", "\n", "server_process = execute_shell_command(\n", "\"\"\"\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n", "--port 30000 --host 0.0.0.0\n", "\"\"\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Send a Request\n", "\n", "Once the server is up, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/)." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:42.813656Z", "iopub.status.busy": "2024-11-01T02:46:42.813354Z", "iopub.status.idle": "2024-11-01T02:46:51.436613Z", "shell.execute_reply": "2024-11-01T02:46:51.435965Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 278 0 0 100 278 0 1387 --:--:-- --:--:-- --:--:-- 1383" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-11-02 00:28:48 TP0] Prefill batch. #new-seq: 1, #new-token: 11, #cached-token: 42, cache hit rate: 40.19%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-11-02 00:28:48 TP0] Decode batch. #running-req: 1, #token: 75, token usage: 0.00, gen throughput (token/s): 1.46, #queue-req: 0\n", "[2024-11-02 00:28:49] INFO: 127.0.0.1:53714 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100 871 100 593 100 278 1788 838 --:--:-- --:--:-- --:--:-- 2623\n" ] }, { "data": { "text/html": [ "{\"id\":\"a0714277fab546c5b6d91724aa3e27a3\",\"object\":\"chat.completion\",\"created\":1730507329,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"An LLM, or Large Language Model, is a type of artificial intelligence (AI) designed to process and generate human-like language, often used in applications such as chatbots, virtual assistants, and language translation software.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":53,\"total_tokens\":98,\"completion_tokens\":45,\"prompt_tokens_details\":null}}" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import subprocess\n", "\n", "curl_command = \"\"\"\n", "curl http://localhost:30000/v1/chat/completions \\\\\n", " -H \"Content-Type: application/json\" \\\\\n", " -H \"Authorization: Bearer None\" \\\\\n", " -d '{\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " \"messages\": [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"You are a helpful assistant.\"\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": \"What is an LLM? Tell me in one sentence.\"\n", " }\n", " ]\n", " }'\n", "\"\"\"\n", "\n", "response = subprocess.check_output(curl_command, shell=True).decode()\n", "\n", "print_highlight(response)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using OpenAI Python Client\n", "\n", "You can use the OpenAI Python API library to send requests." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:51.439372Z", "iopub.status.busy": "2024-11-01T02:46:51.439178Z", "iopub.status.idle": "2024-11-01T02:46:52.895776Z", "shell.execute_reply": "2024-11-01T02:46:52.895318Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-11-02 00:03:52 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-11-02 00:03:52 TP0] Decode batch. #running-req: 1, #token: 65, token usage: 0.00, gen throughput (token/s): 11.33, #queue-req: 0\n", "[2024-11-02 00:03:53] INFO: 127.0.0.1:57008 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "ChatCompletion(id='a6590143c40f4732a5c57d4c91b43f05', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730505833, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import openai\n", "\n", "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", "\n", "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " messages=[\n", " {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n", " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", " ],\n", " temperature=0,\n", " max_tokens=64,\n", ")\n", "\n", "print_highlight(response)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Using Native Generation APIs\n", "\n", "You can also use the native `/generate` endpoint. It provides more flexiblity.\n", "An API reference is available at [Sampling Parameters](https://sgl-project.github.io/references/sampling_params.html)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-11-02 00:05:04 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 33.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-11-02 00:05:04 TP0] Decode batch. #running-req: 1, #token: 26, token usage: 0.00, gen throughput (token/s): 3.10, #queue-req: 0\n", "[2024-11-02 00:05:04] INFO: 127.0.0.1:60536 - \"POST /generate HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "{'text': ' a city of romance, art, fashion, and history. Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Using Native Generation APIs\n", "\n", "You can also use the native `/generate` endpoint. It provides more flexibility.\n", "An API reference is available at [Sampling Parameters](https://sgl-project.github.io/references/sampling_params.html)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-11-02 00:05:04 TP0] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 5, cache hit rate: 33.04%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-11-02 00:05:04 TP0] Decode batch. #running-req: 1, #token: 26, token usage: 0.00, gen throughput (token/s): 3.10, #queue-req: 0\n", "[2024-11-02 00:05:04] INFO: 127.0.0.1:60536 - \"POST /generate HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "{'text': ' a city of romance, art, fashion, and history. Paris is a must-visit destination for anyone who loves culture, architecture, and cuisine. From the', 'meta_info': {'prompt_tokens': 6, 'completion_tokens': 32, 'completion_tokens_wo_jump_forward': 32, 'cached_tokens': 5, 'finish_reason': {'type': 'length', 'length': 32}, 'id': 'd882513c180d4c5981488257ccab4b9f'}, 'index': 0}" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import requests\n", "\n", "response = requests.post(\n", " \"http://localhost:30000/generate\",\n", " json={\n", " \"text\": \"The capital of France is\",\n", " \"sampling_params\": {\n", " \"temperature\": 0,\n", " \"max_new_tokens\": 32,\n", " },\n", " },\n", ")\n", "\n", "print_highlight(response.json())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2024-11-01T02:46:52.898411Z", "iopub.status.busy": "2024-11-01T02:46:52.898149Z", "iopub.status.idle": "2024-11-01T02:46:54.398382Z", "shell.execute_reply": "2024-11-01T02:46:54.397564Z" } }, "outputs": [], "source": [ "terminate_process(server_process)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" } }, "nbformat": 4, "nbformat_minor": 2 }