{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick Start: Launch A Server and Send Requests\n",
    "\n",
    "This notebook provides a quick-start guide for using SGLang after installation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Launch a server\n",
    "\n",
    "This code block is equivalent to executing \n",
    "\n",
    "```bash\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "```\n",
    "\n",
    "in your command line and wait for the server to be ready."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-30 09:32:30] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=335520337, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
      "[2024-10-30 09:32:39 TP0] Init torch distributed begin.\n",
      "[2024-10-30 09:32:43 TP0] Load weight begin. avail mem=76.83 GB\n",
      "[2024-10-30 09:32:43 TP0] lm_eval is not installed, GPTQ may not be usable\n",
      "INFO 10-30 09:32:43 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:05,  1.78s/it]\n",
      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.78s/it]\n",
      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:05<00:01,  1.80s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.30s/it]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.48s/it]\n",
      "\n",
      "[2024-10-30 09:32:49 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=61.82 GB\n",
      "[2024-10-30 09:32:49 TP0] Memory pool end. avail mem=8.19 GB\n",
      "[2024-10-30 09:32:51 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
      "[2024-10-30 09:32:59 TP0] max_total_num_tokens=430915, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
      "[2024-10-30 09:33:00] INFO:     Started server process [227758]\n",
      "[2024-10-30 09:33:00] INFO:     Waiting for application startup.\n",
      "[2024-10-30 09:33:00] INFO:     Application startup complete.\n",
      "[2024-10-30 09:33:00] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
      "[2024-10-30 09:33:01] INFO:     127.0.0.1:49220 - \"GET /v1/models HTTP/1.1\" 200 OK\n",
      "[2024-10-30 09:33:01] INFO:     127.0.0.1:49236 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
      "[2024-10-30 09:33:01 TP0] Prefill batch. #new-seq: 1, #new-token: 7, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-30 09:33:01] INFO:     127.0.0.1:49240 - \"POST /generate HTTP/1.1\" 200 OK\n",
      "[2024-10-30 09:33:01] The server is fired up and ready to roll!\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><br><br>                    NOTE: Typically, the server runs in a separate terminal.<br>                    In this notebook, we run the server and notebook code together, so their outputs are combined.<br>                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.<br>                    </strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sglang.utils import (\n",
    "    execute_shell_command,\n",
    "    wait_for_server,\n",
    "    terminate_process,\n",
    "    print_highlight,\n",
    ")\n",
    "\n",
    "server_process = execute_shell_command(\n",
    "\"\"\"\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "\"\"\"\n",
    ")\n",
    "\n",
    "wait_for_server(\"http://localhost:30000\")"
   ]
  },
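  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once `wait_for_server` returns, you can optionally confirm the server is healthy by listing its models, the same `GET /v1/models` route that appears in the startup logs above. The cell below is a minimal sketch using the `requests` library."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Sketch: query the OpenAI-compatible model listing endpoint to verify\n",
    "# the server is up. The startup logs above show this exact route.\n",
    "models = requests.get(\"http://localhost:30000/v1/models\").json()\n",
    "print_highlight(models)"
   ]
  },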
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Send a Request\n",
    "\n",
    "Once the server is running, you can send test requests using curl. The server implements the [OpenAI-compatible API](https://platform.openai.com/docs/api-reference/chat)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-30 09:34:00 TP0] Prefill batch. #new-seq: 1, #new-token: 46, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-30 09:34:00 TP0] Decode batch. #running-req: 1, #token: 80, token usage: 0.00, gen throughput (token/s): 0.65, #queue-req: 0\n",
      "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 120, token usage: 0.00, gen throughput (token/s): 139.05, #queue-req: 0\n",
      "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 160, token usage: 0.00, gen throughput (token/s): 137.75, #queue-req: 0\n",
      "[2024-10-30 09:34:01 TP0] Decode batch. #running-req: 1, #token: 200, token usage: 0.00, gen throughput (token/s): 137.59, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 240, token usage: 0.00, gen throughput (token/s): 137.62, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 280, token usage: 0.00, gen throughput (token/s): 137.61, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 320, token usage: 0.00, gen throughput (token/s): 137.49, #queue-req: 0\n",
      "[2024-10-30 09:34:02 TP0] Decode batch. #running-req: 1, #token: 360, token usage: 0.00, gen throughput (token/s): 137.51, #queue-req: 0\n",
      "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 400, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
      "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 440, token usage: 0.00, gen throughput (token/s): 137.48, #queue-req: 0\n",
      "[2024-10-30 09:34:03 TP0] Decode batch. #running-req: 1, #token: 480, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
      "[2024-10-30 09:34:04 TP0] Decode batch. #running-req: 1, #token: 520, token usage: 0.00, gen throughput (token/s): 137.47, #queue-req: 0\n",
      "[2024-10-30 09:34:04] INFO:     127.0.0.1:54110 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
      "{\"id\":\"a53e18ead1314ab0a2cec76cef484c11\",\"object\":\"chat.completion\",\"created\":1730280844,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) model that is designed to process and understand human language in a way that's similar to how humans do. \\n\\nLLMs are trained on vast amounts of text data, which allows them to learn patterns, relationships, and context within language. This training enables them to generate human-like responses to a wide range of questions, prompts, and topics.\\n\\nSome common characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend the meaning and context of language, including nuances like idioms, sarcasm, and figurative language.\\n2. **Language generation**: LLMs can generate text that's coherent, contextually relevant, and often engaging.\\n3. **Knowledge retrieval**: LLMs can access and retrieve information from their vast training datasets, allowing them to answer questions and provide information on a wide range of topics.\\n4. **Conversational dialogue**: LLMs can engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLLMs have many applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Language translation**: LLMs can translate languages in real-time, with high accuracy.\\n3. **Content generation**: LLMs can generate text, such as articles, emails, and social media posts.\\n4. **Chatbots**: LLMs can power chatbots that provide customer support, answer questions, and engage in conversations.\\n\\nSome popular examples of LLMs include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a widely used LLM that's been trained on a massive dataset of text.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: Developed by Facebook AI, RoBERTa is another popular LLM that's been trained on a large dataset of text.\\n3. **Language models from OpenAI**: OpenAI has developed a range of LLMs, including GPT-3 (Generative Pre-trained Transformer 3), which is one of the most advanced LLMs available today.\\n\\nOverall, LLMs have the potential to revolutionize the way we interact with language and information, making it easier to access and understand complex topics, and opening up new possibilities for language-based applications.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":539,\"completion_tokens\":492,\"prompt_tokens_details\":null}}"
     ]
    }
   ],
   "source": [
    "!curl http://localhost:30000/v1/chat/completions \\\n",
    "  -H \"Content-Type: application/json\" \\\n",
    "  -H \"Authorization: Bearer None\" \\\n",
    "  -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'"
   ]
  },
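  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Besides the OpenAI-compatible routes, the server also exposes a native `/generate` endpoint (it appears in the startup logs above). The cell below is a minimal sketch of calling it from Python; the `text` and `sampling_params` fields follow SGLang's native API, but check your installed version's documentation if the request is rejected."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Sketch: send a raw completion request to the native /generate endpoint.\n",
    "# Payload field names are assumptions based on SGLang's native API.\n",
    "response = requests.post(\n",
    "    \"http://localhost:30000/generate\",\n",
    "    json={\n",
    "        \"text\": \"The capital of France is\",\n",
    "        \"sampling_params\": {\"temperature\": 0, \"max_new_tokens\": 32},\n",
    "    },\n",
    ")\n",
    "print_highlight(response.json())"
   ]
  },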
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using OpenAI Python Client\n",
    "\n",
    "You can also use the OpenAI Python API library to send requests."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-30 09:34:06 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-30 09:34:07 TP0] Decode batch. #running-req: 1, #token: 71, token usage: 0.00, gen throughput (token/s): 13.51, #queue-req: 0\n",
      "[2024-10-30 09:34:07] INFO:     127.0.0.1:42068 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>ChatCompletion(id='0708a0196e524456a1316359f6189e48', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1730280847, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, completion_tokens_details=None, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import openai\n",
    "\n",
    "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "response = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
    "    ],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
    "print_highlight(response)"
   ]
  },
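  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The OpenAI client also supports streaming responses. The cell below is a minimal sketch that passes `stream=True` and prints tokens as they arrive; it reuses the `client` created above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: stream a chat completion and print each incremental delta.\n",
    "stream = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[{\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"}],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    "    stream=True,\n",
    ")\n",
    "for chunk in stream:\n",
    "    # Some chunks (e.g., the final one) may carry no content delta.\n",
    "    if chunk.choices[0].delta.content:\n",
    "        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
   ]
  },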
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "terminate_process(server_process)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}