"vscode:/vscode.git/clone" did not exist on "459d822b5188dba051e21dfd15b6552543a4bbcf"
send_request.ipynb 15.7 KB
Newer Older
Chayenne's avatar
Chayenne committed
1
2
3
4
5
6
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Quick Start: Launch A Server and Send Requests\n",
    "\n",
    "This section provides a quick start guide to using SGLang after installation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Launch a server\n",
    "\n",
    "This code block is equivalent to executing \n",
    "\n",
    "```bash\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "```\n",
    "\n",
    "in your command line and wait for the server to be ready."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "[2024-10-29 21:14:13] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=518055348, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:128: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "[2024-10-29 21:14:19 TP0] Init torch distributed begin.\n",
      "[2024-10-29 21:14:20 TP0] Load weight begin. avail mem=47.27 GB\n",
      "[2024-10-29 21:14:21 TP0] lm_eval is not installed, GPTQ may not be usable\n",
      "INFO 10-29 21:14:21 weight_utils.py:243] Using model weights format ['*.safetensors']\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.32it/s]\n",
      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.28it/s]\n",
      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:01<00:00,  3.27it/s]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.87it/s]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.78it/s]\n",
      "\n",
      "[2024-10-29 21:14:24 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
      "[2024-10-29 21:14:24 TP0] Memory pool end. avail mem=4.60 GB\n",
      "[2024-10-29 21:14:24 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
      "[2024-10-29 21:14:32 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
      "[2024-10-29 21:14:32] INFO:     Started server process [2661188]\n",
      "[2024-10-29 21:14:32] INFO:     Waiting for application startup.\n",
      "[2024-10-29 21:14:32] INFO:     Application startup complete.\n",
      "[2024-10-29 21:14:32] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
      "[2024-10-29 21:14:32] INFO:     127.0.0.1:49888 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><br>                            Server and notebook outputs are combined for clarity.<br>                            <br>                            Typically, the server runs in a separate terminal.<br>                            <br>                            Server output is gray; notebook output is highlighted.<br>                            </strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sglang.utils import (\n",
    "    execute_shell_command,\n",
    "    wait_for_server,\n",
    "    terminate_process,\n",
    "    print_highlight,\n",
    ")\n",
    "\n",
    "\n",
    "server_process = execute_shell_command(\n",
    "    \"\"\"\n",
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
    "--port 30000 --host 0.0.0.0\n",
    "\"\"\"\n",
    ")\n",
    "\n",
    "wait_for_server(\"http://localhost:30000\")"
   ]
  },
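  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once `wait_for_server` returns, the server is ready. As an optional sanity check, you can list the served models over HTTP. The short sketch below is an illustration added to this guide, assuming the `requests` package is installed; it queries the `/v1/models` endpoint that the startup log above shows answering with 200 OK."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Optional sanity check (illustrative sketch, assumes `requests` is installed):\n",
    "# list the models served on the same host and port used in the launch command.\n",
    "models = requests.get(\"http://localhost:30000/v1/models\")\n",
    "print_highlight(models.json())"
   ]
  },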
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Send a Request\n",
    "\n",
    "Once the server is running, you can send test requests using curl."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:32 TP0] Prefill batch. #new-seq: 1, #new-token: 47, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:33] INFO:     127.0.0.1:49914 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
      "[2024-10-29 21:14:33 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.85%, token usage: 0.00, #running-req: 1, #queue-req: 0\n",
      "[2024-10-29 21:14:33] INFO:     127.0.0.1:49916 - \"POST /generate HTTP/1.1\" 200 OK\n",
      "[2024-10-29 21:14:33] The server is fired up and ready to roll!\n",
      "[2024-10-29 21:14:33 TP0] Decode batch. #running-req: 1, #token: 87, token usage: 0.00, gen throughput (token/s): 27.00, #queue-req: 0\n",
      "[2024-10-29 21:14:34 TP0] Decode batch. #running-req: 1, #token: 127, token usage: 0.00, gen throughput (token/s): 42.50, #queue-req: 0\n",
      "[2024-10-29 21:14:35 TP0] Decode batch. #running-req: 1, #token: 167, token usage: 0.00, gen throughput (token/s): 42.31, #queue-req: 0\n",
      "[2024-10-29 21:14:36 TP0] Decode batch. #running-req: 1, #token: 207, token usage: 0.00, gen throughput (token/s): 42.29, #queue-req: 0\n",
      "[2024-10-29 21:14:37 TP0] Decode batch. #running-req: 1, #token: 247, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
      "[2024-10-29 21:14:38 TP0] Decode batch. #running-req: 1, #token: 287, token usage: 0.00, gen throughput (token/s): 42.34, #queue-req: 0\n",
      "[2024-10-29 21:14:39 TP0] Decode batch. #running-req: 1, #token: 327, token usage: 0.00, gen throughput (token/s): 42.30, #queue-req: 0\n",
      "[2024-10-29 21:14:40 TP0] Decode batch. #running-req: 1, #token: 367, token usage: 0.00, gen throughput (token/s): 42.32, #queue-req: 0\n",
      "[2024-10-29 21:14:41 TP0] Decode batch. #running-req: 1, #token: 407, token usage: 0.00, gen throughput (token/s): 42.23, #queue-req: 0\n",
      "[2024-10-29 21:14:42 TP0] Decode batch. #running-req: 1, #token: 447, token usage: 0.00, gen throughput (token/s): 42.25, #queue-req: 0\n",
      "[2024-10-29 21:14:43 TP0] Decode batch. #running-req: 1, #token: 487, token usage: 0.00, gen throughput (token/s): 42.22, #queue-req: 0\n",
      "[2024-10-29 21:14:43] INFO:     127.0.0.1:49902 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n",
      "{\"id\":\"0635a1c4d1d940f597b11482bed9595f\",\"object\":\"chat.completion\",\"created\":1730261683,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and understand human language. LLMs are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models like myself use natural language processing (NLP) and machine learning algorithms to analyze and generate human-like text. This enables us to:\\n\\n1. **Answer questions**: Provide information on a wide range of topics, from general knowledge to specialized domains.\\n2. **Generate text**: Create coherent and contextually relevant text, such as articles, essays, or even entire stories.\\n3. **Translate languages**: Translate text from one language to another, helping to break language barriers.\\n4. **Summarize content**: Condense long pieces of text into shorter, more digestible summaries.\\n5. **Chat and converse**: Engage in natural-sounding conversations, using context and understanding to respond to questions and statements.\\n\\nLarge language models are typically trained on massive datasets, often consisting of billions of parameters and petabytes of text data. This training enables us to learn complex language patterns, nuances, and context, allowing us to provide helpful and informative responses.\\n\\nSome popular examples of large language models include:\\n\\n1. **BERT (Bidirectional Encoder Representations from Transformers)**: Developed by Google, BERT is a foundational model for many language understanding tasks.\\n2. **RoBERTa (Robustly Optimized BERT Pretraining Approach)**: A variant of BERT, developed by Facebook AI, which improved upon the original model's performance.\\n3. **Transformers**: A family of models developed by Google, which includes BERT and other related architectures.\\n\\nThese models have revolutionized the field of natural language processing and have many exciting applications in areas like:\\n\\n1. **Virtual assistants**: Like Siri, Alexa, or myself, which can understand and respond to voice commands.\\n2. **Language translation**: Enabling real-time translation of languages.\\n3. **Content generation**: Creating original text, such as articles, stories, or even entire books.\\n4. **Customer service**: Providing 24/7 support and answering common customer queries.\\n\\nI hope this helps you understand what a Large Language Model is and its capabilities!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":504,\"completion_tokens\":457,\"prompt_tokens_details\":null}}"
     ]
    }
   ],
   "source": [
    "!curl http://localhost:30000/v1/chat/completions \\\n",
    "  -H \"Content-Type: application/json\" \\\n",
    "  -H \"Authorization: Bearer None\" \\\n",
    "  -d '{\"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\", \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What is a LLM?\"}]}'"
   ]
  },
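  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The same request can also be sent from Python. The sketch below is an added illustration, assuming the `requests` package is installed; it posts the exact JSON body from the curl command above to the same `/v1/chat/completions` endpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "# Python equivalent of the curl command above (illustrative sketch).\n",
    "response = requests.post(\n",
    "    \"http://localhost:30000/v1/chat/completions\",\n",
    "    json={\n",
    "        \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "        \"messages\": [\n",
    "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
    "            {\"role\": \"user\", \"content\": \"What is a LLM?\"},\n",
    "        ],\n",
    "    },\n",
    ")\n",
    "print_highlight(response.json())"
   ]
  },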
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using OpenAI Compatible API\n",
    "\n",
    "SGLang supports OpenAI-compatible APIs. Here are Python examples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:44 TP0] Prefill batch. #new-seq: 1, #new-token: 20, #cached-token: 29, cache hit rate: 29.13%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-29 21:14:44 TP0] Decode batch. #running-req: 1, #token: 73, token usage: 0.00, gen throughput (token/s): 26.00, #queue-req: 0\n",
      "[2024-10-29 21:14:45] INFO:     127.0.0.1:52764 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>ChatCompletion(id='994dd35133d34f57951a102c7470464f', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730261685, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import openai\n",
    "\n",
    "# Always assign an api_key, even if not specified during server initialization.\n",
    "# Setting an API key during server initialization is strongly recommended.\n",
    "\n",
    "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "# Chat completion example\n",
    "\n",
    "response = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
    "    ],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
    "print_highlight(response)"
   ]
  },
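  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The OpenAI client can also stream tokens as they are generated. The sketch below is an added illustration that reuses the `client` created above and assumes the server streams OpenAI-compatible responses when `stream=True` is set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Streaming chat completion example (illustrative sketch).\n",
    "stream = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[{\"role\": \"user\", \"content\": \"Count from 1 to 5.\"}],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    "    stream=True,\n",
    ")\n",
    "for chunk in stream:\n",
    "    # Each chunk carries an incremental delta; content can be None on the final chunk.\n",
    "    if chunk.choices and chunk.choices[0].delta.content:\n",
    "        print(chunk.choices[0].delta.content, end=\"\", flush=True)"
   ]
  },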
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-29 21:14:45] INFO:     Shutting down\n",
      "[2024-10-29 21:14:45] INFO:     Waiting for application shutdown.\n",
      "[2024-10-29 21:14:45] INFO:     Application shutdown complete.\n",
      "[2024-10-29 21:14:45] INFO:     Finished server process [2661188]\n",
      "W1029 21:14:45.740000 139643311699520 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
     ]
    }
   ],
   "source": [
    "terminate_process(server_process)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "AlphaMeemory",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}