{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# OpenAI Compatible API\n", "\n", "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n", "\n", "This tutorial aims at these popular APIs:\n", "\n", "- `chat/completions`\n", "- `completions`\n", "- `batches`\n", "- `embeddings`(refer to [embedding_model.ipynb](embedding_model.ipynb))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Chat Completions\n", "\n", "### Usage\n", "\n", "Similar to [send_request.ipynb](send_request.ipynb), we can send a chat completion request to SGLang server with OpenAI API format." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", " warnings.warn(\n", "[2024-10-28 02:02:31] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=800169736, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n", "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", " warnings.warn(\n", "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. 
Use `HF_HOME` instead.\n", " warnings.warn(\n", "[2024-10-28 02:02:36 TP0] Init torch distributed begin.\n", "[2024-10-28 02:02:37 TP0] Load weight begin. avail mem=47.27 GB\n", "[2024-10-28 02:02:37 TP0] Ignore import error when loading sglang.srt.models.mllama. No module named 'transformers.models.mllama'\n", "INFO 10-28 02:02:38 weight_utils.py:236] Using model weights format ['*.safetensors']\n", "Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00" ] }, { "data": { "text/html": [ "Server is ready. Proceeding with the next steps." ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from sglang.utils import (\n", " execute_shell_command,\n", " wait_for_server,\n", " terminate_process,\n", " print_highlight,\n", ")\n", "\n", "server_process = execute_shell_command(\n", " command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n", ")\n", "\n", "wait_for_server(\"http://localhost:30000\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 49, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-28 02:02:49] INFO: 127.0.0.1:47912 - \"GET /get_model_info HTTP/1.1\" 200 OK\n", "[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 1, #queue-req: 0\n", "[2024-10-28 02:02:49] INFO: 127.0.0.1:47926 - \"POST /generate HTTP/1.1\" 200 OK\n", "[2024-10-28 02:02:49] The server is fired up and ready to roll!\n", "[2024-10-28 02:02:50 TP0] Decode batch. #running-req: 1, #token: 89, token usage: 0.00, gen throughput (token/s): 24.12, #queue-req: 0\n", "[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Response: ChatCompletion(id='692899ebd3ea464dbb456008a7d60bf3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. 
**Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import openai\n", "\n", "# Always assign an api_key, even if not specified during server initialization.\n", "# Setting an API key during server initialization is strongly recommended.\n", "\n", "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", "\n", "# Chat completion example\n", "\n", "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " messages=[\n", " {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n", " {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n", " ],\n", " temperature=0,\n", " max_tokens=64,\n", ")\n", "\n", "print_highlight(f\"Response: {response}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parameters\n", "\n", "The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n", "\n", "Here is an example of a detailed chat completion request:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Response: ChatCompletion(id='bffa083869484c78ab89d334514d5af3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, prompt_tokens_details=None))" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "response = client.chat.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " messages=[\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n", " },\n", " {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n", " {\n", " \"role\": \"assistant\",\n", " \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n", " },\n", " {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n", " ],\n", " temperature=0.3, # Lower temperature for more focused responses\n", " max_tokens=100, # Reasonable length for a concise response\n", " top_p=0.95, # Slightly higher for better fluency\n", " stop=[\"\\n\\n\"], # Simple stop sequence\n", " presence_penalty=0.2, # Mild penalty to avoid repetition\n", " frequency_penalty=0.2, # Mild penalty for more natural language\n", " n=1, # Single response is 
usually more stable\n", " seed=42, # Keep for reproducibility\n", ")\n", "\n", "print_highlight(f\"Response: {response}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Completions\n", "\n", "### Usage\n", "\n", "The Completions API is similar to the Chat Completions API, but it takes a plain text `prompt` instead of the `messages` parameter." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-28 02:02:51 TP0] Decode batch. #running-req: 1, #token: 37, token usage: 0.00, gen throughput (token/s): 38.07, #queue-req: 0\n", "[2024-10-28 02:02:52] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Response: Completion(id='eb486d0a32fd4384baba923f3bc17e8b', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730106172, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " prompt=\"List 3 countries and their capitals.\",\n", " temperature=0,\n", " max_tokens=64,\n", " n=1,\n", " stop=None,\n", ")\n", "\n", "print_highlight(f\"Response: {response}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parameters\n", "\n", "The completions API accepts OpenAI Completions API's parameters. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n", "\n", "Here is an example of a detailed completions request:" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-28 02:02:52 TP0] Decode batch. #running-req: 1, #token: 15, token usage: 0.00, gen throughput (token/s): 40.91, #queue-req: 0\n", "[2024-10-28 02:02:53 TP0] Decode batch. #running-req: 1, #token: 55, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n", "[2024-10-28 02:02:54 TP0] Decode batch. #running-req: 1, #token: 95, token usage: 0.00, gen throughput (token/s): 42.10, #queue-req: 0\n", "[2024-10-28 02:02:55 TP0] Decode batch. #running-req: 1, #token: 135, token usage: 0.00, gen throughput (token/s): 41.94, #queue-req: 0\n", "[2024-10-28 02:02:55] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Response: Completion(id='fb23a12a15bc4137815b91d63b6fd976', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" Here is a short story about a space explorer.\\nAstrid had always been fascinated by the stars. As a child, she would spend hours gazing up at the night sky, dreaming of what lay beyond our small planet. 
Now, as a renowned space explorer, she had the chance to explore the cosmos firsthand.\\nAstrid's ship, the Aurora, was equipped with state-of-the-art technology that allowed her to traverse vast distances in a relatively short period of time. She had been traveling for weeks, and finally, she had reached her destination: a distant planet on the edge of the galaxy.\\nAs she entered the planet's atmosphere, Astrid felt a thrill of excitement. She had never seen anything like this before.\", matched_stop=None)], created=1730106175, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "response = client.completions.create(\n", " model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " prompt=\"Write a short story about a space explorer.\",\n", " temperature=0.7, # Moderate temperature for creative writing\n", " max_tokens=150, # Longer response for a story\n", " top_p=0.9, # Balanced diversity in word choice\n", " stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n", " presence_penalty=0.3, # Encourage novel elements\n", " frequency_penalty=0.3, # Reduce repetitive phrases\n", " n=1, # Generate one completion\n", " seed=123, # For reproducible results\n", ")\n", "\n", "print_highlight(f\"Response: {response}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Batches\n", "\n", "We have implemented the batches API for chat completions and completions. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n", "\n", "The batches APIs are:\n", "\n", "- `batches`\n", "- `batches/{batch_id}/cancel`\n", "- `batches/{batch_id}`\n", "\n", "Here is an example of a batch job for chat completions; the process for completions is similar.\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/files HTTP/1.1\" 200 OK\n", "[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/batches HTTP/1.1\" 200 OK\n", "[2024-10-28 02:02:55 TP0] Prefill batch. 
#new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n" ] }, { "data": { "text/html": [ "Batch job created with ID: batch_56fefd2e-0187-4c14-aa2d-110917723dde" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import json\n", "import time\n", "from openai import OpenAI\n", "\n", "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", "\n", "requests = [\n", " {\n", " \"custom_id\": \"request-1\",\n", " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " \"messages\": [\n", " {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n", " ],\n", " \"max_tokens\": 50,\n", " },\n", " },\n", " {\n", " \"custom_id\": \"request-2\",\n", " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n", " \"max_tokens\": 50,\n", " },\n", " },\n", "]\n", "\n", "input_file_path = \"batch_requests.jsonl\"\n", "\n", "with open(input_file_path, \"w\") as f:\n", " for req in requests:\n", " f.write(json.dumps(req) + \"\\n\")\n", "\n", "with open(input_file_path, \"rb\") as f:\n", " file_response = client.files.create(file=f, purpose=\"batch\")\n", "\n", "batch_response = client.batches.create(\n", " input_file_id=file_response.id,\n", " endpoint=\"/v1/chat/completions\",\n", " completion_window=\"24h\",\n", ")\n", "\n", "print_highlight(f\"Batch job created with ID: {batch_response.id}\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:56 TP0] Decode batch. 
#running-req: 2, #token: 82, token usage: 0.00, gen throughput (token/s): 55.10, #queue-req: 0\n", "Batch job status: validating...trying again in 3 seconds...\n", "[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/batches/batch_56fefd2e-0187-4c14-aa2d-110917723dde HTTP/1.1\" 200 OK\n", "Batch job completed successfully!\n", "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n", "[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4/content HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Request request-1:" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'A programmer walks into a library and asks the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replies, \"It rings a bell, but I\\'m not sure if it\\'s here'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 50, 'total_tokens': 91}, 'system_fingerprint': None}}" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Request request-2:" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n1. **Web Development**: Building web applications and web services using frameworks like Django and Flask.\\n2. **Data Analysis and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Cleaning up files..." 
], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"DELETE /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4 HTTP/1.1\" 200 OK\n" ] } ], "source": [ "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", " print(f\"Batch job status: {batch_response.status}...trying again in 3 seconds...\")\n", " batch_response = client.batches.retrieve(batch_response.id)\n", "\n", "if batch_response.status == \"completed\":\n", " print(\"Batch job completed successfully!\")\n", " print(f\"Request counts: {batch_response.request_counts}\")\n", "\n", " result_file_id = batch_response.output_file_id\n", " file_response = client.files.content(result_file_id)\n", " result_content = file_response.read().decode(\"utf-8\")\n", "\n", " results = [\n", " json.loads(line) for line in result_content.split(\"\\n\") if line.strip() != \"\"\n", " ]\n", "\n", " for result in results:\n", " print_highlight(f\"Request {result['custom_id']}:\")\n", " print_highlight(f\"Response: {result['response']}\")\n", "\n", " print_highlight(\"Cleaning up files...\")\n", " # Only delete the result file ID since file_response is just content\n", " client.files.delete(result_file_id)\n", "else:\n", " print_highlight(f\"Batch job failed with status: {batch_response.status}\")\n", " if hasattr(batch_response, \"errors\"):\n", " print_highlight(f\"Errors: {batch_response.errors}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It takes a while to complete the batch job. You can use these two APIs to retrieve the batch job status or cancel the batch job.\n", "\n", "1. `batches/{batch_id}`: Retrieve the batch job status.\n", "2. `batches/{batch_id}/cancel`: Cancel the batch job.\n", "\n", "Here is an example to check the batch job status." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/files HTTP/1.1\" 200 OK\n", "[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Created batch job with ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Initial status: validating" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 17, #new-token: 510, #cached-token: 425, cache hit rate: 43.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n", "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 83, #new-token: 2490, #cached-token: 2075, cache hit rate: 45.04%, token usage: 0.00, #running-req: 17, #queue-req: 0\n", "[2024-10-28 02:02:59 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 234.43, #queue-req: 0\n", "[2024-10-28 02:03:00 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 3545.41, #queue-req: 0\n", "[2024-10-28 02:03:01 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.05, gen throughput (token/s): 3448.10, #queue-req: 0\n", "[2024-10-28 02:03:02 TP0] Decode batch. 
#running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 3362.62, #queue-req: 0\n", "[2024-10-28 02:03:04 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 3279.58, #queue-req: 0\n", "[2024-10-28 02:03:05 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 3200.86, #queue-req: 0\n", "[2024-10-28 02:03:06 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 3126.52, #queue-req: 0\n", "[2024-10-28 02:03:07 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 3053.16, #queue-req: 0\n", "[2024-10-28 02:03:08] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Batch job details (check 1 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Request counts: Total: 0 // Completed: 0 // Failed: 0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:09 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.16, gen throughput (token/s): 2980.26, #queue-req: 0\n", "[2024-10-28 02:03:10 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.18, gen throughput (token/s): 2919.09, #queue-req: 0\n", "[2024-10-28 02:03:11] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Batch job details (check 2 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Request counts: Total: 0 // Completed: 0 // Failed: 0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:11 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.20, gen throughput (token/s): 2854.92, #queue-req: 0\n", "[2024-10-28 02:03:13 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.22, gen throughput (token/s): 2794.62, #queue-req: 0\n", "[2024-10-28 02:03:14] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Batch job details (check 3 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Request counts: Total: 0 // Completed: 0 // Failed: 0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:14 TP0] Decode batch. 
#running-req: 100, #token: 51725, token usage: 0.24, gen throughput (token/s): 2737.84, #queue-req: 0\n", "[2024-10-28 02:03:17] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Batch job details (check 4 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Request counts: Total: 100 // Completed: 100 // Failed: 0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:20] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Batch job details (check 5 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Request counts: Total: 100 // Completed: 100 // Failed: 0" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import json\n", "import time\n", "from openai import OpenAI\n", "\n", "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", "\n", "requests = []\n", "for i in range(100):\n", " requests.append(\n", " {\n", " \"custom_id\": f\"request-{i}\",\n", " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " \"messages\": [\n", " {\n", " \"role\": \"system\",\n", " \"content\": f\"{i}: You are a helpful AI assistant\",\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": \"Write a detailed story about topic. 
Make it very long.\",\n", " },\n", " ],\n", " \"max_tokens\": 500,\n", " },\n", " }\n", " )\n", "\n", "input_file_path = \"batch_requests.jsonl\"\n", "with open(input_file_path, \"w\") as f:\n", " for req in requests:\n", " f.write(json.dumps(req) + \"\\n\")\n", "\n", "with open(input_file_path, \"rb\") as f:\n", " uploaded_file = client.files.create(file=f, purpose=\"batch\")\n", "\n", "batch_job = client.batches.create(\n", " input_file_id=uploaded_file.id,\n", " endpoint=\"/v1/chat/completions\",\n", " completion_window=\"24h\",\n", ")\n", "\n", "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n", "print_highlight(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "max_checks = 5\n", "for i in range(max_checks):\n", " batch_details = client.batches.retrieve(batch_id=batch_job.id)\n", "\n", " print_highlight(\n", " f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n", " )\n", " print_highlight(\n", " f\"Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}\"\n", " )\n", "\n", " time.sleep(3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is an example to cancel a batch job." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/files HTTP/1.1\" 200 OK\n", "[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/batches HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Created batch job with ID: batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Initial status: validating" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 44, #new-token: 44, #cached-token: 2376, cache hit rate: 60.81%, token usage: 0.01, #running-req: 0, #queue-req: 0\n", "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 328, #new-token: 8192, #cached-token: 9824, cache hit rate: 56.49%, token usage: 0.01, #running-req: 44, #queue-req: 128\n", "[2024-10-28 02:03:24 TP0] Prefill batch. #new-seq: 129, #new-token: 3864, #cached-token: 3231, cache hit rate: 54.15%, token usage: 0.05, #running-req: 371, #queue-req: 1\n", "[2024-10-28 02:03:27 TP0] Decode batch. #running-req: 500, #token: 29025, token usage: 0.13, gen throughput (token/s): 1162.55, #queue-req: 0\n", "[2024-10-28 02:03:31 TP0] Decode batch. #running-req: 500, #token: 49025, token usage: 0.23, gen throughput (token/s): 5606.35, #queue-req: 0\n", "[2024-10-28 02:03:33] INFO: 127.0.0.1:40110 - \"POST /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62/cancel HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Cancellation initiated. 
Status: cancelling" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"GET /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62 HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Current status: cancelled" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Batch job successfully cancelled" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"DELETE /v1/files/backend_input_file-2e9608b6-981b-48ec-8adb-e653ffc69106 HTTP/1.1\" 200 OK\n" ] }, { "data": { "text/html": [ "Successfully cleaned up input file" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import json\n", "import time\n", "from openai import OpenAI\n", "\n", "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n", "\n", "requests = []\n", "for i in range(500):\n", " requests.append(\n", " {\n", " \"custom_id\": f\"request-{i}\",\n", " \"method\": \"POST\",\n", " \"url\": \"/chat/completions\",\n", " \"body\": {\n", " \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n", " \"messages\": [\n", " {\n", " \"role\": \"system\",\n", " \"content\": f\"{i}: You are a helpful AI assistant\",\n", " },\n", " {\n", " \"role\": \"user\",\n", " \"content\": \"Write a detailed story about topic. Make it very long.\",\n", " },\n", " ],\n", " \"max_tokens\": 500,\n", " },\n", " }\n", " )\n", "\n", "input_file_path = \"batch_requests.jsonl\"\n", "with open(input_file_path, \"w\") as f:\n", " for req in requests:\n", " f.write(json.dumps(req) + \"\\n\")\n", "\n", "with open(input_file_path, \"rb\") as f:\n", " uploaded_file = client.files.create(file=f, purpose=\"batch\")\n", "\n", "batch_job = client.batches.create(\n", " input_file_id=uploaded_file.id,\n", " endpoint=\"/v1/chat/completions\",\n", " completion_window=\"24h\",\n", ")\n", "\n", "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n", "print_highlight(f\"Initial status: {batch_job.status}\")\n", "\n", "time.sleep(10)\n", "\n", "try:\n", " cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n", " print_highlight(f\"Cancellation initiated. 
Status: {cancelled_job.status}\")\n", " assert cancelled_job.status == \"cancelling\"\n", "\n", " # Monitor the cancellation process\n", " while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n", " time.sleep(3)\n", " cancelled_job = client.batches.retrieve(batch_job.id)\n", " print_highlight(f\"Current status: {cancelled_job.status}\")\n", "\n", " # Verify final status\n", " assert cancelled_job.status == \"cancelled\"\n", " print_highlight(\"Batch job successfully cancelled\")\n", "\n", "except Exception as e:\n", " print_highlight(f\"Error during cancellation: {e}\")\n", " raise e\n", "\n", "finally:\n", " try:\n", " del_response = client.files.delete(uploaded_file.id)\n", " if del_response.deleted:\n", " print_highlight(\"Successfully cleaned up input file\")\n", " except Exception as e:\n", " print_highlight(f\"Error cleaning up: {e}\")\n", " raise e" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2024-10-28 02:03:36] INFO: Shutting down\n", "[2024-10-28 02:03:36] INFO: Waiting for application shutdown.\n", "[2024-10-28 02:03:36] INFO: Application shutdown complete.\n", "[2024-10-28 02:03:36] INFO: Finished server process [1185529]\n", "W1028 02:03:37.084000 140231994889792 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n" ] } ], "source": [ "terminate_process(server_process)" ] } ], "metadata": { "kernelspec": { "display_name": "AlphaMeemory", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 2 }