{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# OpenAI Compatible API\n",
"\n",
"SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
"\n",
"This tutorial aims at these popular APIs:\n",
"\n",
"- `chat/completions`\n",
"- `completions`\n",
"- `batches`\n",
"- `embeddings`(refer to [embedding_model.ipynb](embedding_model.ipynb))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Chat Completions\n",
"\n",
"### Usage\n",
"\n",
"Similar to [send_request.ipynb](send_request.ipynb), we can send a chat completion request to SGLang server with OpenAI API format."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"[2024-10-28 02:02:31] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=800169736, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
" warnings.warn(\n",
"[2024-10-28 02:02:36 TP0] Init torch distributed begin.\n",
"[2024-10-28 02:02:37 TP0] Load weight begin. avail mem=47.27 GB\n",
"[2024-10-28 02:02:37 TP0] Ignore import error when loading sglang.srt.models.mllama. No module named 'transformers.models.mllama'\n",
"INFO 10-28 02:02:38 weight_utils.py:236] Using model weights format ['*.safetensors']\n",
"Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00, ?it/s]\n",
"Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:00<00:01, 2.57it/s]\n",
"Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:00<00:00, 2.45it/s]\n",
"Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:00<00:00, 3.53it/s]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.98it/s]\n",
"Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00, 2.94it/s]\n",
"\n",
"[2024-10-28 02:02:40 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
"[2024-10-28 02:02:40 TP0] Memory pool end. avail mem=4.60 GB\n",
"[2024-10-28 02:02:40 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
"[2024-10-28 02:02:48 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
"[2024-10-28 02:02:48] INFO: Started server process [1185529]\n",
"[2024-10-28 02:02:48] INFO: Waiting for application startup.\n",
"[2024-10-28 02:02:48] INFO: Application startup complete.\n",
"[2024-10-28 02:02:48] INFO: Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
"[2024-10-28 02:02:48] INFO: 127.0.0.1:47904 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Server is ready. Proceeding with the next steps."
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sglang.utils import (\n",
" execute_shell_command,\n",
" wait_for_server,\n",
" terminate_process,\n",
" print_highlight,\n",
")\n",
"\n",
"server_process = execute_shell_command(\n",
" command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
")\n",
"\n",
"wait_for_server(\"http://localhost:30000\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 49, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:49] INFO: 127.0.0.1:47912 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 1, #queue-req: 0\n",
"[2024-10-28 02:02:49] INFO: 127.0.0.1:47926 - \"POST /generate HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:49] The server is fired up and ready to roll!\n",
"[2024-10-28 02:02:50 TP0] Decode batch. #running-req: 1, #token: 89, token usage: 0.00, gen throughput (token/s): 24.12, #queue-req: 0\n",
"[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Response: ChatCompletion(id='692899ebd3ea464dbb456008a7d60bf3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** BrasÃlia', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import openai\n",
"\n",
"# Always assign an api_key, even if not specified during server initialization.\n",
"# Setting an API key during server initialization is strongly recommended.\n",
"\n",
"client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n",
"# Chat completion example\n",
"\n",
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
" {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
" ],\n",
" temperature=0,\n",
" max_tokens=64,\n",
")\n",
"\n",
"print_highlight(f\"Response: {response}\")"
]
},
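{
"cell_type": "markdown",
"metadata": {},
"source": [
"The response above is the full `ChatCompletion` object, which also carries token usage metadata. To extract just the assistant's reply, read `choices[0].message.content`. The cell below is a minimal sketch that reuses the `client` and `response` objects from the previous cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extract only the generated text and the token usage from the previous response.\n",
"answer = response.choices[0].message.content\n",
"print_highlight(f\"Assistant reply: {answer}\")\n",
"print_highlight(\n",
"    f\"Prompt tokens: {response.usage.prompt_tokens}, \"\n",
"    f\"completion tokens: {response.usage.completion_tokens}\"\n",
")"
]
},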
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parameters\n",
"\n",
"The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
"\n",
"Here is an example of a detailed chat completion request:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:50] INFO: 127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Response: ChatCompletion(id='bffa083869484c78ab89d334514d5af3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, prompt_tokens_details=None))"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"response = client.chat.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" messages=[\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
" },\n",
" {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
" {\n",
" \"role\": \"assistant\",\n",
" \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
" },\n",
" {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
" ],\n",
" temperature=0.3, # Lower temperature for more focused responses\n",
" max_tokens=100, # Reasonable length for a concise response\n",
" top_p=0.95, # Slightly higher for better fluency\n",
" stop=[\"\\n\\n\"], # Simple stop sequence\n",
" presence_penalty=0.2, # Mild penalty to avoid repetition\n",
" frequency_penalty=0.2, # Mild penalty for more natural language\n",
" n=1, # Single response is usually more stable\n",
" seed=42, # Keep for reproducibility\n",
")\n",
"\n",
"print_highlight(f\"Response: {response}\")"
]
},
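{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Streaming\n",
"\n",
"The chat completions endpoint can also stream tokens as they are generated by passing `stream=True`. The cell below is a minimal sketch (not executed here) that reuses the `client` defined earlier; each streamed chunk carries the newly generated text in `choices[0].delta.content`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Streaming chat completion (sketch): stream=True yields incremental chunks.\n",
"stream = client.chat.completions.create(\n",
"    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
"    messages=[\n",
"        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
"        {\"role\": \"user\", \"content\": \"Count from 1 to 5 in words.\"},\n",
"    ],\n",
"    temperature=0,\n",
"    max_tokens=64,\n",
"    stream=True,\n",
")\n",
"\n",
"for chunk in stream:\n",
"    # Skip chunks without text (e.g., the initial role-only delta).\n",
"    if chunk.choices and chunk.choices[0].delta.content:\n",
"        print(chunk.choices[0].delta.content, end=\"\", flush=True)\n",
"print()"
]
},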
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Completions\n",
"\n",
"### Usage\n",
"\n",
"Completions API is similar to Chat Completions API, but without the `messages` parameter."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:51 TP0] Decode batch. #running-req: 1, #token: 37, token usage: 0.00, gen throughput (token/s): 38.07, #queue-req: 0\n",
"[2024-10-28 02:02:52] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Response: Completion(id='eb486d0a32fd4384baba923f3bc17e8b', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1. United States - Washington D.C. 2. Japan - Tokyo 3. Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1. China - Beijing 2. Brazil - Bras', matched_stop=None)], created=1730106172, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"response = client.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" prompt=\"List 3 countries and their capitals.\",\n",
" temperature=0,\n",
" max_tokens=64,\n",
" n=1,\n",
" stop=None,\n",
")\n",
"\n",
"print_highlight(f\"Response: {response}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parameters\n",
"\n",
"The completions API accepts OpenAI Completions API's parameters. Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
"\n",
"Here is an example of a detailed completions request:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:52 TP0] Decode batch. #running-req: 1, #token: 15, token usage: 0.00, gen throughput (token/s): 40.91, #queue-req: 0\n",
"[2024-10-28 02:02:53 TP0] Decode batch. #running-req: 1, #token: 55, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n",
"[2024-10-28 02:02:54 TP0] Decode batch. #running-req: 1, #token: 95, token usage: 0.00, gen throughput (token/s): 42.10, #queue-req: 0\n",
"[2024-10-28 02:02:55 TP0] Decode batch. #running-req: 1, #token: 135, token usage: 0.00, gen throughput (token/s): 41.94, #queue-req: 0\n",
"[2024-10-28 02:02:55] INFO: 127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Response: Completion(id='fb23a12a15bc4137815b91d63b6fd976', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" Here is a short story about a space explorer named Astrid.\\nAstrid had always been fascinated by the stars. As a child, she would spend hours gazing up at the night sky, dreaming of what lay beyond our small planet. Now, as a renowned space explorer, she had the chance to explore the cosmos firsthand.\\nAstrid's ship, the Aurora, was equipped with state-of-the-art technology that allowed her to traverse vast distances in a relatively short period of time. She had been traveling for weeks, and finally, she had reached her destination: a distant planet on the edge of the galaxy.\\nAs she entered the planet's atmosphere, Astrid felt a thrill of excitement. She had never seen anything like this before.\", matched_stop=None)], created=1730106175, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"response = client.completions.create(\n",
" model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" prompt=\"Write a short story about a space explorer.\",\n",
" temperature=0.7, # Moderate temperature for creative writing\n",
" max_tokens=150, # Longer response for a story\n",
" top_p=0.9, # Balanced diversity in word choice\n",
" stop=[\"\\n\\n\", \"THE END\"], # Multiple stop sequences\n",
" presence_penalty=0.3, # Encourage novel elements\n",
" frequency_penalty=0.3, # Reduce repetitive phrases\n",
" n=1, # Generate one completion\n",
" seed=123, # For reproducible results\n",
")\n",
"\n",
"print_highlight(f\"Response: {response}\")"
]
},
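{
"cell_type": "markdown",
"metadata": {},
"source": [
"To sample several alternative completions for the same prompt, set `n` to a value greater than one and iterate over `response.choices`. The cell below is a minimal sketch (not executed here); it reuses the `client` defined earlier and assumes the server supports the `n` parameter for parallel sampling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parallel sampling (sketch): request several completions for one prompt.\n",
"response = client.completions.create(\n",
"    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
"    prompt=\"Suggest a name for a friendly robot assistant:\",\n",
"    temperature=0.8,  # Non-zero temperature so the samples differ\n",
"    max_tokens=16,\n",
"    n=3,  # Number of completions to sample\n",
")\n",
"\n",
"for choice in response.choices:\n",
"    print_highlight(f\"Candidate {choice.index}: {choice.text.strip()}\")"
]
},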
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Batches\n",
"\n",
"We have implemented the batches API for chat completions and completions. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n",
"\n",
"The batches APIs are:\n",
"\n",
"- `batches`\n",
"- `batches/{batch_id}/cancel`\n",
"- `batches/{batch_id}`\n",
"\n",
"Here is an example of a batch job for chat completions, completions are similar.\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:55] INFO: 127.0.0.1:43330 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:55 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
]
},
{
"data": {
"text/html": [
"Batch job created with ID: batch_56fefd2e-0187-4c14-aa2d-110917723dde"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import json\n",
"import time\n",
"from openai import OpenAI\n",
"\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n",
"requests = [\n",
" {\n",
" \"custom_id\": \"request-1\",\n",
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"messages\": [\n",
" {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n",
" ],\n",
" \"max_tokens\": 50,\n",
" },\n",
" },\n",
" {\n",
" \"custom_id\": \"request-2\",\n",
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n",
" \"max_tokens\": 50,\n",
" },\n",
" },\n",
"]\n",
"\n",
"input_file_path = \"batch_requests.jsonl\"\n",
"\n",
"with open(input_file_path, \"w\") as f:\n",
" for req in requests:\n",
" f.write(json.dumps(req) + \"\\n\")\n",
"\n",
"with open(input_file_path, \"rb\") as f:\n",
" file_response = client.files.create(file=f, purpose=\"batch\")\n",
"\n",
"batch_response = client.batches.create(\n",
" input_file_id=file_response.id,\n",
" endpoint=\"/v1/chat/completions\",\n",
" completion_window=\"24h\",\n",
")\n",
"\n",
"print_highlight(f\"Batch job created with ID: {batch_response.id}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:56 TP0] Decode batch. #running-req: 2, #token: 82, token usage: 0.00, gen throughput (token/s): 55.10, #queue-req: 0\n",
"Batch job status: validating...trying again in 3 seconds...\n",
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/batches/batch_56fefd2e-0187-4c14-aa2d-110917723dde HTTP/1.1\" 200 OK\n",
"Batch job completed successfully!\n",
"Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"GET /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4/content HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Request request-1:"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'A programmer walks into a library and asks the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replies, \"It rings a bell, but I\\'m not sure if it\\'s here'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 50, 'total_tokens': 91}, 'system_fingerprint': None}}"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Request request-2:"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n1. **Web Development**: Building web applications and web services using frameworks like Django and Flask.\\n2. **Data Analysis and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Cleaning up files..."
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43330 - \"DELETE /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4 HTTP/1.1\" 200 OK\n"
]
}
],
"source": [
"while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n",
" time.sleep(3)\n",
" print(f\"Batch job status: {batch_response.status}...trying again in 3 seconds...\")\n",
" batch_response = client.batches.retrieve(batch_response.id)\n",
"\n",
"if batch_response.status == \"completed\":\n",
" print(\"Batch job completed successfully!\")\n",
" print(f\"Request counts: {batch_response.request_counts}\")\n",
"\n",
" result_file_id = batch_response.output_file_id\n",
" file_response = client.files.content(result_file_id)\n",
" result_content = file_response.read().decode(\"utf-8\")\n",
"\n",
" results = [\n",
" json.loads(line) for line in result_content.split(\"\\n\") if line.strip() != \"\"\n",
" ]\n",
"\n",
" for result in results:\n",
" print_highlight(f\"Request {result['custom_id']}:\")\n",
" print_highlight(f\"Response: {result['response']}\")\n",
"\n",
" print_highlight(\"Cleaning up files...\")\n",
" # Only delete the result file ID since file_response is just content\n",
" client.files.delete(result_file_id)\n",
"else:\n",
" print_highlight(f\"Batch job failed with status: {batch_response.status}\")\n",
" if hasattr(batch_response, \"errors\"):\n",
" print_highlight(f\"Errors: {batch_response.errors}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It takes a while to complete the batch job. You can use these two APIs to retrieve the batch job status or cancel the batch job.\n",
"\n",
"1. `batches/{batch_id}`: Retrieve the batch job status.\n",
"2. `batches/{batch_id}/cancel`: Cancel the batch job.\n",
"\n",
"Here is an example to check the batch job status."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:02:58] INFO: 127.0.0.1:43336 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Created batch job with ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Initial status: validating"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 17, #new-token: 510, #cached-token: 425, cache hit rate: 43.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 83, #new-token: 2490, #cached-token: 2075, cache hit rate: 45.04%, token usage: 0.00, #running-req: 17, #queue-req: 0\n",
"[2024-10-28 02:02:59 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 234.43, #queue-req: 0\n",
"[2024-10-28 02:03:00 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 3545.41, #queue-req: 0\n",
"[2024-10-28 02:03:01 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.05, gen throughput (token/s): 3448.10, #queue-req: 0\n",
"[2024-10-28 02:03:02 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 3362.62, #queue-req: 0\n",
"[2024-10-28 02:03:04 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 3279.58, #queue-req: 0\n",
"[2024-10-28 02:03:05 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 3200.86, #queue-req: 0\n",
"[2024-10-28 02:03:06 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 3126.52, #queue-req: 0\n",
"[2024-10-28 02:03:07 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 3053.16, #queue-req: 0\n",
"[2024-10-28 02:03:08] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Batch job details (check 1 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Request counts: Total: 0 // Completed: 0 // Failed: 0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:09 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.16, gen throughput (token/s): 2980.26, #queue-req: 0\n",
"[2024-10-28 02:03:10 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.18, gen throughput (token/s): 2919.09, #queue-req: 0\n",
"[2024-10-28 02:03:11] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Batch job details (check 2 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Request counts: Total: 0 // Completed: 0 // Failed: 0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:11 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.20, gen throughput (token/s): 2854.92, #queue-req: 0\n",
"[2024-10-28 02:03:13 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.22, gen throughput (token/s): 2794.62, #queue-req: 0\n",
"[2024-10-28 02:03:14] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Batch job details (check 3 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Request counts: Total: 0 // Completed: 0 // Failed: 0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:14 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.24, gen throughput (token/s): 2737.84, #queue-req: 0\n",
"[2024-10-28 02:03:17] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Batch job details (check 4 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Request counts: Total: 100 // Completed: 100 // Failed: 0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:20] INFO: 127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Batch job details (check 5 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Request counts: Total: 100 // Completed: 100 // Failed: 0"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import json\n",
"import time\n",
"from openai import OpenAI\n",
"\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n",
"requests = []\n",
"for i in range(100):\n",
" requests.append(\n",
" {\n",
" \"custom_id\": f\"request-{i}\",\n",
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": f\"{i}: You are a helpful AI assistant\",\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
" },\n",
" ],\n",
" \"max_tokens\": 500,\n",
" },\n",
" }\n",
" )\n",
"\n",
"input_file_path = \"batch_requests.jsonl\"\n",
"with open(input_file_path, \"w\") as f:\n",
" for req in requests:\n",
" f.write(json.dumps(req) + \"\\n\")\n",
"\n",
"with open(input_file_path, \"rb\") as f:\n",
" uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
"\n",
"batch_job = client.batches.create(\n",
" input_file_id=uploaded_file.id,\n",
" endpoint=\"/v1/chat/completions\",\n",
" completion_window=\"24h\",\n",
")\n",
"\n",
"print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
"print_highlight(f\"Initial status: {batch_job.status}\")\n",
"\n",
"time.sleep(10)\n",
"\n",
"max_checks = 5\n",
"for i in range(max_checks):\n",
" batch_details = client.batches.retrieve(batch_id=batch_job.id)\n",
"\n",
" print_highlight(\n",
" f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n",
" )\n",
" print_highlight(\n",
" f\"Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}\"\n",
" )\n",
"\n",
" time.sleep(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is an example to cancel a batch job."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
"[2024-10-28 02:03:23] INFO: 127.0.0.1:47360 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Created batch job with ID: batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Initial status: validating"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 44, #new-token: 44, #cached-token: 2376, cache hit rate: 60.81%, token usage: 0.01, #running-req: 0, #queue-req: 0\n",
"[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 328, #new-token: 8192, #cached-token: 9824, cache hit rate: 56.49%, token usage: 0.01, #running-req: 44, #queue-req: 128\n",
"[2024-10-28 02:03:24 TP0] Prefill batch. #new-seq: 129, #new-token: 3864, #cached-token: 3231, cache hit rate: 54.15%, token usage: 0.05, #running-req: 371, #queue-req: 1\n",
"[2024-10-28 02:03:27 TP0] Decode batch. #running-req: 500, #token: 29025, token usage: 0.13, gen throughput (token/s): 1162.55, #queue-req: 0\n",
"[2024-10-28 02:03:31 TP0] Decode batch. #running-req: 500, #token: 49025, token usage: 0.23, gen throughput (token/s): 5606.35, #queue-req: 0\n",
"[2024-10-28 02:03:33] INFO: 127.0.0.1:40110 - \"POST /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62/cancel HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Cancellation initiated. Status: cancelling"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"GET /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Current status: cancelled"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Batch job successfully cancelled"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:36] INFO: 127.0.0.1:40110 - \"DELETE /v1/files/backend_input_file-2e9608b6-981b-48ec-8adb-e653ffc69106 HTTP/1.1\" 200 OK\n"
]
},
{
"data": {
"text/html": [
"Successfully cleaned up input file"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import json\n",
"import time\n",
"from openai import OpenAI\n",
"\n",
"client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
"\n",
"requests = []\n",
"for i in range(500):\n",
" requests.append(\n",
" {\n",
" \"custom_id\": f\"request-{i}\",\n",
" \"method\": \"POST\",\n",
" \"url\": \"/chat/completions\",\n",
" \"body\": {\n",
" \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"messages\": [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": f\"{i}: You are a helpful AI assistant\",\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
" },\n",
" ],\n",
" \"max_tokens\": 500,\n",
" },\n",
" }\n",
" )\n",
"\n",
"input_file_path = \"batch_requests.jsonl\"\n",
"with open(input_file_path, \"w\") as f:\n",
" for req in requests:\n",
" f.write(json.dumps(req) + \"\\n\")\n",
"\n",
"with open(input_file_path, \"rb\") as f:\n",
" uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
"\n",
"batch_job = client.batches.create(\n",
" input_file_id=uploaded_file.id,\n",
" endpoint=\"/v1/chat/completions\",\n",
" completion_window=\"24h\",\n",
")\n",
"\n",
"print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
"print_highlight(f\"Initial status: {batch_job.status}\")\n",
"\n",
"time.sleep(10)\n",
"\n",
"try:\n",
" cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n",
" print_highlight(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
" assert cancelled_job.status == \"cancelling\"\n",
"\n",
" # Monitor the cancellation process\n",
" while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n",
" time.sleep(3)\n",
" cancelled_job = client.batches.retrieve(batch_job.id)\n",
" print_highlight(f\"Current status: {cancelled_job.status}\")\n",
"\n",
" # Verify final status\n",
" assert cancelled_job.status == \"cancelled\"\n",
" print_highlight(\"Batch job successfully cancelled\")\n",
"\n",
"except Exception as e:\n",
" print_highlight(f\"Error during cancellation: {e}\")\n",
" raise e\n",
"\n",
"finally:\n",
" try:\n",
" del_response = client.files.delete(uploaded_file.id)\n",
" if del_response.deleted:\n",
" print_highlight(\"Successfully cleaned up input file\")\n",
" except Exception as e:\n",
" print_highlight(f\"Error cleaning up: {e}\")\n",
" raise e"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-10-28 02:03:36] INFO: Shutting down\n",
"[2024-10-28 02:03:36] INFO: Waiting for application shutdown.\n",
"[2024-10-28 02:03:36] INFO: Application shutdown complete.\n",
"[2024-10-28 02:03:36] INFO: Finished server process [1185529]\n",
"W1028 02:03:37.084000 140231994889792 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
]
}
],
"source": [
"terminate_process(server_process)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "AlphaMeemory",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}