openai_api.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# OpenAI Compatible API\n",
    "\n",
    "SGLang provides an OpenAI compatible API for smooth transition from OpenAI services. Full reference of the API is available at [OpenAI API Reference](https://platform.openai.com/docs/api-reference).\n",
    "\n",
    "This tutorial aims at these popular APIs:\n",
    "\n",
    "- `chat/completions`\n",
    "- `completions`\n",
    "- `batches`\n",
    "- `embeddings`(refer to [embedding_model.ipynb](embedding_model.ipynb))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chat Completions\n",
    "\n",
    "### Usage\n",
    "\n",
    "Similar to [send_request.ipynb](send_request.ipynb), we can send a chat completion request to SGLang server with OpenAI API format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "[2024-10-28 02:02:31] server_args=ServerArgs(model_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_path='meta-llama/Meta-Llama-3.1-8B-Instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, context_length=None, device='cuda', served_model_name='meta-llama/Meta-Llama-3.1-8B-Instruct', chat_template=None, is_embedding=False, host='0.0.0.0', port=30000, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='lpm', schedule_conservativeness=1.0, tp_size=1, stream_interval=1, random_seed=800169736, constrained_json_whitespace_pattern=None, log_level='info', log_level_http=None, log_requests=False, show_time_cost=False, api_key=None, file_storage_pth='SGLang_storage', enable_cache_report=False, watchdog_timeout=600, dp_size=1, load_balance_method='round_robin', dist_init_addr=None, nnodes=1, node_rank=0, json_model_override_args='{}', enable_double_sparsity=False, ds_channel_config_path=None, ds_heavy_channel_num=32, ds_heavy_token_num=256, ds_heavy_channel_type='qk', ds_sparse_decode_threshold=4096, lora_paths=None, max_loras_per_batch=8, attention_backend='flashinfer', sampling_backend='flashinfer', grammar_backend='outlines', disable_flashinfer=False, disable_flashinfer_sampling=False, disable_radix_cache=False, disable_regex_jump_forward=False, disable_cuda_graph=False, disable_cuda_graph_padding=False, disable_disk_cache=False, disable_custom_all_reduce=False, disable_mla=False, disable_penalizer=False, disable_nan_detection=False, enable_overlap_schedule=False, enable_mixed_chunk=False, enable_torch_compile=False, torch_compile_max_bs=32, cuda_graph_max_bs=160, torchao_config='', enable_p2p_check=False, triton_attention_reduce_in_fp32=False, num_continuous_decode_steps=1)\n",
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "/home/chenyang/miniconda3/envs/AlphaMeemory/lib/python3.11/site-packages/transformers/utils/hub.py:127: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n",
      "  warnings.warn(\n",
      "[2024-10-28 02:02:36 TP0] Init torch distributed begin.\n",
      "[2024-10-28 02:02:37 TP0] Load weight begin. avail mem=47.27 GB\n",
      "[2024-10-28 02:02:37 TP0] Ignore import error when loading sglang.srt.models.mllama. No module named 'transformers.models.mllama'\n",
      "INFO 10-28 02:02:38 weight_utils.py:236] Using model weights format ['*.safetensors']\n",
      "Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]\n",
      "Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  2.57it/s]\n",
      "Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:00<00:00,  2.45it/s]\n",
      "Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:00<00:00,  3.53it/s]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.98it/s]\n",
      "Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:01<00:00,  2.94it/s]\n",
      "\n",
      "[2024-10-28 02:02:40 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=32.22 GB\n",
      "[2024-10-28 02:02:40 TP0] Memory pool end. avail mem=4.60 GB\n",
      "[2024-10-28 02:02:40 TP0] Capture cuda graph begin. This can take up to several minutes.\n",
      "[2024-10-28 02:02:48 TP0] max_total_num_tokens=217512, max_prefill_tokens=16384, max_running_requests=2049, context_len=131072\n",
      "[2024-10-28 02:02:48] INFO:     Started server process [1185529]\n",
      "[2024-10-28 02:02:48] INFO:     Waiting for application startup.\n",
      "[2024-10-28 02:02:48] INFO:     Application startup complete.\n",
      "[2024-10-28 02:02:48] INFO:     Uvicorn running on http://0.0.0.0:30000 (Press CTRL+C to quit)\n",
      "[2024-10-28 02:02:48] INFO:     127.0.0.1:47904 - \"GET /v1/models HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Server is ready. Proceeding with the next steps.</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from sglang.utils import (\n",
    "    execute_shell_command,\n",
    "    wait_for_server,\n",
    "    terminate_process,\n",
    "    print_highlight,\n",
    ")\n",
    "\n",
    "server_process = execute_shell_command(\n",
    "    command=\"python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --host 0.0.0.0\"\n",
    ")\n",
    "\n",
    "wait_for_server(\"http://localhost:30000\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 49, #cached-token: 0, cache hit rate: 0.00%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-28 02:02:49] INFO:     127.0.0.1:47912 - \"GET /get_model_info HTTP/1.1\" 200 OK\n",
      "[2024-10-28 02:02:49 TP0] Prefill batch. #new-seq: 1, #new-token: 6, #cached-token: 1, cache hit rate: 1.79%, token usage: 0.00, #running-req: 1, #queue-req: 0\n",
      "[2024-10-28 02:02:49] INFO:     127.0.0.1:47926 - \"POST /generate HTTP/1.1\" 200 OK\n",
      "[2024-10-28 02:02:49] The server is fired up and ready to roll!\n",
      "[2024-10-28 02:02:50 TP0] Decode batch. #running-req: 1, #token: 89, token usage: 0.00, gen throughput (token/s): 24.12, #queue-req: 0\n",
      "[2024-10-28 02:02:50] INFO:     127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Response: ChatCompletion(id='692899ebd3ea464dbb456008a7d60bf3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import openai\n",
    "\n",
    "# Always assign an api_key, even if not specified during server initialization.\n",
    "# Setting an API key during server initialization is strongly recommended.\n",
    "\n",
    "client = openai.Client(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "# Chat completion example\n",
    "\n",
    "response = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[\n",
    "        {\"role\": \"system\", \"content\": \"You are a helpful AI assistant\"},\n",
    "        {\"role\": \"user\", \"content\": \"List 3 countries and their capitals.\"},\n",
    "    ],\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    ")\n",
    "\n",
    "print_highlight(f\"Response: {response}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameters\n",
    "\n",
    "The chat completions API accepts OpenAI Chat Completions API's parameters. Refer to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create) for more details.\n",
    "\n",
    "Here is an example of a detailed chat completion request:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 48, #cached-token: 28, cache hit rate: 21.97%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-28 02:02:50] INFO:     127.0.0.1:47910 - \"POST /v1/chat/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Response: ChatCompletion(id='bffa083869484c78ab89d334514d5af3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=\"Ancient Rome's major achievements include:\", refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop='\\n\\n')], created=1730106170, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=8, prompt_tokens=76, total_tokens=84, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "response = client.chat.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    messages=[\n",
    "        {\n",
    "            \"role\": \"system\",\n",
    "            \"content\": \"You are a knowledgeable historian who provides concise responses.\",\n",
    "        },\n",
    "        {\"role\": \"user\", \"content\": \"Tell me about ancient Rome\"},\n",
    "        {\n",
    "            \"role\": \"assistant\",\n",
    "            \"content\": \"Ancient Rome was a civilization centered in Italy.\",\n",
    "        },\n",
    "        {\"role\": \"user\", \"content\": \"What were their major achievements?\"},\n",
    "    ],\n",
    "    temperature=0.3,  # Lower temperature for more focused responses\n",
    "    max_tokens=100,  # Reasonable length for a concise response\n",
    "    top_p=0.95,  # Slightly higher for better fluency\n",
    "    stop=[\"\\n\\n\"],  # Simple stop sequence\n",
    "    presence_penalty=0.2,  # Mild penalty to avoid repetition\n",
    "    frequency_penalty=0.2,  # Mild penalty for more natural language\n",
    "    n=1,  # Single response is usually more stable\n",
    "    seed=42,  # Keep for reproducibility\n",
    ")\n",
    "\n",
    "print_highlight(f\"Response: {response}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Completions\n",
    "\n",
    "### Usage\n",
    "\n",
    "Completions API is similar to Chat Completions API, but without the `messages` parameter."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:50 TP0] Prefill batch. #new-seq: 1, #new-token: 8, #cached-token: 1, cache hit rate: 21.28%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-28 02:02:51 TP0] Decode batch. #running-req: 1, #token: 37, token usage: 0.00, gen throughput (token/s): 38.07, #queue-req: 0\n",
      "[2024-10-28 02:02:52] INFO:     127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Response: Completion(id='eb486d0a32fd4384baba923f3bc17e8b', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=' 1. 2. 3.\\n1.  United States - Washington D.C. 2.  Japan - Tokyo 3.  Australia - Canberra\\nList 3 countries and their capitals. 1. 2. 3.\\n1.  China - Beijing 2.  Brazil - Bras', matched_stop=None)], created=1730106172, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=64, prompt_tokens=9, total_tokens=73, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "response = client.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    prompt=\"List 3 countries and their capitals.\",\n",
    "    temperature=0,\n",
    "    max_tokens=64,\n",
    "    n=1,\n",
    "    stop=None,\n",
    ")\n",
    "\n",
    "print_highlight(f\"Response: {response}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameters\n",
    "\n",
    "The completions API accepts OpenAI Completions API's parameters.  Refer to [OpenAI Completions API](https://platform.openai.com/docs/api-reference/completions/create) for more details.\n",
    "\n",
    "Here is an example of a detailed completions request:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:52 TP0] Prefill batch. #new-seq: 1, #new-token: 9, #cached-token: 1, cache hit rate: 20.53%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-28 02:02:52 TP0] Decode batch. #running-req: 1, #token: 15, token usage: 0.00, gen throughput (token/s): 40.91, #queue-req: 0\n",
      "[2024-10-28 02:02:53 TP0] Decode batch. #running-req: 1, #token: 55, token usage: 0.00, gen throughput (token/s): 42.13, #queue-req: 0\n",
      "[2024-10-28 02:02:54 TP0] Decode batch. #running-req: 1, #token: 95, token usage: 0.00, gen throughput (token/s): 42.10, #queue-req: 0\n",
      "[2024-10-28 02:02:55 TP0] Decode batch. #running-req: 1, #token: 135, token usage: 0.00, gen throughput (token/s): 41.94, #queue-req: 0\n",
      "[2024-10-28 02:02:55] INFO:     127.0.0.1:47910 - \"POST /v1/completions HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Response: Completion(id='fb23a12a15bc4137815b91d63b6fd976', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text=\" Here is a short story about a space explorer named Astrid.\\nAstrid had always been fascinated by the stars. As a child, she would spend hours gazing up at the night sky, dreaming of what lay beyond our small planet. Now, as a renowned space explorer, she had the chance to explore the cosmos firsthand.\\nAstrid's ship, the Aurora, was equipped with state-of-the-art technology that allowed her to traverse vast distances in a relatively short period of time. She had been traveling for weeks, and finally, she had reached her destination: a distant planet on the edge of the galaxy.\\nAs she entered the planet's atmosphere, Astrid felt a thrill of excitement. She had never seen anything like this before.\", matched_stop=None)], created=1730106175, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=150, prompt_tokens=10, total_tokens=160, prompt_tokens_details=None))</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "response = client.completions.create(\n",
    "    model=\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "    prompt=\"Write a short story about a space explorer.\",\n",
    "    temperature=0.7,  # Moderate temperature for creative writing\n",
    "    max_tokens=150,  # Longer response for a story\n",
    "    top_p=0.9,  # Balanced diversity in word choice\n",
    "    stop=[\"\\n\\n\", \"THE END\"],  # Multiple stop sequences\n",
    "    presence_penalty=0.3,  # Encourage novel elements\n",
    "    frequency_penalty=0.3,  # Reduce repetitive phrases\n",
    "    n=1,  # Generate one completion\n",
    "    seed=123,  # For reproducible results\n",
    ")\n",
    "\n",
    "print_highlight(f\"Response: {response}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Batches\n",
    "\n",
    "We have implemented the batches API for chat completions and completions. You can upload your requests in `jsonl` files, create a batch job, and retrieve the results when the batch job is completed (which takes longer but costs less).\n",
    "\n",
    "The batches APIs are:\n",
    "\n",
    "- `batches`\n",
    "- `batches/{batch_id}/cancel`\n",
    "- `batches/{batch_id}`\n",
    "\n",
    "Here is an example of a batch job for chat completions, completions are similar.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:55] INFO:     127.0.0.1:43330 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
      "[2024-10-28 02:02:55] INFO:     127.0.0.1:43330 - \"POST /v1/batches HTTP/1.1\" 200 OK\n",
      "[2024-10-28 02:02:55 TP0] Prefill batch. #new-seq: 2, #new-token: 30, #cached-token: 50, cache hit rate: 35.06%, token usage: 0.00, #running-req: 0, #queue-req: 0\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Batch job created with ID: batch_56fefd2e-0187-4c14-aa2d-110917723dde</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import json\n",
    "import time\n",
    "from openai import OpenAI\n",
    "\n",
    "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "requests = [\n",
    "    {\n",
    "        \"custom_id\": \"request-1\",\n",
    "        \"method\": \"POST\",\n",
    "        \"url\": \"/chat/completions\",\n",
    "        \"body\": {\n",
    "            \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "            \"messages\": [\n",
    "                {\"role\": \"user\", \"content\": \"Tell me a joke about programming\"}\n",
    "            ],\n",
    "            \"max_tokens\": 50,\n",
    "        },\n",
    "    },\n",
    "    {\n",
    "        \"custom_id\": \"request-2\",\n",
    "        \"method\": \"POST\",\n",
    "        \"url\": \"/chat/completions\",\n",
    "        \"body\": {\n",
    "            \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "            \"messages\": [{\"role\": \"user\", \"content\": \"What is Python?\"}],\n",
    "            \"max_tokens\": 50,\n",
    "        },\n",
    "    },\n",
    "]\n",
    "\n",
    "input_file_path = \"batch_requests.jsonl\"\n",
    "\n",
    "with open(input_file_path, \"w\") as f:\n",
    "    for req in requests:\n",
    "        f.write(json.dumps(req) + \"\\n\")\n",
    "\n",
    "with open(input_file_path, \"rb\") as f:\n",
    "    file_response = client.files.create(file=f, purpose=\"batch\")\n",
    "\n",
    "batch_response = client.batches.create(\n",
    "    input_file_id=file_response.id,\n",
    "    endpoint=\"/v1/chat/completions\",\n",
    "    completion_window=\"24h\",\n",
    ")\n",
    "\n",
    "print_highlight(f\"Batch job created with ID: {batch_response.id}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:56 TP0] Decode batch. #running-req: 2, #token: 82, token usage: 0.00, gen throughput (token/s): 55.10, #queue-req: 0\n",
      "Batch job status: validating...trying again in 3 seconds...\n",
      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43330 - \"GET /v1/batches/batch_56fefd2e-0187-4c14-aa2d-110917723dde HTTP/1.1\" 200 OK\n",
      "Batch job completed successfully!\n",
      "Request counts: BatchRequestCounts(completed=2, failed=0, total=2)\n",
      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43330 - \"GET /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4/content HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Request request-1:</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-1', 'body': {'id': 'request-1', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': 'A programmer walks into a library and asks the librarian, \"Do you have any books on Pavlov\\'s dogs and Schrödinger\\'s cat?\"\\n\\nThe librarian replies, \"It rings a bell, but I\\'m not sure if it\\'s here'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 41, 'completion_tokens': 50, 'total_tokens': 91}, 'system_fingerprint': None}}</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Request request-2:</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Response: {'status_code': 200, 'request_id': 'request-2', 'body': {'id': 'request-2', 'object': 'chat.completion', 'created': 1730106176, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': {'index': 0, 'message': {'role': 'assistant', 'content': '**What is Python?**\\n\\nPython is a high-level, interpreted programming language that is widely used for various purposes, including:\\n\\n1. **Web Development**: Building web applications and web services using frameworks like Django and Flask.\\n2. **Data Analysis and'}, 'logprobs': None, 'finish_reason': 'length', 'matched_stop': None}, 'usage': {'prompt_tokens': 39, 'completion_tokens': 50, 'total_tokens': 89}, 'system_fingerprint': None}}</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Cleaning up files...</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43330 - \"DELETE /v1/files/backend_result_file-520da6c8-0cce-4d4c-a943-a86101f5f5b4 HTTP/1.1\" 200 OK\n"
     ]
    }
   ],
   "source": [
    "while batch_response.status not in [\"completed\", \"failed\", \"cancelled\"]:\n",
    "    time.sleep(3)\n",
    "    print(f\"Batch job status: {batch_response.status}...trying again in 3 seconds...\")\n",
    "    batch_response = client.batches.retrieve(batch_response.id)\n",
    "\n",
    "if batch_response.status == \"completed\":\n",
    "    print(\"Batch job completed successfully!\")\n",
    "    print(f\"Request counts: {batch_response.request_counts}\")\n",
    "\n",
    "    result_file_id = batch_response.output_file_id\n",
    "    file_response = client.files.content(result_file_id)\n",
    "    result_content = file_response.read().decode(\"utf-8\")\n",
    "\n",
    "    results = [\n",
    "        json.loads(line) for line in result_content.split(\"\\n\") if line.strip() != \"\"\n",
    "    ]\n",
    "\n",
    "    for result in results:\n",
    "        print_highlight(f\"Request {result['custom_id']}:\")\n",
    "        print_highlight(f\"Response: {result['response']}\")\n",
    "\n",
    "    print_highlight(\"Cleaning up files...\")\n",
    "    # Only delete the result file ID since file_response is just content\n",
    "    client.files.delete(result_file_id)\n",
    "else:\n",
    "    print_highlight(f\"Batch job failed with status: {batch_response.status}\")\n",
    "    if hasattr(batch_response, \"errors\"):\n",
    "        print_highlight(f\"Errors: {batch_response.errors}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It takes a while to complete the batch job. You can use these two APIs to retrieve the batch job status or cancel the batch job.\n",
    "\n",
    "1. `batches/{batch_id}`: Retrieve the batch job status.\n",
    "2. `batches/{batch_id}/cancel`: Cancel the batch job.\n",
    "\n",
    "Here is an example to check the batch job status."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43336 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
      "[2024-10-28 02:02:58] INFO:     127.0.0.1:43336 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Created batch job with ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Initial status: validating</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 17, #new-token: 510, #cached-token: 425, cache hit rate: 43.40%, token usage: 0.00, #running-req: 0, #queue-req: 0\n",
      "[2024-10-28 02:02:58 TP0] Prefill batch. #new-seq: 83, #new-token: 2490, #cached-token: 2075, cache hit rate: 45.04%, token usage: 0.00, #running-req: 17, #queue-req: 0\n",
      "[2024-10-28 02:02:59 TP0] Decode batch. #running-req: 100, #token: 3725, token usage: 0.02, gen throughput (token/s): 234.43, #queue-req: 0\n",
      "[2024-10-28 02:03:00 TP0] Decode batch. #running-req: 100, #token: 7725, token usage: 0.04, gen throughput (token/s): 3545.41, #queue-req: 0\n",
      "[2024-10-28 02:03:01 TP0] Decode batch. #running-req: 100, #token: 11725, token usage: 0.05, gen throughput (token/s): 3448.10, #queue-req: 0\n",
      "[2024-10-28 02:03:02 TP0] Decode batch. #running-req: 100, #token: 15725, token usage: 0.07, gen throughput (token/s): 3362.62, #queue-req: 0\n",
      "[2024-10-28 02:03:04 TP0] Decode batch. #running-req: 100, #token: 19725, token usage: 0.09, gen throughput (token/s): 3279.58, #queue-req: 0\n",
      "[2024-10-28 02:03:05 TP0] Decode batch. #running-req: 100, #token: 23725, token usage: 0.11, gen throughput (token/s): 3200.86, #queue-req: 0\n",
      "[2024-10-28 02:03:06 TP0] Decode batch. #running-req: 100, #token: 27725, token usage: 0.13, gen throughput (token/s): 3126.52, #queue-req: 0\n",
      "[2024-10-28 02:03:07 TP0] Decode batch. #running-req: 100, #token: 31725, token usage: 0.15, gen throughput (token/s): 3053.16, #queue-req: 0\n",
      "[2024-10-28 02:03:08] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Batch job details (check 1 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:09 TP0] Decode batch. #running-req: 100, #token: 35725, token usage: 0.16, gen throughput (token/s): 2980.26, #queue-req: 0\n",
      "[2024-10-28 02:03:10 TP0] Decode batch. #running-req: 100, #token: 39725, token usage: 0.18, gen throughput (token/s): 2919.09, #queue-req: 0\n",
      "[2024-10-28 02:03:11] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Batch job details (check 2 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:11 TP0] Decode batch. #running-req: 100, #token: 43725, token usage: 0.20, gen throughput (token/s): 2854.92, #queue-req: 0\n",
      "[2024-10-28 02:03:13 TP0] Decode batch. #running-req: 100, #token: 47725, token usage: 0.22, gen throughput (token/s): 2794.62, #queue-req: 0\n",
      "[2024-10-28 02:03:14] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Batch job details (check 3 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: in_progress // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: None</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><strong>Request counts: Total: 0 // Completed: 0 // Failed: 0</strong></strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:14 TP0] Decode batch. #running-req: 100, #token: 51725, token usage: 0.24, gen throughput (token/s): 2737.84, #queue-req: 0\n",
      "[2024-10-28 02:03:17] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Batch job details (check 4 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:20] INFO:     127.0.0.1:41320 - \"GET /v1/batches/batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Batch job details (check 5 / 5) // ID: batch_67da0e16-e7b2-4a75-9f7a-58c033e739e5 // Status: completed // Created at: 1730106178 // Input file ID: backend_input_file-92cf2cc1-afbd-428f-8c5c-85fabd86cb63 // Output file ID: backend_result_file-c10ee9f5-eca8-4357-a922-934543b7f433</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'><strong>Request counts: Total: 100 // Completed: 100 // Failed: 0</strong></strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import json\n",
    "import time\n",
    "from openai import OpenAI\n",
    "\n",
    "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "requests = []\n",
    "for i in range(100):\n",
    "    requests.append(\n",
    "        {\n",
    "            \"custom_id\": f\"request-{i}\",\n",
    "            \"method\": \"POST\",\n",
    "            \"url\": \"/chat/completions\",\n",
    "            \"body\": {\n",
    "                \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "                \"messages\": [\n",
    "                    {\n",
    "                        \"role\": \"system\",\n",
    "                        \"content\": f\"{i}: You are a helpful AI assistant\",\n",
    "                    },\n",
    "                    {\n",
    "                        \"role\": \"user\",\n",
    "                        \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
    "                    },\n",
    "                ],\n",
    "                \"max_tokens\": 500,\n",
    "            },\n",
    "        }\n",
    "    )\n",
    "\n",
    "input_file_path = \"batch_requests.jsonl\"\n",
    "with open(input_file_path, \"w\") as f:\n",
    "    for req in requests:\n",
    "        f.write(json.dumps(req) + \"\\n\")\n",
    "\n",
    "with open(input_file_path, \"rb\") as f:\n",
    "    uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
    "\n",
    "batch_job = client.batches.create(\n",
    "    input_file_id=uploaded_file.id,\n",
    "    endpoint=\"/v1/chat/completions\",\n",
    "    completion_window=\"24h\",\n",
    ")\n",
    "\n",
    "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
    "print_highlight(f\"Initial status: {batch_job.status}\")\n",
    "\n",
    "time.sleep(10)\n",
    "\n",
    "max_checks = 5\n",
    "for i in range(max_checks):\n",
    "    batch_details = client.batches.retrieve(batch_id=batch_job.id)\n",
    "\n",
    "    print_highlight(\n",
    "        f\"Batch job details (check {i+1} / {max_checks}) // ID: {batch_details.id} // Status: {batch_details.status} // Created at: {batch_details.created_at} // Input file ID: {batch_details.input_file_id} // Output file ID: {batch_details.output_file_id}\"\n",
    "    )\n",
    "    print_highlight(\n",
    "        f\"<strong>Request counts: Total: {batch_details.request_counts.total} // Completed: {batch_details.request_counts.completed} // Failed: {batch_details.request_counts.failed}</strong>\"\n",
    "    )\n",
    "\n",
    "    time.sleep(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here is an example to cancel a batch job."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:23] INFO:     127.0.0.1:47360 - \"POST /v1/files HTTP/1.1\" 200 OK\n",
      "[2024-10-28 02:03:23] INFO:     127.0.0.1:47360 - \"POST /v1/batches HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Created batch job with ID: batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Initial status: validating</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 44, #new-token: 44, #cached-token: 2376, cache hit rate: 60.81%, token usage: 0.01, #running-req: 0, #queue-req: 0\n",
      "[2024-10-28 02:03:23 TP0] Prefill batch. #new-seq: 328, #new-token: 8192, #cached-token: 9824, cache hit rate: 56.49%, token usage: 0.01, #running-req: 44, #queue-req: 128\n",
      "[2024-10-28 02:03:24 TP0] Prefill batch. #new-seq: 129, #new-token: 3864, #cached-token: 3231, cache hit rate: 54.15%, token usage: 0.05, #running-req: 371, #queue-req: 1\n",
      "[2024-10-28 02:03:27 TP0] Decode batch. #running-req: 500, #token: 29025, token usage: 0.13, gen throughput (token/s): 1162.55, #queue-req: 0\n",
      "[2024-10-28 02:03:31 TP0] Decode batch. #running-req: 500, #token: 49025, token usage: 0.23, gen throughput (token/s): 5606.35, #queue-req: 0\n",
      "[2024-10-28 02:03:33] INFO:     127.0.0.1:40110 - \"POST /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62/cancel HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Cancellation initiated. Status: cancelling</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:36] INFO:     127.0.0.1:40110 - \"GET /v1/batches/batch_8a409f86-b8c7-4e29-9cc7-187d6d28df62 HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Current status: cancelled</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Batch job successfully cancelled</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:36] INFO:     127.0.0.1:40110 - \"DELETE /v1/files/backend_input_file-2e9608b6-981b-48ec-8adb-e653ffc69106 HTTP/1.1\" 200 OK\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<strong style='color: #00008B;'>Successfully cleaned up input file</strong>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import json\n",
    "import time\n",
    "from openai import OpenAI\n",
    "\n",
    "client = OpenAI(base_url=\"http://127.0.0.1:30000/v1\", api_key=\"None\")\n",
    "\n",
    "requests = []\n",
    "for i in range(500):\n",
    "    requests.append(\n",
    "        {\n",
    "            \"custom_id\": f\"request-{i}\",\n",
    "            \"method\": \"POST\",\n",
    "            \"url\": \"/chat/completions\",\n",
    "            \"body\": {\n",
    "                \"model\": \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
    "                \"messages\": [\n",
    "                    {\n",
    "                        \"role\": \"system\",\n",
    "                        \"content\": f\"{i}: You are a helpful AI assistant\",\n",
    "                    },\n",
    "                    {\n",
    "                        \"role\": \"user\",\n",
    "                        \"content\": \"Write a detailed story about topic. Make it very long.\",\n",
    "                    },\n",
    "                ],\n",
    "                \"max_tokens\": 500,\n",
    "            },\n",
    "        }\n",
    "    )\n",
    "\n",
    "input_file_path = \"batch_requests.jsonl\"\n",
    "with open(input_file_path, \"w\") as f:\n",
    "    for req in requests:\n",
    "        f.write(json.dumps(req) + \"\\n\")\n",
    "\n",
    "with open(input_file_path, \"rb\") as f:\n",
    "    uploaded_file = client.files.create(file=f, purpose=\"batch\")\n",
    "\n",
    "batch_job = client.batches.create(\n",
    "    input_file_id=uploaded_file.id,\n",
    "    endpoint=\"/v1/chat/completions\",\n",
    "    completion_window=\"24h\",\n",
    ")\n",
    "\n",
    "print_highlight(f\"Created batch job with ID: {batch_job.id}\")\n",
    "print_highlight(f\"Initial status: {batch_job.status}\")\n",
    "\n",
    "time.sleep(10)\n",
    "\n",
    "try:\n",
    "    cancelled_job = client.batches.cancel(batch_id=batch_job.id)\n",
    "    print_highlight(f\"Cancellation initiated. Status: {cancelled_job.status}\")\n",
    "    assert cancelled_job.status == \"cancelling\"\n",
    "\n",
    "    # Monitor the cancellation process\n",
    "    while cancelled_job.status not in [\"failed\", \"cancelled\"]:\n",
    "        time.sleep(3)\n",
    "        cancelled_job = client.batches.retrieve(batch_job.id)\n",
    "        print_highlight(f\"Current status: {cancelled_job.status}\")\n",
    "\n",
    "    # Verify final status\n",
    "    assert cancelled_job.status == \"cancelled\"\n",
    "    print_highlight(\"Batch job successfully cancelled\")\n",
    "\n",
    "except Exception as e:\n",
    "    print_highlight(f\"Error during cancellation: {e}\")\n",
    "    raise e\n",
    "\n",
    "finally:\n",
    "    try:\n",
    "        del_response = client.files.delete(uploaded_file.id)\n",
    "        if del_response.deleted:\n",
    "            print_highlight(\"Successfully cleaned up input file\")\n",
    "    except Exception as e:\n",
    "        print_highlight(f\"Error cleaning up: {e}\")\n",
    "        raise e"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-10-28 02:03:36] INFO:     Shutting down\n",
      "[2024-10-28 02:03:36] INFO:     Waiting for application shutdown.\n",
      "[2024-10-28 02:03:36] INFO:     Application shutdown complete.\n",
      "[2024-10-28 02:03:36] INFO:     Finished server process [1185529]\n",
      "W1028 02:03:37.084000 140231994889792 torch/_inductor/compile_worker/subproc_pool.py:126] SubprocPool unclean exit\n"
     ]
    }
   ],
   "source": [
    "terminate_process(server_process)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "AlphaMeemory",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}