"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "bdd16116f319fdc4b299f248519c17a5aeb9dd51"
Unverified Commit ced362f7 authored by Chayenne's avatar Chayenne Committed by GitHub
Browse files

Simplify our docs with complicated functions into utils (#1807)


Co-authored-by: default avatarChayenne <zhaochenyang@ucla.edu>
parent 9084a864
name: Build Documentation name: Release Documentation
on: on:
push:
branches:
- main
paths:
- 'docs/**'
- 'python/sglang/version.py'
workflow_dispatch: workflow_dispatch:
jobs: jobs:
execute-notebooks: execute-notebooks:
runs-on: 1-gpu-runner runs-on: 1-gpu-runner
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' if: github.repository == 'sgl-project/sglang'
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v3 uses: actions/checkout@v3
...@@ -38,8 +44,9 @@ jobs: ...@@ -38,8 +44,9 @@ jobs:
done done
build-and-deploy: build-and-deploy:
needs: execute-notebooks
if: github.repository == 'sgl-project/sglang' if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner runs-on: ubuntu-latest
steps: steps:
- name: Checkout code - name: Checkout code
uses: actions/checkout@v3 uses: actions/checkout@v3
...@@ -75,4 +82,4 @@ jobs: ...@@ -75,4 +82,4 @@ jobs:
git commit -m "Update $(date +'%Y-%m-%d %H:%M:%S')" git commit -m "Update $(date +'%Y-%m-%d %H:%M:%S')"
git push https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git main git push https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git main
cd .. cd ..
rm -rf sgl-project.github.io rm -rf sgl-project.github.io
\ No newline at end of file
# CI workflow: execute every docs notebook on a GPU runner so the published
# documentation is known to run end-to-end.
name: Execute Notebooks

on:
  pull_request:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  run-all-notebooks:
    runs-on: 1-gpu-runner
    # Run for in-repo pushes and for any pull request (including forks).
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'

      - name: Install dependencies
        run: |
          bash scripts/ci_install_dependency.sh
          pip install -r docs/requirements.txt

      - name: Setup Jupyter Kernel
        run: |
          python -m ipykernel install --user --name python3 --display-name "Python 3"

      - name: Execute notebooks
        run: |
          cd docs
          for nb in *.ipynb; do
            if [ -f "$nb" ]; then
              echo "Executing $nb"
              jupyter nbconvert --to notebook --execute --inplace "$nb" \
                --ExecutePreprocessor.timeout=600 \
                --ExecutePreprocessor.kernel_name=python3
            fi
          done
...@@ -28,42 +28,16 @@ ...@@ -28,42 +28,16 @@
} }
], ],
"source": [ "source": [
"import subprocess\n",
"import time\n",
"import requests\n",
"\n",
"# Equivalent to running this in the shell:\n", "# Equivalent to running this in the shell:\n",
"# python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n", "# python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
"embedding_process = subprocess.Popen(\n", "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
" [\n", "\n",
" \"python\",\n", "embedding_process = execute_shell_command(\"\"\"\n",
" \"-m\",\n", "python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct \\\n",
" \"sglang.launch_server\",\n", " --port 30010 --host 0.0.0.0 --is-embedding --log-level error\n",
" \"--model-path\",\n", "\"\"\")\n",
" \"Alibaba-NLP/gte-Qwen2-7B-instruct\",\n",
" \"--port\",\n",
" \"30010\",\n",
" \"--host\",\n",
" \"0.0.0.0\",\n",
" \"--is-embedding\",\n",
" \"--log-level\",\n",
" \"error\",\n",
" ],\n",
" text=True,\n",
" stdout=subprocess.DEVNULL,\n",
" stderr=subprocess.DEVNULL,\n",
")\n",
"\n", "\n",
"while True:\n", "wait_for_server(\"http://localhost:30010\")\n",
" try:\n",
" response = requests.get(\n",
" \"http://localhost:30010/v1/models\",\n",
" headers={\"Authorization\": \"Bearer None\"},\n",
" )\n",
" if response.status_code == 200:\n",
" break\n",
" except requests.exceptions.RequestException:\n",
" time.sleep(1)\n",
"\n", "\n",
"print(\"Embedding server is ready. Proceeding with the next steps.\")" "print(\"Embedding server is ready. Proceeding with the next steps.\")"
] ]
...@@ -134,6 +108,15 @@ ...@@ -134,6 +108,15 @@
"embedding = response.data[0].embedding[:10]\n", "embedding = response.data[0].embedding[:10]\n",
"print(embedding)" "print(embedding)"
] ]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"terminate_process(embedding_process)"
]
} }
], ],
"metadata": { "metadata": {
......
...@@ -36,41 +36,15 @@ ...@@ -36,41 +36,15 @@
} }
], ],
"source": [ "source": [
"import subprocess\n", "from sglang.utils import execute_shell_command, wait_for_server, terminate_process\n",
"import time\n",
"import requests\n",
"import os\n",
"\n", "\n",
"server_process = subprocess.Popen(\n",
" [\n",
" \"python\",\n",
" \"-m\",\n",
" \"sglang.launch_server\",\n",
" \"--model-path\",\n",
" \"meta-llama/Meta-Llama-3.1-8B-Instruct\",\n",
" \"--port\",\n",
" \"30000\",\n",
" \"--host\",\n",
" \"0.0.0.0\",\n",
" \"--log-level\",\n",
" \"error\",\n",
" ],\n",
" text=True,\n",
" stdout=subprocess.DEVNULL,\n",
" stderr=subprocess.DEVNULL,\n",
")\n",
"\n", "\n",
"while True:\n", "server_process = execute_shell_command(\"\"\"\n",
" try:\n", "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \\\n",
" response = requests.get(\n", "--port 30000 --host 0.0.0.0 --log-level warning\n",
" \"http://localhost:30000/v1/models\",\n", "\"\"\")\n",
" headers={\"Authorization\": \"Bearer None\"},\n",
" )\n",
" if response.status_code == 200:\n",
" break\n",
" except requests.exceptions.RequestException:\n",
" time.sleep(1)\n",
"\n", "\n",
"wait_for_server(\"http://localhost:30000\")\n",
"print(\"Server is ready. Proceeding with the next steps.\")" "print(\"Server is ready. Proceeding with the next steps.\")"
] ]
}, },
...@@ -92,7 +66,7 @@ ...@@ -92,7 +66,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"{\"id\":\"1449c9c20d4448299431a57facc68d7a\",\"object\":\"chat.completion\",\"created\":1729816891,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. LLMs are trained on vast amounts of text data, which enables them to learn patterns, relationships, and nuances of language.\\n\\nLarge Language Models are typically trained using a technique called deep learning, where multiple layers of artificial neural networks are used to analyze and understand the input data. This training process involves feeding the model massive amounts of text data, which it uses to learn and improve its language understanding and generation capabilities.\\n\\nSome key characteristics of LLMs include:\\n\\n1. **Language understanding**: LLMs can comprehend natural language, including its syntax, semantics, and context.\\n2. **Language generation**: LLMs can generate text, including responses to user input, articles, stories, and more.\\n3. **Contextual understanding**: LLMs can understand the context in which language is being used, including the topic, tone, and intent.\\n4. **Self-supervised learning**: LLMs can learn from large datasets without explicit supervision or labeling.\\n\\nLLMs have a wide range of applications, including:\\n\\n1. **Virtual assistants**: LLMs power virtual assistants like Siri, Alexa, and Google Assistant.\\n2. **Language translation**: LLMs can translate text from one language to another.\\n3. **Text summarization**: LLMs can summarize long pieces of text into shorter, more digestible versions.\\n4. **Content generation**: LLMs can generate content, such as news articles, product descriptions, and social media posts.\\n5. 
**Chatbots**: LLMs can power chatbots that can have human-like conversations with users.\\n\\nThe Large Language Model I am, is a type of LLM that has been trained on a massive dataset of text and can answer a wide range of questions and engage in conversation.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":426,\"completion_tokens\":379,\"prompt_tokens_details\":null}}" "{\"id\":\"449710eb827c49c99b82ce187e912c2a\",\"object\":\"chat.completion\",\"created\":1729962606,\"model\":\"meta-llama/Meta-Llama-3.1-8B-Instruct\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"LLM stands for Large Language Model. It's a type of artificial intelligence (AI) designed to process and generate human-like language. These models are trained on vast amounts of text data, allowing them to learn patterns, relationships, and context within language.\\n\\nLarge language models use various techniques, such as deep learning and natural language processing, to analyze and understand the input text. They can then use this understanding to generate coherent and context-specific text, such as:\\n\\n1. Responses to questions or prompts\\n2. Summaries of long pieces of text\\n3. Creative writing, like stories or poetry\\n4. Translation of text from one language to another\\n\\nSome popular examples of LLMs include:\\n\\n1. Chatbots: Virtual assistants that can understand and respond to user input\\n2. Virtual assistants: Like Siri, Alexa, or Google Assistant\\n3. Language translation tools: Such as Google Translate\\n4. Writing assistants: Like Grammarly or Language Tool\\n\\nThe key characteristics of LLMs include:\\n\\n1. **Scalability**: They can process large amounts of text data\\n2. **Flexibility**: They can be fine-tuned for specific tasks or domains\\n3. **Contextual understanding**: They can recognize context and nuances in language\\n4. 
**Creativity**: They can generate original text or responses\\n\\nHowever, LLMs also have limitations and potential drawbacks:\\n\\n1. **Bias**: They can perpetuate existing biases in the training data\\n2. **Misinformation**: They can spread misinformation or false information\\n3. **Dependence on data quality**: The quality of the training data directly affects the model's performance\\n\\nOverall, LLMs are powerful tools that can be used in various applications, from language translation and writing assistance to chatbots and virtual assistants.\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"matched_stop\":128009}],\"usage\":{\"prompt_tokens\":47,\"total_tokens\":408,\"completion_tokens\":361,\"prompt_tokens_details\":null}}"
] ]
} }
], ],
...@@ -121,7 +95,7 @@ ...@@ -121,7 +95,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"ChatCompletion(id='16757c3dd6e14a6e9bafd1122f84e4c5', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1729816893, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n" "ChatCompletion(id='6bbf20fed17940739eb5cd5d685fa29a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\\n\\n1. **Country:** Japan\\n**Capital:** Tokyo\\n\\n2. **Country:** Australia\\n**Capital:** Canberra\\n\\n3. **Country:** Brazil\\n**Capital:** Brasília', refusal=None, role='assistant', function_call=None, tool_calls=None), matched_stop=128009)], created=1729962608, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=46, prompt_tokens=49, total_tokens=95, prompt_tokens_details=None))\n"
] ]
} }
], ],
...@@ -155,38 +129,7 @@ ...@@ -155,38 +129,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import signal\n", "terminate_process(server_process)"
"import gc\n",
"import torch\n",
"\n",
"def terminate_process(process):\n",
" try:\n",
" process.terminate()\n",
" try:\n",
" process.wait(timeout=5)\n",
" except subprocess.TimeoutExpired:\n",
" if os.name != 'nt':\n",
" try:\n",
" pgid = os.getpgid(process.pid)\n",
" os.killpg(pgid, signal.SIGTERM)\n",
" time.sleep(1)\n",
" if process.poll() is None:\n",
" os.killpg(pgid, signal.SIGKILL)\n",
" except ProcessLookupError:\n",
" pass\n",
" else:\n",
" process.kill()\n",
" process.wait()\n",
" except Exception as e:\n",
" print(f\"Warning: {e}\")\n",
" finally:\n",
" gc.collect()\n",
" if torch.cuda.is_available():\n",
" torch.cuda.empty_cache()\n",
" torch.cuda.ipc_collect()\n",
"\n",
"terminate_process(server_process)\n",
"time.sleep(2)"
] ]
} }
], ],
......
"""Common utilities.""" """Common utilities."""
import base64 import base64
import gc
import importlib import importlib
import json import json
import logging import logging
import os import os
import signal import signal
import subprocess
import sys import sys
import time
import traceback import traceback
import urllib.request import urllib.request
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
...@@ -16,6 +19,7 @@ from typing import Optional, Union ...@@ -16,6 +19,7 @@ from typing import Optional, Union
import numpy as np import numpy as np
import requests import requests
import torch
from tqdm import tqdm from tqdm import tqdm
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -294,3 +298,80 @@ def download_and_cache_file(url: str, filename: Optional[str] = None): ...@@ -294,3 +298,80 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
bar.update(len(chunk)) bar.update(len(chunk))
return filename return filename
def execute_shell_command(command: str) -> subprocess.Popen:
    """Launch a (possibly multi-line) shell command in the background.

    The command may contain backslash-newline line continuations, as is
    common for long ``sglang.launch_server`` invocations written in
    notebooks. Output is discarded so the child never blocks on a full pipe.

    Args:
        command: Shell command as a single string; ``\\``-newline line
            continuations are joined before tokenization.

    Returns:
        subprocess.Popen: Handle of the started process.
    """
    # Local import keeps this helper self-contained.
    import shlex

    # Join backslash-newline continuations into one logical command line.
    command = command.replace("\\\n", " ")
    # shlex honors shell quoting (e.g. --arg "a b"), which a naive
    # str.split() would incorrectly break into separate argv entries.
    args = shlex.split(command)
    # NOTE(review): the child shares this process's group; the killpg-based
    # teardown in terminate_process assumes a dedicated group — consider
    # start_new_session=True. TODO confirm before changing.
    return subprocess.Popen(
        args,
        text=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
def wait_for_server(base_url: str, timeout: float = None) -> None:
    """Block until the server behind *base_url* answers ``/v1/models`` with 200.

    Polls ``{base_url}/v1/models`` roughly once per second. Connection
    failures and non-200 responses are both treated as "not ready yet".

    Args:
        base_url: Root URL of the server, e.g. ``http://localhost:30000``.
        timeout: Maximum seconds to wait; ``None`` waits forever.

    Raises:
        TimeoutError: If the server is not ready within ``timeout`` seconds.
    """
    start_time = time.time()
    while True:
        try:
            req = urllib.request.Request(
                f"{base_url}/v1/models",
                # The server ignores the token value but expects the header.
                headers={"Authorization": "Bearer None"},
            )
            with urllib.request.urlopen(req, timeout=5) as response:
                if response.status == 200:
                    return
        except OSError:
            # URLError/HTTPError subclass OSError: covers refused
            # connections, DNS failures, socket timeouts, and non-2xx codes.
            pass
        # Check the deadline on EVERY iteration. The previous version only
        # checked after a successful HTTP exchange, so the timeout was
        # silently ignored while the connection was still being refused.
        if timeout is not None and time.time() - start_time > timeout:
            raise TimeoutError("Server did not become ready within timeout period")
        time.sleep(1)
def terminate_process(process):
    """Safely terminate a process and clean up GPU memory.

    Escalation order: terminate() -> wait up to 5s -> (POSIX) SIGTERM the
    process group, grace period, SIGKILL the group / (Windows) kill().
    All failures are reported but never raised; GPU caches are cleared in
    ``finally`` regardless of outcome.

    Args:
        process: subprocess.Popen object to terminate
    """
    try:
        # Polite first attempt: SIGTERM on POSIX, TerminateProcess on Windows.
        process.terminate()
        try:
            process.wait(timeout=5)
        except subprocess.TimeoutExpired:
            if os.name != "nt":
                try:
                    # Signal the whole process group so any workers the
                    # server spawned are stopped as well.
                    # NOTE(review): assumes the child leads its own process
                    # group; if it shares this interpreter's group, killpg
                    # signals us too — confirm the launcher uses
                    # start_new_session/setsid. TODO confirm.
                    pgid = os.getpgid(process.pid)
                    os.killpg(pgid, signal.SIGTERM)
                    time.sleep(1)  # grace period before escalating
                    if process.poll() is None:
                        os.killpg(pgid, signal.SIGKILL)
                except ProcessLookupError:
                    # Group already gone — nothing left to kill.
                    pass
            else:
                process.kill()
                process.wait()
    except Exception as e:
        # Best-effort teardown: report, never raise from cleanup.
        print(f"Warning: {e}")
    finally:
        # Drop Python-side references and return cached GPU memory to the
        # driver so the next server launch starts from a clean slate.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        time.sleep(2)  # presumably lets CUDA/IPC teardown settle — TODO confirm
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment