Update docs (#1839)

b548801d · Lianmin Zheng · GitHub · 539df95d · b548801d · 539df95d
Unverified Commit b548801d authored Oct 30, 2024 by Lianmin Zheng Committed by GitHub Oct 30, 2024
11 changed files
--- a/docs/deploy.py
+++ b/docs/deploy.py
-#!/usr/bin/python3
+# Deploy the documents
 import os
 from datetime import datetime

--- a/docs/deploy_docs.sh
+++ b/docs/deploy_docs.sh
--- a/docs/openai_api.ipynb
+++ b/docs/openai_api.ipynb
--- a/docs/send_request.ipynb
+++ b/docs/send_request.ipynb
--- a/python/sglang/srt/mem_cache/flush_cache.py
+++ b/python/sglang/srt/mem_cache/flush_cache.py
@@ -29,5 +29,5 @@ if __name__ == "__main__":
    parser.add_argument("--url", type=str, default="http://localhost:30000")
    args = parser.parse_args()
-    response = requests.get(args.url + "/flush_cache")
+    response = requests.post(args.url + "/flush_cache")
    assert response.status_code == 200
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -124,7 +124,7 @@ class ModelRunner:
                "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
            )
            server_args.chunked_prefill_size = None
-            server_args.mem_fraction_static *= 0.95
+            self.mem_fraction_static *= 0.95
            # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
            if self.model_config.hf_config.architectures == [
                "Qwen2VLForConditionalGeneration"

--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -139,7 +139,7 @@ async def get_server_args():
    return dataclasses.asdict(tokenizer_manager.server_args)
-@app.get("/flush_cache")
+@app.post("/flush_cache")
 async def flush_cache():
    """Flush the radix cache."""
    tokenizer_manager.flush_cache()
@@ -180,7 +180,7 @@ async def get_memory_pool_size():
        return ret
    except Exception as e:
-        return JSONResponse(
+        return ORJSONResponse(
            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
        )

--- a/python/sglang/utils.py
+++ b/python/sglang/utils.py
@@ -19,7 +19,6 @@ from typing import Optional, Union
 import numpy as np
 import requests
-import torch
 from IPython.display import HTML, display
 from tqdm import tqdm
@@ -332,13 +331,12 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
                headers={"Authorization": "Bearer None"},
            )
            if response.status_code == 200:
+                time.sleep(5)
                print_highlight(
-                    """
+                    """\n
-                            Server and notebook outputs are combined for clarity.
+                    NOTE: Typically, the server runs in a separate terminal.
+                    In this notebook, we run the server and notebook code together, so their outputs are combined.
-                            Typically, the server runs in a separate terminal.
+                    To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
-                            Server output is gray; notebook output is highlighted.
                    """
                )
                break
@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
 def terminate_process(process):
-    """Safely terminate a process and clean up GPU memory.
+    from sglang.srt.utils import kill_child_process
+    kill_child_process(process.pid, include_self=True)
-    Args:
-        process: subprocess.Popen object to terminate
-    """
-    try:
-        process.terminate()
-        try:
-            process.wait(timeout=5)
-        except subprocess.TimeoutExpired:
-            if os.name != "nt":
-                try:
-                    pgid = os.getpgid(process.pid)
-                    os.killpg(pgid, signal.SIGTERM)
-                    time.sleep(1)
-                    if process.poll() is None:
-                        os.killpg(pgid, signal.SIGKILL)
-                except ProcessLookupError:
-                    pass
-            else:
-                process.kill()
-            process.wait()
-    except Exception as e:
-        print(f"Warning: {e}")
-    finally:
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
-        time.sleep(2)
 def print_highlight(html_content: str):

--- a/scripts/ci_install_dependency.sh
+++ b/scripts/ci_install_dependency.sh
+# Install the dependency in CI.
 pip install --upgrade pip
 pip install -e "python[all]"
 pip install transformers==4.45.2

--- a/scripts/killall_sglang.sh
+++ b/scripts/killall_sglang.sh
+# Kill all SGLang processes and free the GPU memory.
 kill -9 $(ps aux | grep 'multiprocessing.spawn' | grep -v 'grep' | awk '{print $2}')
 kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}')
--- a/scripts/version_branch_to_tag.sh
+++ b/scripts/version_branch_to_tag.sh
 #!/bin/bash
-# This script tags all remote branches starting with 'v' with the same name as the branch, 
+# This script is used for release.
+# It tags all remote branches starting with 'v' with the same name as the branch, 
 # deletes the corresponding branches from the remote, and pushes the tags to the remote repository. 
 git fetch origin --prune