Unverified commit b548801d, authored by Lianmin Zheng and committed by GitHub
Browse files

Update docs (#1839)

parent 539df95d
#!/usr/bin/python3
# Deploy the documents
import os
from datetime import datetime
......
This diff is collapsed.
This diff is collapsed.
......@@ -29,5 +29,5 @@ if __name__ == "__main__":
parser.add_argument("--url", type=str, default="http://localhost:30000")
args = parser.parse_args()
response = requests.get(args.url + "/flush_cache")
response = requests.post(args.url + "/flush_cache")
assert response.status_code == 200
......@@ -124,7 +124,7 @@ class ModelRunner:
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
)
server_args.chunked_prefill_size = None
server_args.mem_fraction_static *= 0.95
self.mem_fraction_static *= 0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if self.model_config.hf_config.architectures == [
"Qwen2VLForConditionalGeneration"
......
......@@ -139,7 +139,7 @@ async def get_server_args():
return dataclasses.asdict(tokenizer_manager.server_args)
@app.get("/flush_cache")
@app.post("/flush_cache")
async def flush_cache():
"""Flush the radix cache."""
tokenizer_manager.flush_cache()
......@@ -180,7 +180,7 @@ async def get_memory_pool_size():
return ret
except Exception as e:
return JSONResponse(
return ORJSONResponse(
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
)
......
......@@ -19,7 +19,6 @@ from typing import Optional, Union
import numpy as np
import requests
import torch
from IPython.display import HTML, display
from tqdm import tqdm
......@@ -332,14 +331,13 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
headers={"Authorization": "Bearer None"},
)
if response.status_code == 200:
time.sleep(5)
print_highlight(
"""\n
NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
"""
Server and notebook outputs are combined for clarity.
Typically, the server runs in a separate terminal.
Server output is gray; notebook output is highlighted.
"""
)
break
......@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
def terminate_process(process):
"""Safely terminate a process and clean up GPU memory.
Args:
process: subprocess.Popen object to terminate
"""
try:
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
if os.name != "nt":
try:
pgid = os.getpgid(process.pid)
os.killpg(pgid, signal.SIGTERM)
time.sleep(1)
if process.poll() is None:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
pass
else:
process.kill()
process.wait()
except Exception as e:
print(f"Warning: {e}")
finally:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
time.sleep(2)
from sglang.srt.utils import kill_child_process
kill_child_process(process.pid, include_self=True)
def print_highlight(html_content: str):
......
"""
Install the dependency in CI.
"""
pip install --upgrade pip
pip install -e "python[all]"
pip install transformers==4.45.2
......
"""
Kill all SGLang processes and free the GPU memory.
"""
kill -9 $(ps aux | grep 'multiprocessing.spawn' | grep -v 'grep' | awk '{print $2}')
kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}')
#!/bin/bash
# This script is used for release.
# It tags all remote branches starting with 'v' with the same name as the branch,
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
git fetch origin --prune
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment