Unverified commit b548801d, authored by Lianmin Zheng, committed by GitHub
Browse files

Update docs (#1839)

parent 539df95d
#!/usr/bin/python3 # Deploy the documents
import os import os
from datetime import datetime from datetime import datetime
......
This diff is collapsed.
This diff is collapsed.
...@@ -29,5 +29,5 @@ if __name__ == "__main__": ...@@ -29,5 +29,5 @@ if __name__ == "__main__":
parser.add_argument("--url", type=str, default="http://localhost:30000") parser.add_argument("--url", type=str, default="http://localhost:30000")
args = parser.parse_args() args = parser.parse_args()
response = requests.get(args.url + "/flush_cache") response = requests.post(args.url + "/flush_cache")
assert response.status_code == 200 assert response.status_code == 200
...@@ -124,7 +124,7 @@ class ModelRunner: ...@@ -124,7 +124,7 @@ class ModelRunner:
"Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models." "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
) )
server_args.chunked_prefill_size = None server_args.chunked_prefill_size = None
server_args.mem_fraction_static *= 0.95 self.mem_fraction_static *= 0.95
# TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
if self.model_config.hf_config.architectures == [ if self.model_config.hf_config.architectures == [
"Qwen2VLForConditionalGeneration" "Qwen2VLForConditionalGeneration"
......
...@@ -139,7 +139,7 @@ async def get_server_args(): ...@@ -139,7 +139,7 @@ async def get_server_args():
return dataclasses.asdict(tokenizer_manager.server_args) return dataclasses.asdict(tokenizer_manager.server_args)
@app.get("/flush_cache") @app.post("/flush_cache")
async def flush_cache(): async def flush_cache():
"""Flush the radix cache.""" """Flush the radix cache."""
tokenizer_manager.flush_cache() tokenizer_manager.flush_cache()
...@@ -180,7 +180,7 @@ async def get_memory_pool_size(): ...@@ -180,7 +180,7 @@ async def get_memory_pool_size():
return ret return ret
except Exception as e: except Exception as e:
return JSONResponse( return ORJSONResponse(
{"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
) )
......
...@@ -19,7 +19,6 @@ from typing import Optional, Union ...@@ -19,7 +19,6 @@ from typing import Optional, Union
import numpy as np import numpy as np
import requests import requests
import torch
from IPython.display import HTML, display from IPython.display import HTML, display
from tqdm import tqdm from tqdm import tqdm
...@@ -332,13 +331,12 @@ def wait_for_server(base_url: str, timeout: int = None) -> None: ...@@ -332,13 +331,12 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
headers={"Authorization": "Bearer None"}, headers={"Authorization": "Bearer None"},
) )
if response.status_code == 200: if response.status_code == 200:
time.sleep(5)
print_highlight( print_highlight(
""" """\n
Server and notebook outputs are combined for clarity. NOTE: Typically, the server runs in a separate terminal.
In this notebook, we run the server and notebook code together, so their outputs are combined.
Typically, the server runs in a separate terminal. To improve clarity, the server logs are displayed in the original black color, while the notebook outputs are highlighted in blue.
Server output is gray; notebook output is highlighted.
""" """
) )
break break
...@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None: ...@@ -350,36 +348,8 @@ def wait_for_server(base_url: str, timeout: int = None) -> None:
def terminate_process(process): def terminate_process(process):
"""Safely terminate a process and clean up GPU memory. from sglang.srt.utils import kill_child_process
kill_child_process(process.pid, include_self=True)
Args:
process: subprocess.Popen object to terminate
"""
try:
process.terminate()
try:
process.wait(timeout=5)
except subprocess.TimeoutExpired:
if os.name != "nt":
try:
pgid = os.getpgid(process.pid)
os.killpg(pgid, signal.SIGTERM)
time.sleep(1)
if process.poll() is None:
os.killpg(pgid, signal.SIGKILL)
except ProcessLookupError:
pass
else:
process.kill()
process.wait()
except Exception as e:
print(f"Warning: {e}")
finally:
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
time.sleep(2)
def print_highlight(html_content: str): def print_highlight(html_content: str):
......
"""
Install the dependency in CI.
"""
pip install --upgrade pip pip install --upgrade pip
pip install -e "python[all]" pip install -e "python[all]"
pip install transformers==4.45.2 pip install transformers==4.45.2
......
"""
Kill all SGLang processes and free the GPU memory.
"""
kill -9 $(ps aux | grep 'multiprocessing.spawn' | grep -v 'grep' | awk '{print $2}') kill -9 $(ps aux | grep 'multiprocessing.spawn' | grep -v 'grep' | awk '{print $2}')
kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}') kill -9 $(ps aux | grep 'sglang.launch_server' | grep -v 'grep' | awk '{print $2}')
#!/bin/bash #!/bin/bash
# This script tags all remote branches starting with 'v' with the same name as the branch, # This script is used for release.
# It tags all remote branches starting with 'v' with the same name as the branch,
# deletes the corresponding branches from the remote, and pushes the tags to the remote repository. # deletes the corresponding branches from the remote, and pushes the tags to the remote repository.
git fetch origin --prune git fetch origin --prune
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment