Unverified commit b91a4cb1, authored by liuyhwangyh, committed by GitHub

support models from www.modelscope.cn (#994)


Co-authored-by: mulin.lyh <mulin.lyh@taobao.com>
parent 95a28019
@@ -168,6 +168,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
### Use Models From ModelScope
To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
```
export SGLANG_USE_MODELSCOPE=true
```
Launch a server with [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct):
```
SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
```
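Once the server is up, you can sanity-check it with a request. A minimal sketch, assuming the port 30000 used above and sglang's native `/generate` endpoint (prompt text and sampling parameters are illustrative):
```
import requests

# Query the server launched above; /generate is sglang's native
# completion endpoint, taking a prompt and sampling parameters.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 32},
    },
)
print(response.json()["text"])
```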
### Supported Models
- Llama / Llama 2 / Llama 3 / Llama 3.1
...
@@ -74,6 +74,8 @@ from sglang.srt.utils import (
    enable_show_time_cost,
    kill_child_process,
    maybe_set_triton_cache_manager,
    prepare_model,
    prepare_tokenizer,
    set_ulimit,
)
from sglang.utils import get_exception_traceback
@@ -250,6 +252,10 @@ def launch_server(
    )
    logger.info(f"{server_args=}")

    # If using a model from www.modelscope.cn, download it first.
    server_args.model_path = prepare_model(server_args.model_path)
    server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)

    # Launch processes for multi-node tensor parallelism
    if server_args.nnodes > 1:
        if server_args.node_rank != 0:
...
@@ -701,3 +701,23 @@ def add_api_key_middleware(app, api_key):
        if request.headers.get("Authorization") != "Bearer " + api_key:
            return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
        return await call_next(request)

def prepare_model(model_path):
    # When SGLANG_USE_MODELSCOPE is set and the path is not a local
    # directory, download the full model snapshot from www.modelscope.cn.
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(model_path):
            from modelscope import snapshot_download

            return snapshot_download(model_path)
    return model_path


def prepare_tokenizer(tokenizer_path):
    # Same as prepare_model, but skip the (large) weight files, since only
    # the tokenizer and config files are needed.
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(tokenizer_path):
            from modelscope import snapshot_download

            return snapshot_download(
                tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
            )
    return tokenizer_path
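For illustration, a minimal sketch of how these helpers behave; the model ID is an example, and with SGLANG_USE_MODELSCOPE unset, both functions return the given path unchanged:
```
import os

from sglang.srt.utils import prepare_model, prepare_tokenizer

# Any value enables the ModelScope code path.
os.environ["SGLANG_USE_MODELSCOPE"] = "true"

# Non-local paths are resolved by downloading a snapshot from
# www.modelscope.cn; prepare_tokenizer excludes *.bin / *.safetensors.
model_dir = prepare_model("qwen/Qwen2-7B-Instruct")
tokenizer_dir = prepare_tokenizer("qwen/Qwen2-7B-Instruct")
print(model_dir, tokenizer_dir)
```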
@@ -10,6 +10,7 @@ suites = {
        "test_vision_openai_server.py",
        "test_chunked_prefill.py",
        "test_torch_compile.py",
        "test_models_from_modelscope.py",
        "models/test_generation_models.py",
        "models/test_embedding_models.py",
        "sampling/penaltylib",
...
import os
import shutil
import subprocess
import unittest
from unittest import mock

from sglang.srt.utils import prepare_model, prepare_tokenizer


class TestDownloadFromModelScope(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "iic/nlp_lstmcrf_word-segmentation_chinese-news"
        # Make sure the modelscope package is available for the tests.
        stat, output = subprocess.getstatusoutput("pip install modelscope")
        # Copy the current environment and enable the ModelScope code path.
        cls.with_modelscope_environ = {k: v for k, v in os.environ.items()}
        cls.with_modelscope_environ["SGLANG_USE_MODELSCOPE"] = "True"

    @classmethod
    def tearDownClass(cls):
        pass

    def test_prepare_model(self):
        from modelscope.utils.file_utils import get_model_cache_root

        # Clear the ModelScope cache so the download actually happens.
        model_cache_root = get_model_cache_root()
        if os.path.exists(model_cache_root):
            shutil.rmtree(model_cache_root)
        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
            model_path = prepare_model(self.model)
            assert os.path.exists(os.path.join(model_path, "pytorch_model.bin"))

    def test_prepare_tokenizer(self):
        from modelscope.utils.file_utils import get_model_cache_root

        model_cache_root = get_model_cache_root()
        if os.path.exists(model_cache_root):
            shutil.rmtree(model_cache_root)
        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
            tokenizer_path = prepare_tokenizer(self.model)
            # Weight files are excluded; only tokenizer/config files are fetched.
            assert not os.path.exists(os.path.join(tokenizer_path, "pytorch_model.bin"))
            assert os.path.exists(os.path.join(tokenizer_path, "config.json"))


if __name__ == "__main__":
    unittest.main(warnings="ignore")
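To run the new test on its own (assuming the file is placed alongside the other suite files, e.g. under test/srt/ where run_suite.py lists it):
```
python3 test/srt/test_models_from_modelscope.py
```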