Unverified commit b91a4cb1, authored by liuyhwangyh, committed by GitHub

support models from www.modelscope.cn (#994)


Co-authored-by: mulin.lyh <mulin.lyh@taobao.com>
parent 95a28019
@@ -168,6 +168,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
### Use Models From ModelScope
To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
```
export SGLANG_USE_MODELSCOPE=true
```
Launch a server with [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct):
```
SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
```
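Once the server is up, you can sanity-check it with a request. A minimal sketch, assuming the port 30000 used above and sglang's native `/generate` endpoint (prompt text and sampling parameters are illustrative):
```
import requests

# Query the server launched above; /generate is sglang's native
# completion endpoint, taking a prompt and sampling parameters.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 32},
    },
)
print(response.json()["text"])
```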
### Supported Models
- Llama / Llama 2 / Llama 3 / Llama 3.1
...
@@ -74,6 +74,8 @@ from sglang.srt.utils import (
    enable_show_time_cost,
    kill_child_process,
    maybe_set_triton_cache_manager,
    prepare_model,
    prepare_tokenizer,
    set_ulimit,
)
from sglang.utils import get_exception_traceback
@@ -250,6 +252,10 @@ def launch_server(
    )
    logger.info(f"{server_args=}")

    # If using a model from www.modelscope.cn, download it first.
    server_args.model_path = prepare_model(server_args.model_path)
    server_args.tokenizer_path = prepare_tokenizer(server_args.tokenizer_path)

    # Launch processes for multi-node tensor parallelism
    if server_args.nnodes > 1:
        if server_args.node_rank != 0:
...
@@ -701,3 +701,23 @@ def add_api_key_middleware(app, api_key):
        if request.headers.get("Authorization") != "Bearer " + api_key:
            return JSONResponse(content={"error": "Unauthorized"}, status_code=401)
        return await call_next(request)

def prepare_model(model_path):
    # When SGLANG_USE_MODELSCOPE is set and the path is not a local
    # directory, download the full model snapshot from www.modelscope.cn.
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(model_path):
            from modelscope import snapshot_download

            return snapshot_download(model_path)
    return model_path


def prepare_tokenizer(tokenizer_path):
    # Same as prepare_model, but skip the (large) weight files, since only
    # the tokenizer and config files are needed.
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(tokenizer_path):
            from modelscope import snapshot_download

            return snapshot_download(
                tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
            )
    return tokenizer_path
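For illustration, a minimal sketch of how these helpers behave; the model ID is an example, and with SGLANG_USE_MODELSCOPE unset, both functions return the given path unchanged:
```
import os

from sglang.srt.utils import prepare_model, prepare_tokenizer

# Any value enables the ModelScope code path.
os.environ["SGLANG_USE_MODELSCOPE"] = "true"

# Non-local paths are resolved by downloading a snapshot from
# www.modelscope.cn; prepare_tokenizer excludes *.bin / *.safetensors.
model_dir = prepare_model("qwen/Qwen2-7B-Instruct")
tokenizer_dir = prepare_tokenizer("qwen/Qwen2-7B-Instruct")
print(model_dir, tokenizer_dir)
```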
@@ -10,6 +10,7 @@ suites = {
        "test_vision_openai_server.py",
        "test_chunked_prefill.py",
        "test_torch_compile.py",
        "test_models_from_modelscope.py",
        "models/test_generation_models.py",
        "models/test_embedding_models.py",
        "sampling/penaltylib",
...
import os
import shutil
import subprocess
import unittest
from unittest import mock

from sglang.srt.utils import prepare_model, prepare_tokenizer


class TestDownloadFromModelScope(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = "iic/nlp_lstmcrf_word-segmentation_chinese-news"
        # Make sure the modelscope package is available for the tests.
        stat, output = subprocess.getstatusoutput("pip install modelscope")
        # Copy the current environment and enable the ModelScope code path.
        cls.with_modelscope_environ = {k: v for k, v in os.environ.items()}
        cls.with_modelscope_environ["SGLANG_USE_MODELSCOPE"] = "True"

    @classmethod
    def tearDownClass(cls):
        pass

    def test_prepare_model(self):
        from modelscope.utils.file_utils import get_model_cache_root

        # Clear the ModelScope cache so the download actually happens.
        model_cache_root = get_model_cache_root()
        if os.path.exists(model_cache_root):
            shutil.rmtree(model_cache_root)
        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
            model_path = prepare_model(self.model)
            assert os.path.exists(os.path.join(model_path, "pytorch_model.bin"))

    def test_prepare_tokenizer(self):
        from modelscope.utils.file_utils import get_model_cache_root

        model_cache_root = get_model_cache_root()
        if os.path.exists(model_cache_root):
            shutil.rmtree(model_cache_root)
        with mock.patch.dict(os.environ, self.with_modelscope_environ, clear=True):
            tokenizer_path = prepare_tokenizer(self.model)
            # Weight files are excluded; only tokenizer/config files are fetched.
            assert not os.path.exists(os.path.join(tokenizer_path, "pytorch_model.bin"))
            assert os.path.exists(os.path.join(tokenizer_path, "config.json"))


if __name__ == "__main__":
    unittest.main(warnings="ignore")
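To run the new test on its own (assuming the file is placed alongside the other suite files, e.g. under test/srt/ where run_suite.py lists it):
```
python3 test/srt/test_models_from_modelscope.py
```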