Unverified Commit 70359bf3 authored by Lianmin Zheng, committed by GitHub

Update benchmark scripts (#8)

parent 01ca82d7
## Flashinfer Mode

[flashinfer](https://github.com/flashinfer-ai/flashinfer) is a kernel library for LLM serving.
It can be used in the SGLang runtime to accelerate attention computation.
### Install flashinfer
Note: The compilation can take a very long time.
```bash
git submodule update --init --recursive
pip install 3rdparty/flashinfer/python
```
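Once the build finishes, a quick sanity check can confirm the package is visible to Python. This is a minimal sketch that assumes nothing beyond the pip install above:

```python
# Post-install sanity check: confirm the flashinfer package built by
# `pip install 3rdparty/flashinfer/python` is importable, and report
# where it was installed and (if exposed) its version.
import importlib.util

spec = importlib.util.find_spec("flashinfer")
if spec is None:
    raise SystemExit("flashinfer not found; re-run the pip install step above")

import flashinfer

print("flashinfer installed at:", spec.origin)
print("flashinfer version:", getattr(flashinfer, "__version__", "unknown"))
```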
### Run a Server With Flashinfer Mode

Add the `--model-mode flashinfer` argument to enable flashinfer when launching a server.
Example:
```bash
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --model-mode flashinfer
```
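Once the server is up on port 30000, it can be smoke-tested from Python. This is a hedged sketch: it assumes the runtime exposes an HTTP `/generate` endpoint accepting a JSON body with `text` and `sampling_params` fields; adjust the payload if your server version differs.

```python
# Hypothetical smoke test against a locally launched SGLang server.
# Assumes http://127.0.0.1:30000/generate takes {"text": ..., "sampling_params": {...}}.
import requests

response = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"max_new_tokens": 16, "temperature": 0},
    },
    timeout=60,
)
response.raise_for_status()
print(response.json())
```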
...
@@ -351,7 +351,7 @@ class MixtralForCausalLM(nn.Module):
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision, fall_back_to_pt=False
+            model_name_or_path, cache_dir, load_format, revision
        ):
            if "rotary_emb.inv_freq" in name:
                continue
...
...
@@ -93,7 +93,8 @@ class ServerArgs:
            type=str,
            default=[],
            nargs="+",
-            help="Model mode: [flashinfer, no-cache, aggressive-new-fill]",
+            choices=["flashinfer", "no-cache"],
+            help="Model mode: [flashinfer, no-cache]",
        )
        parser.add_argument(
            "--schedule-heuristic",
...
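Switching from a free-form help string to `choices` means argparse itself rejects unknown modes before the server starts. A self-contained sketch of the same pattern (only the `--model-mode` flag and its allowed values come from the hunk above; the rest is illustrative):

```python
# nargs="+" collects one or more modes; choices=[...] makes argparse
# reject anything outside the allowed set at parse time.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model-mode",
    type=str,
    default=[],
    nargs="+",
    choices=["flashinfer", "no-cache"],
    help="Model mode: [flashinfer, no-cache]",
)

# Valid: one or more of the allowed modes.
print(parser.parse_args(["--model-mode", "flashinfer"]).model_mode)
# Invalid: argparse exits with an error such as
# "argument --model-mode: invalid choice: 'aggressive-new-fill'".
# parser.parse_args(["--model-mode", "aggressive-new-fill"])
```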
...
@@ -99,7 +99,7 @@ def call_select_vllm(context, choices, url):
        }
        res = requests.post(url, json=data)
        assert res.status_code == 200
-        scores.append(res.json()["prompt_score"])
+        scores.append(res.json().get("prompt_score", 0))
    return np.argmax(scores)
"""
...
@@ -112,7 +112,7 @@ def call_select_vllm(context, choices, url):
def add_common_other_args_and_parse(parser):
-    parser.add_argument("--parallel", type=int, default=96)
+    parser.add_argument("--parallel", type=int, default=64)
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=None)
    parser.add_argument(
...
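A benchmark script would typically build its own parser, add its script-specific flags, then pull in these shared options. The sketch below re-creates a stand-in for the helper rather than importing it, since only the three arguments shown in the hunk are known; the extra `--num-questions` flag is hypothetical.

```python
# Illustrative stand-in for add_common_other_args_and_parse, using only the
# arguments visible in the hunk above plus one hypothetical script flag.
import argparse


def add_common_other_args_and_parse(parser):
    parser.add_argument("--parallel", type=int, default=64)
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=None)
    return parser.parse_args()


parser = argparse.ArgumentParser()
parser.add_argument("--num-questions", type=int, default=200)  # hypothetical flag
args = add_common_other_args_and_parse(parser)
print(args.parallel, args.host, args.port)
```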
docker run --name tgi --rm -ti --gpus all --network host \
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
ghcr.io/huggingface/text-generation-inference:1.3.0 \
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
--max-input-length 2048 --max-total-tokens 4096 \
--port 24000
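Once the TGI container is up, it can be smoke-tested from Python. This assumes text-generation-inference 1.3.0 serves its usual `/generate` route with an `inputs` + `parameters` JSON body on the mapped port 24000; verify the route against the version you run.

```python
# Quick check against the TGI container launched above (port 24000).
import requests

resp = requests.post(
    "http://127.0.0.1:24000/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {"max_new_tokens": 16},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```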