Unverified Commit 70359bf3 authored by Lianmin Zheng, committed by GitHub

Update benchmark scripts (#8)

parent 01ca82d7
## Flashinfer Mode

[flashinfer](https://github.com/flashinfer-ai/flashinfer) is a kernel library for LLM serving.
It can be used in the SGLang runtime to accelerate attention computation.
### Install flashinfer
Note: The compilation can take a very long time.
```bash
git submodule update --init --recursive
pip install 3rdparty/flashinfer/python
```
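Once the build finishes, a quick sanity check can confirm the package is visible to Python. This is a minimal sketch that assumes nothing beyond the pip install above:

```python
# Post-install sanity check: confirm the flashinfer package built by
# `pip install 3rdparty/flashinfer/python` is importable, and report
# where it was installed and (if exposed) its version.
import importlib.util

spec = importlib.util.find_spec("flashinfer")
if spec is None:
    raise SystemExit("flashinfer not found; re-run the pip install step above")

import flashinfer

print("flashinfer installed at:", spec.origin)
print("flashinfer version:", getattr(flashinfer, "__version__", "unknown"))
```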
### Run a Server With Flashinfer Mode

Add the `--model-mode flashinfer` argument to enable flashinfer when launching a server.
Example:
```bash
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --model-mode flashinfer
```
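Once the server is up on port 30000, it can be smoke-tested from Python. This is a hedged sketch: it assumes the runtime exposes an HTTP `/generate` endpoint accepting a JSON body with `text` and `sampling_params` fields; adjust the payload if your server version differs.

```python
# Hypothetical smoke test against a locally launched SGLang server.
# Assumes http://127.0.0.1:30000/generate takes {"text": ..., "sampling_params": {...}}.
import requests

response = requests.post(
    "http://127.0.0.1:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"max_new_tokens": 16, "temperature": 0},
    },
    timeout=60,
)
response.raise_for_status()
print(response.json())
```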
...
@@ -351,7 +351,7 @@ class MixtralForCausalLM(nn.Module):
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, load_format, revision, fall_back_to_pt=False
+            model_name_or_path, cache_dir, load_format, revision
        ):
            if "rotary_emb.inv_freq" in name:
                continue
...
...
@@ -93,7 +93,8 @@ class ServerArgs:
            type=str,
            default=[],
            nargs="+",
-            help="Model mode: [flashinfer, no-cache, aggressive-new-fill]",
+            choices=["flashinfer", "no-cache"],
+            help="Model mode: [flashinfer, no-cache]",
        )
        parser.add_argument(
            "--schedule-heuristic",
...
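Switching from a free-form help string to `choices` means argparse itself rejects unknown modes before the server starts. A self-contained sketch of the same pattern (only the `--model-mode` flag and its allowed values come from the hunk above; the rest is illustrative):

```python
# nargs="+" collects one or more modes; choices=[...] makes argparse
# reject anything outside the allowed set at parse time.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--model-mode",
    type=str,
    default=[],
    nargs="+",
    choices=["flashinfer", "no-cache"],
    help="Model mode: [flashinfer, no-cache]",
)

# Valid: one or more of the allowed modes.
print(parser.parse_args(["--model-mode", "flashinfer"]).model_mode)
# Invalid: argparse exits with an error such as
# "argument --model-mode: invalid choice: 'aggressive-new-fill'".
# parser.parse_args(["--model-mode", "aggressive-new-fill"])
```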
...
@@ -99,7 +99,7 @@ def call_select_vllm(context, choices, url):
        }
        res = requests.post(url, json=data)
        assert res.status_code == 200
-        scores.append(res.json()["prompt_score"])
+        scores.append(res.json().get("prompt_score", 0))
    return np.argmax(scores)
"""
...
@@ -112,7 +112,7 @@ def call_select_vllm(context, choices, url):
def add_common_other_args_and_parse(parser):
-    parser.add_argument("--parallel", type=int, default=96)
+    parser.add_argument("--parallel", type=int, default=64)
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=None)
    parser.add_argument(
...
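A benchmark script would typically build its own parser, add its script-specific flags, then pull in these shared options. The sketch below re-creates a stand-in for the helper rather than importing it, since only the three arguments shown in the hunk are known; the extra `--num-questions` flag is hypothetical.

```python
# Illustrative stand-in for add_common_other_args_and_parse, using only the
# arguments visible in the hunk above plus one hypothetical script flag.
import argparse


def add_common_other_args_and_parse(parser):
    parser.add_argument("--parallel", type=int, default=64)
    parser.add_argument("--host", type=str, default="http://127.0.0.1")
    parser.add_argument("--port", type=int, default=None)
    return parser.parse_args()


parser = argparse.ArgumentParser()
parser.add_argument("--num-questions", type=int, default=200)  # hypothetical flag
args = add_common_other_args_and_parse(parser)
print(args.parallel, args.host, args.port)
```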
docker run --name tgi --rm -ti --gpus all --network host \
-v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
ghcr.io/huggingface/text-generation-inference:1.3.0 \
--model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
--max-input-length 2048 --max-total-tokens 4096 \
--port 24000
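Once the TGI container is up, it can be smoke-tested from Python. This assumes text-generation-inference 1.3.0 serves its usual `/generate` route with an `inputs` + `parameters` JSON body on the mapped port 24000; verify the route against the version you run.

```python
# Quick check against the TGI container launched above (port 24000).
import requests

resp = requests.post(
    "http://127.0.0.1:24000/generate",
    json={
        "inputs": "The capital of France is",
        "parameters": {"max_new_tokens": 16},
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```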