Unverified Commit 77e592e8 authored by Lianmin Zheng, committed by GitHub

support non-streaming benchmark (#682)

parent caaad53b
@@ -154,6 +154,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
...
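As a quick usage note on the fp8 line added above, and assuming the same `sglang.launch_server` entry point shown in the README snippet (all other flags omitted), enabling fp8 quantization on an fp16 checkpoint might look like:

```
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --quantization fp8
```

Loading a checkpoint that is already stored in fp8 should, per the note above, need no extra quantization argument.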
@@ -143,7 +143,7 @@ async def async_request_openai_completions(
             "temperature": 0.0,
             "best_of": 1,
             "max_tokens": request_func_input.output_len,
-            "stream": True,
+            "stream": not args.disable_stream,
             "ignore_eos": True,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
@@ -166,8 +166,9 @@ async def async_request_openai_completions(
                         continue
                     chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                    latency = time.perf_counter() - st
                     if chunk == "[DONE]":
-                        latency = time.perf_counter() - st
+                        pass
                     else:
                         data = json.loads(chunk)
@@ -897,6 +898,11 @@ if __name__ == "__main__":
         help="Range of request rates in the format start,stop,step. Default is 2,34,2",
     )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--disable-stream",
+        action="store_true",
+        help="Disable streaming mode.",
+    )
     set_ulimit()
...
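To see why the latency bookkeeping above moved out of the `[DONE]` branch: with `--disable-stream` the request is sent with `"stream": False`, the completion comes back as a single chunk, and a `[DONE]` sentinel may never arrive, so the elapsed time has to be taken on every chunk. A minimal sketch of that pattern (`consume_chunks` and the sample payload are illustrative, not part of the benchmark script):

```python
import json
import time


def consume_chunks(chunks, start_time):
    """Toy stand-in for the benchmark's response loop."""
    latency = None
    generated_text = ""
    for chunk in chunks:
        # Record elapsed time on every chunk so the measurement also works
        # for a single non-streamed body that never ends with "[DONE]".
        latency = time.perf_counter() - start_time
        if chunk == "[DONE]":
            pass  # keep the latency recorded above
        else:
            data = json.loads(chunk)
            generated_text += data["choices"][0]["text"]
    return generated_text, latency


# A non-streaming response is just one JSON body.
start = time.perf_counter()
text, latency = consume_chunks(['{"choices": [{"text": "hello world"}]}'], start)
print(text, latency)
```

In streaming mode the same loop keeps overwriting `latency`, so the value left after the final chunk (or `[DONE]`) is still the end-to-end latency, matching the original behavior.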
@@ -28,11 +28,16 @@ class ScheduleHeuristic:
             # longest prefix match
             forward_queue.sort(key=lambda x: -len(x.prefix_indices))
             return forward_queue
+        elif self.schedule_heuristic == "fcfs":
+            # first come first serve
+            return forward_queue
+        elif self.schedule_heuristic == "lof":
+            # longest output first
+            forward_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
+            return forward_queue
         elif self.schedule_heuristic == "random":
             random.shuffle(forward_queue)
             return forward_queue
-        elif self.schedule_heuristic == "fcfs":
-            return forward_queue
         elif self.schedule_heuristic == "dfs-weight":
             last_node_to_reqs = defaultdict(list)
             for req in forward_queue:
...
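The new branches above add two simple orderings alongside the existing longest-prefix-match and random ones. A small sketch of the `lof` (longest output first) sort, using an illustrative stand-in for the queued request objects (`FakeReq` and the `SimpleNamespace` fields are assumptions; the real requests expose `sampling_params.max_new_tokens` as the diff shows):

```python
from dataclasses import dataclass
from types import SimpleNamespace


@dataclass
class FakeReq:
    """Illustrative stand-in for a queued generation request."""
    rid: str
    sampling_params: SimpleNamespace  # exposes .max_new_tokens


forward_queue = [
    FakeReq("a", SimpleNamespace(max_new_tokens=32)),
    FakeReq("b", SimpleNamespace(max_new_tokens=512)),
    FakeReq("c", SimpleNamespace(max_new_tokens=128)),
]

# "lof": longest output first, the same sort key as in the diff above.
forward_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
print([r.rid for r in forward_queue])  # ['b', 'c', 'a']

# "fcfs": first come first serve -- the queue is simply returned unchanged.
```

`fcfs` keeps arrival order, while `lof` front-loads requests expected to generate the most tokens; both return the same `forward_queue` list, only its ordering differs.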