"tests/vscode:/vscode.git/clone" did not exist on "f9e957f011c06ff31f854a281cb7b485d74cdf53"
Unverified Commit 2558d6a6 authored by Lianmin Zheng, committed by GitHub

Fix the default arguments of bench_offline_throughput.py & simplify detokenizer manager (#2042)

parent 29ebe3df
"""
Benchmark the throughput of using the offline LLM engine.
This script does not launch a server.
It accepts the same arguments as launch_server.py and additional benchmark arguments
It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
# Usage
## Sharegpt dataset with default args
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
## Random dataset with default args
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
## Shared prefix dataset with default args
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --dataset-name generated-shared-prefix
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
## Sharegpt dataset on runtime backend
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend runtime
python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
"""
import argparse
@@ -23,7 +23,7 @@ import json
import logging
import random
import time
-from typing import List, Tuple
+from typing import List, Optional, Tuple
import numpy as np
@@ -45,14 +45,15 @@ class BenchArgs:
     dataset_name: str = "sharegpt"
     dataset_path: str = ""
     num_prompts: int = 1000
-    sharegpt_output_len: int = 256
-    random_input_len: int = 256
-    random_output_len: int = 256
+    sharegpt_output_len: Optional[int] = None
+    random_input_len: int = 1024
+    random_output_len: int = 1024
     random_range_ratio: float = 0.0
-    gen_num_groups: int = 8
+    gen_num_groups: int = 64
     gen_prompts_per_group: int = 16
-    gen_system_prompt_len: int = 128
-    gen_question_len: int = 256
+    gen_system_prompt_len: int = 2048
+    gen_question_len: int = 128
+    gen_output_len: int = 256
     disable_ignore_eos: bool = False
     seed: int = 1
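For scale, the new generated-shared-prefix defaults imply a much larger workload than the old ones. A back-of-the-envelope sketch of the implied token counts (plain Python arithmetic, not part of the patch):

num_groups = 64            # gen_num_groups
prompts_per_group = 16     # gen_prompts_per_group
system_prompt_len = 2048   # shared prefix tokens per group
question_len = 128
output_len = 256

total_prompts = num_groups * prompts_per_group         # 1024 prompts
input_per_prompt = system_prompt_len + question_len    # 2176 input tokens each
total_input = total_prompts * input_per_prompt         # 2,228,224 input tokens
total_output = total_prompts * output_len              # 262,144 output tokens
print(total_prompts, input_per_prompt, total_input, total_output)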
@@ -129,6 +130,12 @@ class BenchArgs:
             default=BenchArgs.gen_question_len,
             help="Question length, used only for generate-shared-prefix",
         )
+        parser.add_argument(
+            "--gen-output-len",
+            type=int,
+            default=BenchArgs.gen_output_len,
+            help="Target length in tokens for outputs in generated-shared-prefix dataset",
+        )
         parser.add_argument(
             "--disable-ignore-eos",
             type=bool,
@@ -139,12 +146,8 @@ class BenchArgs:
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        # use the default value's type to case the args into correct types.
-        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
-        print(attrs)
-        return cls(
-            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
-        )
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
def throughput_test_once(
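The removed casting logic is the actual bug behind the default-argument fix: once sharegpt_output_len defaults to None, type(attr.default) is NoneType, and calling NoneType(value) raises TypeError. A minimal, self-contained illustration of the failure mode and the simplified pattern, using a hypothetical Args class rather than the real BenchArgs:

import argparse
import dataclasses
from typing import Optional


@dataclasses.dataclass
class Args:  # hypothetical stand-in for BenchArgs
    sharegpt_output_len: Optional[int] = None
    num_prompts: int = 1000

    @classmethod
    def from_cli_args(cls, ns: argparse.Namespace):
        # Copy values as-is; argparse's type= already did the conversion.
        names = [f.name for f in dataclasses.fields(cls)]
        return cls(**{n: getattr(ns, n) for n in names})


parser = argparse.ArgumentParser()
parser.add_argument("--sharegpt-output-len", type=int, default=Args.sharegpt_output_len)
parser.add_argument("--num-prompts", type=int, default=Args.num_prompts)

args = parser.parse_args([])
# The old approach would compute type(None) here and call it on None,
# raising "TypeError: NoneType takes no arguments".
print(Args.from_cli_args(args))  # Args(sharegpt_output_len=None, num_prompts=1000)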
@@ -224,6 +227,7 @@ def throughput_test(
     random.seed(bench_args.seed)
     np.random.seed(bench_args.seed)
+    # Read dataset
     input_requests = get_dataset(bench_args, tokenizer)
     warmup_requests = sample_random_requests(
......
@@ -1241,10 +1241,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-input-len",
         type=int,
+        default=1024,
         help="Number of input tokens per request, used only for random dataset.",
     )
     parser.add_argument(
         "--random-output-len",
+        default=1024,
         type=int,
         help="Number of output tokens per request, used only for random dataset.",
     )
......
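With these defaults in place, a plain random-dataset run no longer needs explicit length flags; it is equivalent to passing them by hand. For example, using only flags that appear in this diff:

python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input-len 1024 --random-output-len 1024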
@@ -100,20 +100,6 @@ class DetokenizerManager:
             if isinstance(recv_obj, BatchEmbeddingOut):
                 # If it is embedding model, no detokenization is needed.
-                self.send_to_tokenizer.send_pyobj(
-                    BatchEmbeddingOut(
-                        rids=recv_obj.rids,
-                        embeddings=recv_obj.embeddings,
-                        meta_info=recv_obj.meta_info,
-                        finished_reason=recv_obj.finished_reason,
-                    )
-                )
+                self.send_to_tokenizer.send_pyobj(recv_obj)
                 continue
-            elif isinstance(recv_obj, UpdateWeightReqOutput):
-                # If it is a weight update request, no detokenization is needed.
-                self.send_to_tokenizer.send_pyobj(recv_obj)
-                continue
-            elif isinstance(recv_obj, GetMemPoolSizeReqOutput):
-                self.send_to_tokenizer.send_pyobj(recv_obj)
-                continue
             else:
......
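After this simplification the detokenizer only ever sees batch outputs: embedding batches are forwarded unchanged and token-id batches are detokenized, while control-plane replies (weight updates, mem-pool size queries) no longer pass through it at all. A minimal sketch of the PUSH/PULL relay that remains, written against pyzmq directly with hypothetical IPC endpoint names (not sglang's actual PortArgs):

import zmq

context = zmq.Context()

# Detokenizer side: PULL batch outputs from the scheduler, PUSH results on.
recv_from_scheduler = context.socket(zmq.PULL)
recv_from_scheduler.bind("ipc:///tmp/scheduler_to_detokenizer")  # hypothetical name
send_to_tokenizer = context.socket(zmq.PUSH)
send_to_tokenizer.connect("ipc:///tmp/detokenizer_to_tokenizer")  # hypothetical name

while True:
    recv_obj = recv_from_scheduler.recv_pyobj()
    # Embedding batches need no detokenization and are forwarded as-is;
    # token-id batches would be detokenized here before forwarding.
    send_to_tokenizer.send_pyobj(recv_obj)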
@@ -114,6 +114,9 @@ class Scheduler:
         self.recv_from_tokenizer = get_zmq_socket(
             context, zmq.PULL, port_args.scheduler_input_ipc_name
         )
+        self.send_to_tokenizer = get_zmq_socket(
+            context, zmq.PUSH, port_args.tokenizer_ipc_name
+        )
         if server_args.skip_tokenizer_init:
             # Directly send to the tokenizer/api
@@ -127,6 +130,7 @@
             )
         else:
             self.recv_from_tokenizer = None
+            self.send_to_tokenizer = SimpleNamespace(send_pyobj=lambda x: None)
             self.send_to_detokenizer = SimpleNamespace(send_pyobj=lambda x: None)
         # Init tokenizer
@@ -421,7 +425,7 @@ class Scheduler:
             self.abort_request(recv_req)
         elif isinstance(recv_req, UpdateWeightReqInput):
             success, message = self.update_weights(recv_req)
-            self.send_to_detokenizer.send_pyobj(
+            self.send_to_tokenizer.send_pyobj(
                 UpdateWeightReqOutput(success, message)
             )
         elif isinstance(recv_req, ProfileReq):
@@ -430,7 +434,7 @@ class Scheduler:
             else:
                 self.stop_profile()
         elif isinstance(recv_req, GetMemPoolSizeReq):
-            self.send_to_detokenizer.send_pyobj(
+            self.send_to_tokenizer.send_pyobj(
                 GetMemPoolSizeReqOutput(self.max_total_num_tokens)
             )
         else:
......
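The else-branch in the Scheduler init above uses a small null-object trick: when the real ZMQ sockets are not created, SimpleNamespace(send_pyobj=lambda x: None) stands in for them, so call sites like the two send_pyobj calls in these hunks need no "is the socket real?" guards. A self-contained sketch of the pattern, with a hypothetical helper rather than the actual Scheduler:

from types import SimpleNamespace

def make_sender(create_real_socket: bool):
    if create_real_socket:
        # In the Scheduler this would be get_zmq_socket(context, zmq.PUSH, ...).
        raise NotImplementedError
    # No-op stand-in: exposes the same .send_pyobj interface, drops messages.
    return SimpleNamespace(send_pyobj=lambda x: None)

send_to_tokenizer = make_sender(create_real_socket=False)
send_to_tokenizer.send_pyobj({"success": True, "message": "ok"})  # silently dropped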
@@ -11,16 +11,24 @@ from sglang.test.test_utils import run_mmlu_test
 class TestOverlapSchedule(unittest.TestCase):
     def test_no_radix_attention_chunked_prefill(self):
-        run_mmlu_test(disable_radix_cache=True, chunked_prefill_size=32)
+        run_mmlu_test(
+            disable_radix_cache=True, chunked_prefill_size=32, enable_overlap=True
+        )

     def test_no_radix_attention_no_chunked_prefill(self):
-        run_mmlu_test(disable_radix_cache=True, chunked_prefill_size=-1)
+        run_mmlu_test(
+            disable_radix_cache=True, chunked_prefill_size=-1, enable_overlap=True
+        )

     def test_radix_attention_chunked_prefill(self):
-        run_mmlu_test(disable_radix_cache=False, chunked_prefill_size=32)
+        run_mmlu_test(
+            disable_radix_cache=False, chunked_prefill_size=32, enable_overlap=True
+        )

     def test_radix_attention_no_chunked_prefill(self):
-        run_mmlu_test(disable_radix_cache=False, chunked_prefill_size=-1)
+        run_mmlu_test(
+            disable_radix_cache=False, chunked_prefill_size=-1, enable_overlap=True
+        )
if __name__ == "__main__":
......