[Misc] Add args for selecting distributed executor to benchmarks (#5335)

b3376e5c · Benjamin Kitor · GitHub · e69ded7d · b3376e5c · b3376e5c
Unverified commit b3376e5c, authored Jun 07, 2024 by Benjamin Kitor; committed by GitHub on Jun 08, 2024
Show whitespace changes
Inline Side-by-side

Showing with 20 additions and 3 deletions

benchmarks/benchmark_latency.py +9 -1

benchmarks/benchmark_throughput.py +11 -2

No files found.
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
              enable_chunked_prefill=args.enable_chunked_prefill,
              download_dir=args.download_dir,
              block_size=args.block_size,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+              gpu_memory_utilization=args.gpu_memory_utilization,
+              distributed_executor_backend=args.distributed_executor_backend)
    sampling_params = SamplingParams(
        n=args.n,
@@ -221,5 +222,12 @@ if __name__ == '__main__':
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1.'
                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -78,6 +78,7 @@ def run_vllm(
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    download_dir: Optional[str] = None,
 ) -> float:
@@ -100,6 +101,7 @@ def run_vllm(
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
    )
    # Add the requests to the engine.
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
            args.enforce_eager, args.kv_cache_dtype,
            args.quantization_param_path, args.device,
            args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
+            args.max_num_batched_tokens, args.distributed_executor_backend,
-            args.download_dir)
+            args.gpu_memory_utilization, args.download_dir)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -368,6 +370,13 @@ if __name__ == "__main__":
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model