enable --gpu-memory-utilization in benchmark_throughput.py (#3175)

Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>

enable --gpu-memory-utilization in benchmark_throughput.py (#3175)
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
9cbc7e5f · Allen.Dou · GitHub · 27a7b070 · 9cbc7e5f
Unverified commit 9cbc7e5f, authored Mar 05, 2024 by Allen.Dou; committed by GitHub Mar 04, 2024
Show whitespace changes
Inline Side-by-side

Showing with 14 additions and 7 deletions

benchmarks/benchmark_throughput.py benchmarks/benchmark_throughput.py +14 -7

No files found.
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -74,6 +74,7 @@ def run_vllm(
    kv_cache_dtype: str,
    device: str,
    enable_prefix_caching: bool,
+    gpu_memory_utilization: float = 0.9,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(model=model,
@@ -84,6 +85,7 @@ def run_vllm(
              trust_remote_code=trust_remote_code,
              dtype=dtype,
              max_model_len=max_model_len,
+              gpu_memory_utilization=gpu_memory_utilization,
              enforce_eager=enforce_eager,
              kv_cache_dtype=kv_cache_dtype,
              device=device,
@@ -206,13 +208,12 @@ def main(args: argparse.Namespace):
                                   args.output_len)
    if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
+        elapsed_time = run_vllm(
-                                args.quantization, args.tensor_parallel_size,
+            requests, args.model, args.tokenizer, args.quantization,
-                                args.seed, args.n, args.use_beam_search,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype,
+            args.trust_remote_code, args.dtype, args.max_model_len,
-                                args.max_model_len, args.enforce_eager,
+            args.enforce_eager, args.kv_cache_dtype, args.device,
-                                args.kv_cache_dtype, args.device,
+            args.enable_prefix_caching, args.gpu_memory_utilization)
-                                args.enable_prefix_caching)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -287,6 +288,12 @@ if __name__ == "__main__":
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1. '
+                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument("--enforce-eager",
                        action="store_true",
                        help="enforce eager execution")