[Misc] add the "download-dir" option to the latency/throughput benchmarks (#3621)

19569314 · AmadeusChan · GitHub · e24336b5 · 19569314 · 19569314
Unverified commit 19569314, authored Mar 27, 2024 by AmadeusChan; committed by GitHub on Mar 27, 2024.
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 32 additions and 19 deletions

benchmarks/benchmark_latency.py +16 -12

benchmarks/benchmark_throughput.py +16 -7

No files found.
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -16,8 +16,7 @@ def main(args: argparse.Namespace):
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
-    llm = LLM(
+    llm = LLM(model=args.model,
-        model=args.model,
              tokenizer=args.tokenizer,
              quantization=args.quantization,
              tensor_parallel_size=args.tensor_parallel_size,
@@ -27,7 +26,7 @@ def main(args: argparse.Namespace):
              kv_cache_dtype=args.kv_cache_dtype,
              device=args.device,
              ray_workers_use_nsight=args.ray_workers_use_nsight,
-    )
+              download_dir=args.download_dir)
    sampling_params = SamplingParams(
        n=args.n,
@@ -151,5 +150,10 @@ if __name__ == '__main__':
        action='store_true',
        help="If specified, use nsight to profile ray workers",
    )
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -75,6 +75,7 @@ def run_vllm(
    device: str,
    enable_prefix_caching: bool,
    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(model=model,
@@ -89,7 +90,8 @@ def run_vllm(
              enforce_eager=enforce_eager,
              kv_cache_dtype=kv_cache_dtype,
              device=device,
-              enable_prefix_caching=enable_prefix_caching)
+              enable_prefix_caching=enable_prefix_caching,
+              download_dir=download_dir)
    # Add the requests to the engine.
    for prompt, _, output_len in requests:
@@ -208,12 +210,14 @@ def main(args: argparse.Namespace):
                                   args.output_len)
    if args.backend == "vllm":
-        elapsed_time = run_vllm(
+        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
-            requests, args.model, args.tokenizer, args.quantization,
+                                args.quantization, args.tensor_parallel_size,
-            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+                                args.seed, args.n, args.use_beam_search,
-            args.trust_remote_code, args.dtype, args.max_model_len,
+                                args.trust_remote_code, args.dtype,
-            args.enforce_eager, args.kv_cache_dtype, args.device,
+                                args.max_model_len, args.enforce_eager,
-            args.enable_prefix_caching, args.gpu_memory_utilization)
+                                args.kv_cache_dtype, args.device,
+                                args.enable_prefix_caching,
+                                args.gpu_memory_utilization, args.download_dir)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -314,6 +318,11 @@ if __name__ == "__main__":
        "--enable-prefix-caching",
        action='store_true',
        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model