# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Embedding cache ON vs OFF on a single vllm_serve workflow.
#
# Both entries under `configs` run the same workflow script with the same
# flags; the only difference between them is the value passed to
# --multimodal-embedding-cache-capacity-gb ("0" vs "10").
#
# Usage:
#   python -m benchmarks.multimodal.sweep --config benchmarks/multimodal/sweep/experiments/embedding_cache/cache_on_off.yaml

# Model under test.
model: Qwen/Qwen3-VL-30B-A3B-Instruct-FP8

# Concurrency levels covered by the sweep.
concurrencies:
  - 16
  - 32
  - 64
  - 128
  - 256

# NOTE(review): `osl` is presumably the output sequence length -- confirm
# against the sweep runner.
osl: 150
request_count: 1000
warmup_count: 5

port: 8000
# NOTE(review): unit of `timeout` is not stated here (presumably seconds) --
# confirm against the sweep runner.
timeout: 900

output_dir: benchmarks/multimodal/sweep/results/vllm_serve

# Declared once at the top level rather than per config.
# NOTE(review): presumably exported for every entry in `configs` -- confirm.
# The value is kept as a quoted string so it is not retyped as an integer.
env:
  ENABLE_ENCODER_CACHE: "0"

# Request datasets. The file names encode image count per request, pool size
# and transport (base64 vs http).
input_files:
  - benchmarks/multimodal/jsonl/1000req_1img_200pool_400word_base64.jsonl
  - benchmarks/multimodal/jsonl/1000req_2img_200pool_400word_base64.jsonl
  - benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_base64.jsonl
  - benchmarks/multimodal/jsonl/1000req_2img_200pool_400word_http.jsonl
  - benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_http.jsonl
  - benchmarks/multimodal/jsonl/1000req_1img_800pool_400word_http.jsonl
  - benchmarks/multimodal/jsonl/1000req_2img_1600pool_400word_http.jsonl
  - benchmarks/multimodal/jsonl/1000req_4img_3200pool_400word_http.jsonl

# Variants to compare. Numeric flag values stay quoted so they are passed
# through as strings.
configs:
  # Embedding cache capacity set to "0".
  - label: cache-off
    workflow: benchmarks/multimodal/sweep/workflows/vllm_serve.sh
    extra_args:
      - "--no-enable-prefix-caching"
      - "--multimodal-embedding-cache-capacity-gb"
      - "0"

  # Embedding cache capacity set to "10".
  - label: cache-on
    workflow: benchmarks/multimodal/sweep/workflows/vllm_serve.sh
    extra_args:
      - "--no-enable-prefix-caching"
      - "--multimodal-embedding-cache-capacity-gb"
      - "10"