# vllm_serve.yaml

# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

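# Compares the embedding cache ON vs. OFF using a single vllm_serve workflow.
#
# Usage:
#   python -m benchmarks.multimodal.sweep --config benchmarks/multimodal/sweep/experiments/embedding_cache/cache_on_off.yaml
#
# NOTE(review): the usage path above names cache_on_off.yaml; confirm it
# matches this file's actual name and location, and update it if not.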

# Sweep-wide settings. These sit at the top level, outside `configs`, so they
# are presumably shared by every entry under `configs` — confirm against the
# sweep runner.

# Model identifier passed to the workflow.
model: Qwen/Qwen3-VL-30B-A3B-Instruct-FP8
# Concurrency levels to benchmark (presumably one run per value — TODO confirm).
concurrencies: [16, 32, 64, 128, 256]
# NOTE(review): presumably the output sequence length in tokens — confirm.
osl: 150
# Matches the `1000req` prefix of every file listed under `input_files`.
request_count: 1000
warmup_count: 5
port: 8000
# NOTE(review): units are not stated here; presumably seconds — confirm.
timeout: 900
# Destination directory for results.
output_dir: benchmarks/multimodal/sweep/results/vllm_serve

# Environment variables for the run. The value is quoted so it stays the
# string "0" rather than being parsed as an integer.
# NOTE(review): this is declared at the top level, so it presumably applies to
# both the cache-off and cache-on entries under `configs`; confirm that
# ENABLE_ENCODER_CACHE="0" is intended for the cache-on run as well.
env:
  ENABLE_ENCODER_CACHE: "0"

# Request datasets to benchmark.
# Filenames follow the pattern
#   <requests>req_<images>img_<pool>pool_<words>word_<source>.jsonl
# where <source> is `base64` or `http`. Presumably <images> is images per
# request, <pool> the number of distinct images sampled from, and <source> how
# the image is delivered — confirm against the dataset generator.
input_files:
  # base64 images, 200-image pool, 1 / 2 / 4 images.
  - benchmarks/multimodal/jsonl/1000req_1img_200pool_400word_base64.jsonl
  - benchmarks/multimodal/jsonl/1000req_2img_200pool_400word_base64.jsonl
  - benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_base64.jsonl
  # http images, 200-image pool, 2 / 4 images.
  # NOTE(review): there is no 1img_200pool http file, unlike the base64 set —
  # confirm this omission is intentional.
  - benchmarks/multimodal/jsonl/1000req_2img_200pool_400word_http.jsonl
  - benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_http.jsonl
  # http images with the pool scaled as 800 x image count (800 / 1600 / 3200).
  - benchmarks/multimodal/jsonl/1000req_1img_800pool_400word_http.jsonl
  - benchmarks/multimodal/jsonl/1000req_2img_1600pool_400word_http.jsonl
  - benchmarks/multimodal/jsonl/1000req_4img_3200pool_400word_http.jsonl

# Server configurations under comparison. Both entries use the same workflow
# script and both pass --no-enable-prefix-caching; the only difference is the
# value given to --multimodal-embedding-cache-capacity-gb.
configs:
  # Capacity "0", labelled cache-off.
  - label: cache-off
    workflow: benchmarks/multimodal/sweep/workflows/vllm_serve.sh
    extra_args:
      - "--no-enable-prefix-caching"
      - "--multimodal-embedding-cache-capacity-gb"
      - "0"

  # Capacity "10", labelled cache-on.
  - label: cache-on
    workflow: benchmarks/multimodal/sweep/workflows/vllm_serve.sh
    extra_args:
      - "--no-enable-prefix-caching"
      - "--multimodal-embedding-cache-capacity-gb"
      - "10"