[Benchmark] Allow oversample request in benchmark dataset (#15170)

Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>

[Benchmark] Allow oversample request in benchmark dataset (#15170)
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
b88be221 · Jennifer Zhao · GitHub · d8c6d7d6 · b88be221 · b88be221
Unverified Commit b88be221 authored Mar 19, 2025 by Jennifer Zhao Committed by GitHub Mar 20, 2025
Show whitespace changes
Inline Side-by-side

Showing with 139 additions and 59 deletions

benchmarks/README.md benchmarks/README.md +54 -3

benchmarks/benchmark_dataset.py benchmarks/benchmark_dataset.py +85 -56

No files found.
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -42,7 +42,7 @@ become available.
    </tr>
    <tr>
      <td><strong>HuggingFace</strong></td>
-      <td style="text-align: center;">✅</td>
+      <td style="text-align: center;">🟡</td>
      <td style="text-align: center;">🟡</td>
      <td>Specify your dataset path on HuggingFace</td>
    </tr>
@@ -60,8 +60,8 @@ become available.
 🚧: to be supported

 🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
-similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
-formats, please consider contributing.
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
+If you need support for other dataset formats, please consider contributing.

 **Note**: VisionArena’s `dataset-name` should be set to `hf`

@@ -139,6 +139,57 @@ python3 vllm/benchmarks/benchmark_serving.py \
  --num-prompts "${NUM_PROMPTS}"
 ```

+### HuggingFaceDataset Examples
+
+Currently, HuggingFaceDataset only supports dataset formats
+similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset
+formats, please consider contributing.
+
+```bash
+# need a model with vision capability here
+vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
+```
+
+**`lmms-lab/LLaVA-OneVision-Data`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
+DATASET_SPLIT='train'
+DATASET_SUBSET='chart2text(cauldron)'
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}" \
+  --hf-subset "${DATASET_SUBSET}"
+```
+
+**`Aeala/ShareGPT_Vicuna_unfiltered`**
+
+```bash
+MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
+NUM_PROMPTS=10
+BACKEND="openai-chat"
+DATASET_NAME="hf"
+DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
+DATASET_SPLIT='train'
+python3 vllm/benchmarks/benchmark_serving.py \
+  --backend "${BACKEND}" \
+  --model "${MODEL_NAME}" \
+  --endpoint "/v1/chat/completions" \
+  --dataset-name "${DATASET_NAME}" \
+  --dataset-path "${DATASET_PATH}" \
+  --hf-split "${DATASET_SPLIT}" \
+  --num-prompts "${NUM_PROMPTS}"
+```
+
 ---
 ## Example - Offline Throughput Benchmark


--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -17,6 +17,7 @@ SampleRequest instances, similar to the approach used in ShareGPT.
 import base64
 import io
 import json
+import logging
 import random
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
@@ -35,6 +36,8 @@ from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer

+logger = logging.getLogger(__name__)
+
 # -----------------------------------------------------------------------------
 # Data Classes
 # -----------------------------------------------------------------------------
@@ -61,9 +64,6 @@ class SampleRequest:
 class BenchmarkDataset(ABC):
    DEFAULT_SEED = 0

-    # num_requests has default 1000 in both the benchmark_serving.py and
-    # benchmark_throughput.py
-
    def __init__(
        self,
        dataset_path: Optional[str] = None,
@@ -90,8 +90,8 @@ class BenchmarkDataset(ABC):
            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
-        This method is used for chat models that expect a specific 
-        conversation format.
+        This method is used for chat models that expect a specific conversation
+        format.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
@@ -175,6 +175,24 @@ class BenchmarkDataset(ABC):
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

+    def maybe_oversample_requests(self, requests: list[SampleRequest],
+                                  num_requests: int) -> None:
+        """
+        Oversamples the list of requests if its size is less than the desired
+        number.
+
+        Args:
+            requests (List[SampleRequest]): The current list of sampled
+                requests.
+            num_requests (int): The target number of requests.
+        """
+        if len(requests) < num_requests:
+            random.seed(self.random_seed)
+            additional = random.choices(requests,
+                                        k=num_requests - len(requests))
+            requests.extend(additional)
+            logger.info("Oversampled requests to reach %d total samples.",
+                        num_requests)
+

 # -----------------------------------------------------------------------------
 # Utility Functions and Global Caches
@@ -276,15 +294,16 @@ class RandomDataset(BenchmarkDataset):
    ) -> None:
        super().__init__(**kwargs)

-    def sample(self,
+    def sample(
+        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
-               **kwargs) -> list[SampleRequest]:
-
+        **kwargs,
+    ) -> list[SampleRequest]:
        vocab_size = tokenizer.vocab_size

        prefix_token_ids = (np.random.randint(
@@ -346,20 +365,24 @@ class ShareGPTDataset(BenchmarkDataset):
        random.seed(self.random_seed)
        random.shuffle(self.data)

-    def sample(self,
+    def sample(
+        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
-               **kwargs) -> list:
+        **kwargs,
+    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
-            prompt, completion = entry["conversations"][0]["value"],\
-                entry["conversations"][1]["value"]
+            prompt, completion = (
+                entry["conversations"][0]["value"],
+                entry["conversations"][1]["value"],
+            )

            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
@@ -383,6 +406,7 @@ class ShareGPTDataset(BenchmarkDataset):
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                ))
+        self.maybe_oversample_requests(samples, num_requests)
        return samples


@@ -415,19 +439,20 @@ class SonnetDataset(BenchmarkDataset):
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()

-    def sample(self,
+    def sample(
+        self,
        tokenizer,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
-               **kwargs) -> list:
+        **kwargs,
+    ) -> list:
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
        avg_len = sum(len(tokens)
-                      for tokens in \
-                        tokenized_lines) / len(tokenized_lines)
+                      for tokens in tokenized_lines) / len(tokenized_lines)

        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
@@ -506,12 +531,14 @@ class BurstGPTDataset(BenchmarkDataset):
        # Convert the dataframe to a list of lists.
        return data.values.tolist()

-    def sample(self,
+    def sample(
+        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
-               **kwargs) -> list[SampleRequest]:
+        **kwargs,
+    ) -> list[SampleRequest]:
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
@@ -544,7 +571,6 @@ class HuggingFaceDataset(BenchmarkDataset):
    Dataset class for processing a HuggingFace dataset with conversation data
    and optional images.
    """
-    DEFAULT_NUM_REQUESTS = 1000

    def __init__(
        self,
@@ -618,6 +644,7 @@ class HuggingFaceDataset(BenchmarkDataset):
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests


@@ -632,7 +659,6 @@ class VisionArenaDataset(HuggingFaceDataset):
    """

    DEFAULT_OUTPUT_LEN = 128
-    DEFAULT_NUM_REQUESTS = 1000
    VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"

    def __init__(
@@ -657,12 +683,14 @@ class VisionArenaDataset(HuggingFaceDataset):
        )
        self.data = dataset.shuffle(seed=self.random_seed)

-    def sample(self,
+    def sample(
+        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
-               **kwargs) -> list:
+        **kwargs,
+    ) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
@@ -685,4 +713,5 @@ class VisionArenaDataset(HuggingFaceDataset):
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
+        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests