Unverified Commit 1630cc8d authored by Chenheli Hua's avatar Chenheli Hua Committed by GitHub
Browse files

[Benchmarks] Add video inputs to ShareGPTDataset. (#23199)


Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
parent 14e2b073
...@@ -32,6 +32,14 @@ become available. ...@@ -32,6 +32,14 @@ become available.
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div> <div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
<code>wget http://images.cocodataset.org/zips/train2017.zip</code> <code>wget http://images.cocodataset.org/zips/train2017.zip</code>
</td> </td>
</tr>
<tr>
<td><strong>ShareGPT4Video (Video)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>
<code>git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video</code>
</td>
</tr> </tr>
<tr> <tr>
<td><strong>BurstGPT</strong></td> <td><strong>BurstGPT</strong></td>
...@@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct ...@@ -231,7 +239,7 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct
```bash ```bash
vllm bench serve \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--endpoint-type openai-chat \ --endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
--dataset-name hf \ --dataset-name hf \
...@@ -246,7 +254,7 @@ vllm bench serve \ ...@@ -246,7 +254,7 @@ vllm bench serve \
```bash ```bash
vllm bench serve \ vllm bench serve \
--backend openai-chat \ --backend openai-chat \
--endpoint-type openai-chat \ --endpoint-type openai-chat \
--model Qwen/Qwen2-VL-7B-Instruct \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint /v1/chat/completions \ --endpoint /v1/chat/completions \
--dataset-name hf \ --dataset-name hf \
...@@ -612,7 +620,7 @@ vllm bench serve \ ...@@ -612,7 +620,7 @@ vllm bench serve \
--prefix-repetition-prefix-len 512 \ --prefix-repetition-prefix-len 512 \
--prefix-repetition-suffix-len 128 \ --prefix-repetition-suffix-len 128 \
--prefix-repetition-num-prefixes 5 \ --prefix-repetition-num-prefixes 5 \
--prefix-repetition-output-len 128 --prefix-repetition-output-len 128
``` ```
</details> </details>
...@@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \ ...@@ -687,4 +695,31 @@ python benchmarks/benchmark_serving.py \
--endpoint /v1/chat/completions --endpoint /v1/chat/completions
``` ```
### Videos (ShareGPT4Video)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"video": 1}' \
--allowed-local-media-path /path/to/sharegpt4video/videos
```
Send requests with videos:
```bash
python benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completions
```
</details> </details>
...@@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]: ...@@ -293,6 +293,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
) )
def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
       containing raw video data. - The bytes are base64-encoded and embedded
       in a "data:video/mp4;base64,..." URL.

    2. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://". - Returns a dictionary with the video URL.

    Args:
        video: Either a dict holding raw video bytes under 'bytes', or a
            string containing a remote URL or a local file path.

    Returns:
        A dict of the form
        ``{"type": "video_url", "video_url": {"url": <url>}}``.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and "bytes" in video:
        video_bytes = video["bytes"]
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        # The MIME type is always declared as video/mp4; the bytes themselves
        # are not inspected.
        return {
            "type": "video_url",
            "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
        }

    if isinstance(video, str):
        # "https://" must be listed explicitly: it does not match the
        # "http://" prefix, so without it a secure remote URL would be
        # mistaken for a local path and turned into "file://https://...".
        video_url = (
            video
            if video.startswith(("http://", "https://", "file://"))
            else f"file://{video}"
        )
        return {"type": "video_url", "video_url": {"url": video_url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data) # Random Dataset Implementation (Synthetic Data)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
...@@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset): ...@@ -451,9 +486,10 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len is not None, skip_min_output_len_check=output_len is not None,
): ):
continue continue
# TODO: Also support ShareGPT4Video.
if image_path := entry.get("image"): if image_path := entry.get("image"):
mm_content = process_image(image_path) mm_content = process_image(image_path)
elif video_path := entry.get("video"):
mm_content = process_video(video_path)
else: else:
mm_content = None mm_content = None
if enable_multimodal_chat: if enable_multimodal_chat:
......
...@@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]: ...@@ -281,7 +281,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
""" """
Process a single image input and return a multimedia content dictionary. Process a single image input and return a multimedia content dictionary.
Supports three input types: Supports the following input types:
1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
containing raw image data. - Loads the bytes as a PIL.Image.Image. containing raw image data. - Loads the bytes as a PIL.Image.Image.
...@@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]: ...@@ -321,6 +321,41 @@ def process_image(image: Any) -> Mapping[str, Any]:
" or str or dictionary with raw image bytes.") " or str or dictionary with raw image bytes.")
def process_video(video: Any) -> Mapping[str, Any]:
    """
    Process a single video input and return a multimedia content dictionary.

    Supports the following input types:

    1. Dictionary with raw video bytes: - Expects a dict with a 'bytes' key
       containing raw video data. - The bytes are base64-encoded and embedded
       in a "data:video/mp4;base64,..." URL.

    2. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://". - Returns a dictionary with the video URL.

    Args:
        video: Either a dict holding raw video bytes under 'bytes', or a
            string containing a remote URL or a local file path.

    Returns:
        A dict of the form
        ``{"type": "video_url", "video_url": {"url": <url>}}``.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(video, dict) and 'bytes' in video:
        video_bytes = video['bytes']
        video_base64 = base64.b64encode(video_bytes).decode("utf-8")
        # The MIME type is always declared as video/mp4; the bytes themselves
        # are not inspected.
        return {
            "type": "video_url",
            "video_url": {
                "url": f"data:video/mp4;base64,{video_base64}"
            },
        }

    if isinstance(video, str):
        # "https://" must be listed explicitly: it does not match the
        # "http://" prefix, so without it a secure remote URL would be
        # mistaken for a local path and turned into "file://https://...".
        video_url = (video if video.startswith(
            ("http://", "https://", "file://")) else f"file://{video}")
        return {"type": "video_url", "video_url": {"url": video_url}}

    raise ValueError(
        f"Invalid video input {video}. Must be a string of local path/remote url, or a dictionary with raw video bytes in the form of `{{'bytes': raw_video_bytes}}`."  # noqa: E501
    )
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data) # Random Dataset Implementation (Synthetic Data)
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
...@@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset): ...@@ -474,9 +509,10 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len skip_min_output_len_check=output_len
is not None): is not None):
continue continue
# TODO: Also support ShareGPT4Video.
if image_path := entry.get("image"): if image_path := entry.get("image"):
mm_content = process_image(image_path) mm_content = process_image(image_path)
elif video_path := entry.get("video"):
mm_content = process_video(video_path)
else: else:
mm_content = None mm_content = None
if enable_multimodal_chat: if enable_multimodal_chat:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment