Commit fcfc474d authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.3' into v0.8.3-dev

parents bb94d2e5 296c6572
FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
# Some packages in requirements/cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements/cpu.txt \
xformers uvloop==0.20.0
RUN --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
...@@ -85,7 +85,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V ...@@ -85,7 +85,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install (若调试,可使用V
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.8.2; - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.8.3;
## Known Issue ## Known Issue
- -
......
...@@ -15,14 +15,12 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -15,14 +15,12 @@ Easy, fast, and cheap LLM serving for everyone
--- ---
[2025/03] We are collaborating with Ollama to host an [Inference Night](https://lu.ma/vllm-ollama) at Y Combinator in San Francisco on Thursday, March 27, at 6 PM. Discuss all things inference local or data center!
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day) [2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
--- ---
*Latest News* 🔥 *Latest News* 🔥
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing). - [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0). - [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted. - [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
...@@ -103,7 +101,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more. ...@@ -103,7 +101,7 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
## Contributing ## Contributing
We welcome and value any contributions and collaborations. We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
## Sponsors ## Sponsors
...@@ -126,6 +124,7 @@ Compute Resources: ...@@ -126,6 +124,7 @@ Compute Resources:
- Databricks - Databricks
- DeepInfra - DeepInfra
- Google Cloud - Google Cloud
- Intel
- Lambda Lab - Lambda Lab
- Nebius - Nebius
- Novita AI - Novita AI
......
...@@ -41,29 +41,39 @@ become available. ...@@ -41,29 +41,39 @@ become available.
<td><code>synthetic</code></td> <td><code>synthetic</code></td>
</tr> </tr>
<tr> <tr>
<td><strong>HuggingFace</strong></td> <td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;">🟡</td> <td style="text-align: center;"></td>
<td style="text-align: center;">🟡</td> <td style="text-align: center;"></td>
<td>Specify your dataset path on HuggingFace</td> <td><code>lmarena-ai/VisionArena-Chat</code></td>
</tr>
<tr>
<td><strong>HuggingFace-InstructCoder</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>likaixin/InstructCoder</code></td>
</tr>
<tr>
<td><strong>HuggingFace-AIMO</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
</tr> </tr>
<tr> <tr>
<td><strong>VisionArena</strong></td> <td><strong>HuggingFace-Other</strong></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td style="text-align: center;"></td> <td style="text-align: center;"></td>
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td> <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
✅: supported ✅: supported
🚧: to be supported 🟡: Partial support
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats 🚧: to be supported
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`.
If you need support for other dataset formats, please consider contributing.
**Note**: VisionArena’s `dataset-name` should be set to `hf` **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
--- ---
## Example - Online Benchmark ## Example - Online Benchmark
...@@ -71,8 +81,7 @@ If you need support for other dataset formats, please consider contributing. ...@@ -71,8 +81,7 @@ If you need support for other dataset formats, please consider contributing.
First start serving your model First start serving your model
```bash ```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
vllm serve ${MODEL_NAME} --disable-log-requests
``` ```
Then run the benchmarking script Then run the benchmarking script
...@@ -80,12 +89,13 @@ Then run the benchmarking script ...@@ -80,12 +89,13 @@ Then run the benchmarking script
```bash ```bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B" python3 vllm/benchmarks/benchmark_serving.py \
NUM_PROMPTS=10 --backend vllm \
BACKEND="vllm" --model NousResearch/Hermes-3-Llama-3.1-8B \
DATASET_NAME="sharegpt" --endpoint /v1/completions \
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json" --dataset-name sharegpt \
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS} --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 10
``` ```
If successful, you will see the following output If successful, you will see the following output
...@@ -122,88 +132,87 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests ...@@ -122,88 +132,87 @@ vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \ --backend openai-chat \
--model "${MODEL_NAME}" \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint "/v1/chat/completions" \ --endpoint /v1/chat/completions \
--dataset-name "${DATASET_NAME}" \ --dataset-name hf \
--dataset-path "${DATASET_PATH}" \ --dataset-path lmarena-ai/VisionArena-Chat \
--hf-split "${DATASET_SPLIT}" \ --hf-split train \
--num-prompts "${NUM_PROMPTS}" --num-prompts 1000
``` ```
### HuggingFaceDataset Examples ### InstructCoder Benchmark with Speculative Decoding
Currently, HuggingFaceDataset only supports dataset formats ``` bash
similar to `lmms-lab/LLaVA-OneVision-Data` and `Aeala/ShareGPT_Vicuna_unfiltered`. If you need support for other dataset VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
formats, please consider contributing. --speculative-model "[ngram]" \
--ngram_prompt_lookup_min 2 \
--ngram-prompt-lookup-max 5 \
--num_speculative_tokens 5
```
``` bash
python3 benchmarks/benchmark_serving.py \
--model meta-llama/Meta-Llama-3-8B-Instruct \
--dataset-name hf \
--dataset-path likaixin/InstructCoder \
--num-prompts 2048
```
### Other HuggingFaceDataset Examples
```bash ```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
``` ```
**`lmms-lab/LLaVA-OneVision-Data`** **`lmms-lab/LLaVA-OneVision-Data`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmms-lab/LLaVA-OneVision-Data"
DATASET_SPLIT='train'
DATASET_SUBSET='chart2text(cauldron)'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \ --backend openai-chat \
--model "${MODEL_NAME}" \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint "/v1/chat/completions" \ --endpoint /v1/chat/completions \
--dataset-name "${DATASET_NAME}" \ --dataset-name hf \
--dataset-path "${DATASET_PATH}" \ --dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split "${DATASET_SPLIT}" \ --hf-split train \
--num-prompts "${NUM_PROMPTS}" \ --hf-subset "chart2text(cauldron)" \
--hf-subset "${DATASET_SUBSET}" --num-prompts 10
``` ```
**`Aeala/ShareGPT_Vicuna_unfiltered`** **`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash ```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="Aeala/ShareGPT_Vicuna_unfiltered"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \ python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \ --backend openai-chat \
--model "${MODEL_NAME}" \ --model Qwen/Qwen2-VL-7B-Instruct \
--endpoint "/v1/chat/completions" \ --endpoint /v1/chat/completions \
--dataset-name "${DATASET_NAME}" \ --dataset-name hf \
--dataset-path "${DATASET_PATH}" \ --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split "${DATASET_SPLIT}" \ --hf-split train \
--num-prompts "${NUM_PROMPTS}" \ --num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
``` bash
python3 vllm/benchmarks/benchmark_serving.py \
--model Qwen/QwQ-32B \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--num-prompts 10 \
--seed 42
``` ```
--- ---
## Example - Offline Throughput Benchmark ## Example - Offline Throughput Benchmark
```bash ```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \ --model NousResearch/Hermes-3-Llama-3.1-8B \
--dataset-name "${DATASET_NAME}" \ --dataset-name sonnet \
--dataset-path "${DATASET_PATH}" \ --dataset-path vllm/benchmarks/sonnet.txt \
--num-prompts "${NUM_PROMPTS}" --num-prompts 10
``` ```
If successful, you will see the following output If successful, you will see the following output
...@@ -217,19 +226,13 @@ Total num output tokens: 1500 ...@@ -217,19 +226,13 @@ Total num output tokens: 1500
### VisionArena Benchmark for Vision Language Models ### VisionArena Benchmark for Vision Language Models
``` bash ``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \ --model Qwen/Qwen2-VL-7B-Instruct \
--backend "vllm-chat" \ --backend vllm-chat \
--dataset-name "${DATASET_NAME}" \ --dataset-name hf \
--dataset-path "${DATASET_PATH}" \ --dataset-path lmarena-ai/VisionArena-Chat \
--num-prompts "${NUM_PROMPTS}" \ --num-prompts 1000 \
--hf-split "${DATASET_SPLIT}" --hf-split train
``` ```
The `num prompt tokens` now includes image token counts The `num prompt tokens` now includes image token counts
...@@ -240,29 +243,83 @@ Total num prompt tokens: 14527 ...@@ -240,29 +243,83 @@ Total num prompt tokens: 14527
Total num output tokens: 1280 Total num output tokens: 1280
``` ```
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
VLLM_USE_V1=1 \
python3 vllm/benchmarks/benchmark_throughput.py \
--dataset-name=hf \
--dataset-path=likaixin/InstructCoder \
--model=meta-llama/Meta-Llama-3-8B-Instruct \
--input-len=1000 \
--output-len=100 \
--num-prompts=2048 \
--async-engine \
--speculative-model="[ngram]" \
--ngram_prompt_lookup_min=2 \
--ngram-prompt-lookup-max=5 \
--num_speculative_tokens=5
```
```
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path lmms-lab/LLaVA-OneVision-Data \
--hf-split train \
--hf-subset "chart2text(cauldron)" \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
```bash
python3 vllm/benchmarks/benchmark_throughput.py \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
--dataset-name hf \
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
--hf-split train \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
```bash
python3 benchmarks/benchmark_throughput.py \
--model Qwen/QwQ-32B \
--backend vllm \
--dataset-name hf \
--dataset-path AI-MO/aimo-validation-aime \
--hf-split train \
--num-prompts 10
```
### Benchmark with LoRA Adapters ### Benchmark with LoRA Adapters
``` bash ``` bash
# download dataset # download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \ python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \ --model meta-llama/Llama-2-7b-hf \
--backend "${BACKEND}" \ --backend vllm \
--dataset_path "${DATASET_PATH}" \ --dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
--dataset_name "${DATASET_NAME}" \ --dataset_name sharegpt \
--num-prompts "${NUM_PROMPTS}" \ --num-prompts 10 \
--max-loras "${MAX_LORAS}" \ --max-loras 2 \
--max-lora-rank "${MAX_LORA_RANK}" \ --max-lora-rank 8 \
${ENABLE_LORA} \ --enable-lora \
--lora-path "${LORA_PATH}" --lora-path yard1/llama-2-7b-sql-lora-test
``` ```
...@@ -219,7 +219,15 @@ async def async_request_deepspeed_mii( ...@@ -219,7 +219,15 @@ async def async_request_deepspeed_mii(
if response.status == 200: if response.status == 200:
parsed_resp = await response.json() parsed_resp = await response.json()
output.latency = time.perf_counter() - st output.latency = time.perf_counter() - st
output.generated_text = parsed_resp["text"][0] if "choices" in parsed_resp:
output.generated_text = parsed_resp["choices"][0][
"text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
output.error = ("Unexpected response format: "
"neither 'choices' nor 'text' found")
output.success = False
output.success = True output.success = True
else: else:
output.error = response.reason or "" output.error = response.reason or ""
......
...@@ -23,7 +23,8 @@ from abc import ABC, abstractmethod ...@@ -23,7 +23,8 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping from collections.abc import Mapping
from dataclasses import dataclass from dataclasses import dataclass
from functools import cache from functools import cache
from typing import Any, Optional, Union from io import BytesIO
from typing import Any, Callable, Optional, Union
import numpy as np import numpy as np
import pandas as pd import pandas as pd
...@@ -239,21 +240,24 @@ def process_image(image: Any) -> Mapping[str, Any]: ...@@ -239,21 +240,24 @@ def process_image(image: Any) -> Mapping[str, Any]:
""" """
Process a single image input and return a multimedia content dictionary. Process a single image input and return a multimedia content dictionary.
For a PIL.Image.Image input: Supports three input types:
- Converts the image to RGB.
- Saves the image as a JPEG in-memory.
- Encodes the JPEG data as a base64 string.
- Returns a dictionary with the image as a base64 data URL.
For a string input: 1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
- Treats the string as a URL or file path. containing raw image data. - Loads the bytes as a PIL.Image.Image.
- Prepends "file://" if the string doesn't start with "http://" or
"file://". 2. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
- Returns a dictionary with the image URL. a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
a dictionary with the image as a base64 data URL.
3. String input: - Treats the string as a URL or local file path. -
Prepends "file://" if the string doesn't start with "http://" or
"file://". - Returns a dictionary with the image URL.
Raises: Raises:
ValueError: If the input is neither a PIL.Image.Image nor a string. ValueError: If the input is not a supported type.
""" """
if isinstance(image, dict) and 'bytes' in image:
image = Image.open(BytesIO(image['bytes']))
if isinstance(image, Image.Image): if isinstance(image, Image.Image):
image = image.convert("RGB") image = image.convert("RGB")
with io.BytesIO() as image_data: with io.BytesIO() as image_data:
...@@ -272,8 +276,8 @@ def process_image(image: Any) -> Mapping[str, Any]: ...@@ -272,8 +276,8 @@ def process_image(image: Any) -> Mapping[str, Any]:
("http://", "file://")) else f"file://{image}") ("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}} return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError( raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
f"Invalid image input {image}. Must be a PIL.Image.Image or str.") " or str or dictionary with raw image bytes.")
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
...@@ -562,48 +566,47 @@ class BurstGPTDataset(BenchmarkDataset): ...@@ -562,48 +566,47 @@ class BurstGPTDataset(BenchmarkDataset):
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# HuggingFace Dataset Implementation # HuggingFace Dataset Base Implementation
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset): class HuggingFaceDataset(BenchmarkDataset):
""" """Base class for datasets hosted on HuggingFace."""
Dataset class for processing a HuggingFace dataset with conversation data
and optional images. SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()
"""
def __init__( def __init__(
self, self,
dataset_path: str,
dataset_split: str, dataset_split: str,
dataset_subset: Optional[str] = None, dataset_subset: Optional[str] = None,
**kwargs, **kwargs,
) -> None: ) -> None:
super().__init__(**kwargs) super().__init__(dataset_path=dataset_path, **kwargs)
self.dataset_split = dataset_split self.dataset_split = dataset_split
self.dataset_subset = dataset_subset self.dataset_subset = dataset_subset
self.load_data() self.load_data()
def load_data(self) -> None: def load_data(self) -> None:
if not self.dataset_path: """Load data from HuggingFace datasets."""
raise ValueError("dataset_path must be provided for loading data.")
self.data = load_dataset( self.data = load_dataset(
self.dataset_path, self.dataset_path,
name=self.dataset_subset, name=self.dataset_subset,
split=self.dataset_split, split=self.dataset_split,
streaming=True, streaming=True,
) )
if self.data.features is None or "conversations" \ self.data = self.data.shuffle(seed=self.random_seed)
not in self.data.features:
raise ValueError(
"HuggingFaceDataset currently only supports datasets with " # -----------------------------------------------------------------------------
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. " # Conversation Dataset Implementation
"Please consider contributing if you would like to add " # -----------------------------------------------------------------------------
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter( class ConversationDataset(HuggingFaceDataset):
lambda x: len(x["conversations"]) >= 2) """Dataset for conversation data with multimodal support."""
SUPPORTED_DATASET_PATHS = {
'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
}
def sample(self, def sample(self,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
...@@ -611,10 +614,13 @@ class HuggingFaceDataset(BenchmarkDataset): ...@@ -611,10 +614,13 @@ class HuggingFaceDataset(BenchmarkDataset):
output_len: Optional[int] = None, output_len: Optional[int] = None,
enable_multimodal_chat: bool = False, enable_multimodal_chat: bool = False,
**kwargs) -> list: **kwargs) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(
lambda x: len(x["conversations"]) >= 2)
sampled_requests = [] sampled_requests = []
dynamic_output = output_len is None dynamic_output = output_len is None
for item in self.data: for item in filtered_data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
conv = item["conversations"] conv = item["conversations"]
...@@ -659,29 +665,12 @@ class VisionArenaDataset(HuggingFaceDataset): ...@@ -659,29 +665,12 @@ class VisionArenaDataset(HuggingFaceDataset):
""" """
DEFAULT_OUTPUT_LEN = 128 DEFAULT_OUTPUT_LEN = 128
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1" SUPPORTED_DATASET_PATHS = {
"lmarena-ai/VisionArena-Chat":
def __init__( lambda x: x["conversation"][0][0]["content"],
self, "lmarena-ai/vision-arena-bench-v0.1":
**kwargs, lambda x: x["turns"][0][0]["content"]
) -> None: }
super().__init__(**kwargs)
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
if self.dataset_subset is None and self.dataset_split != "train":
raise ValueError("Dataset split must be 'train'.")
self.load_data()
def load_data(self) -> None:
dataset = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample( def sample(
self, self,
...@@ -697,7 +686,11 @@ class VisionArenaDataset(HuggingFaceDataset): ...@@ -697,7 +686,11 @@ class VisionArenaDataset(HuggingFaceDataset):
for item in self.data: for item in self.data:
if len(sampled_requests) >= num_requests: if len(sampled_requests) >= num_requests:
break break
prompt = item["turns"][0][0]["content"] parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
if parser_fn is None:
raise ValueError(
f"Unsupported dataset path: {self.dataset_path}")
prompt = parser_fn(item)
mm_content = process_image(item["images"][0]) mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids) prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat: if enable_multimodal_chat:
...@@ -715,3 +708,96 @@ class VisionArenaDataset(HuggingFaceDataset): ...@@ -715,3 +708,96 @@ class VisionArenaDataset(HuggingFaceDataset):
)) ))
self.maybe_oversample_requests(sampled_requests, num_requests) self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------
class InstructCoderDataset(HuggingFaceDataset):
"""
InstructCoder Dataset.
https://huggingface.co/datasets/likaixin/InstructCoder
InstructCoder is the dataset designed for general code editing. It consists
of 114,239 instruction-input-output triplets, and covers multiple distinct
code editing scenario.
"""
DEFAULT_OUTPUT_LEN = 200 # this is the average default output length
SUPPORTED_DATASET_PATHS = {
"likaixin/InstructCoder",
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = f"{item['instruction']}:\n{item['input']}"
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
# -----------------------------------------------------------------------------
# AIMO Dataset Implementation
# -----------------------------------------------------------------------------
class AIMODataset(HuggingFaceDataset):
"""
Dataset class for processing a AIMO dataset with reasoning questions.
"""
SUPPORTED_DATASET_PATHS = {
"AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
"AI-MO/NuminaMath-CoT"
}
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
**kwargs) -> list:
sampled_requests = []
dynamic_output = output_len is None
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt, completion = item['problem'], item["solution"]
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
completion_len = len(completion_ids)
output_len = completion_len if dynamic_output else output_len
assert isinstance(output_len, int) and output_len > 0
if dynamic_output and not is_valid_sequence(prompt_len,
completion_len,
max_prompt_len=2048,
max_total_len=32000):
continue
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=None,
))
self.maybe_oversample_requests(sampled_requests, num_requests)
return sampled_requests
...@@ -7,9 +7,6 @@ On the server side, run one of the following commands: ...@@ -7,9 +7,6 @@ On the server side, run one of the following commands:
--swap-space 16 \ --swap-space 16 \
--disable-log-requests --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run: On the client side, run:
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend <backend> \ --backend <backend> \
...@@ -52,9 +49,11 @@ try: ...@@ -52,9 +49,11 @@ try:
except ImportError: except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, from benchmark_dataset import (AIMODataset, BurstGPTDataset,
RandomDataset, SampleRequest, ShareGPTDataset, ConversationDataset, HuggingFaceDataset,
SonnetDataset, VisionArenaDataset) InstructCoderDataset, RandomDataset,
SampleRequest, ShareGPTDataset, SonnetDataset,
VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
...@@ -586,19 +585,39 @@ def main(args: argparse.Namespace): ...@@ -586,19 +585,39 @@ def main(args: argparse.Namespace):
return_prompt_formatted=True) return_prompt_formatted=True)
elif args.dataset_name == "hf": elif args.dataset_name == "hf":
# Choose between VisionArenaDataset # all following datasets are implemented from the
# and HuggingFaceDataset based on provided parameters. # HuggingFaceDataset base class
dataset_class = (VisionArenaDataset if args.dataset_path if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
== VisionArenaDataset.VISION_ARENA_DATASET_PATH dataset_class = VisionArenaDataset
and args.hf_subset is None else HuggingFaceDataset) args.hf_split = "train"
args.hf_subset = None
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
dataset_class = InstructCoderDataset
args.hf_split = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
dataset_class = ConversationDataset
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_class = AIMODataset
args.hf_split = "train"
else:
supported_datasets = set([
dataset_name for cls in HuggingFaceDataset.__subclasses__()
for dataset_name in cls.SUPPORTED_DATASET_PATHS
])
raise ValueError(
f"Unsupported dataset path: {args.dataset_path}. "
"Huggingface dataset only supports dataset_path"
f" from one of following: {supported_datasets}. "
"Please consider contributing if you would "
"like to add support for additional dataset formats.")
input_requests = dataset_class( input_requests = dataset_class(
dataset_path=args.dataset_path, dataset_path=args.dataset_path,
dataset_subset=args.hf_subset, dataset_subset=args.hf_subset,
dataset_split=args.hf_split, dataset_split=args.hf_split,
random_seed=args.seed,
).sample( ).sample(
num_requests=args.num_prompts, num_requests=args.num_prompts,
tokenizer=tokenizer, tokenizer=tokenizer,
random_seed=args.seed,
output_len=args.hf_output_len, output_len=args.hf_output_len,
) )
......
...@@ -5,9 +5,6 @@ On the server side, run one of the following commands: ...@@ -5,9 +5,6 @@ On the server side, run one of the following commands:
(vLLM OpenAI API server) (vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests vllm serve <your_model> --disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run: On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \ python benchmarks/benchmark_serving_structured_output.py \
--backend <backend> \ --backend <backend> \
......
...@@ -14,7 +14,8 @@ from typing import Any, Optional, Union ...@@ -14,7 +14,8 @@ from typing import Any, Optional, Union
import numpy as np import numpy as np
import torch import torch
import uvloop import uvloop
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset, from benchmark_dataset import (AIMODataset, BurstGPTDataset,
ConversationDataset, InstructCoderDataset,
RandomDataset, SampleRequest, ShareGPTDataset, RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset) SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
...@@ -347,6 +348,7 @@ def get_requests(args, tokenizer): ...@@ -347,6 +348,7 @@ def get_requests(args, tokenizer):
"input_len": args.input_len, "input_len": args.input_len,
"output_len": args.output_len, "output_len": args.output_len,
} }
if args.dataset_path is None or args.dataset_name == "random": if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len sample_kwargs["prefix_len"] = args.prefix_len
...@@ -364,18 +366,23 @@ def get_requests(args, tokenizer): ...@@ -364,18 +366,23 @@ def get_requests(args, tokenizer):
elif args.dataset_name == "burstgpt": elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf": elif args.dataset_name == "hf":
if args.backend != "vllm-chat": if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
raise ValueError( dataset_cls = VisionArenaDataset
"hf datasets only are supported by vllm-chat backend") common_kwargs['dataset_subset'] = None
# Choose between VisionArenaDataset and HuggingFaceDataset based on common_kwargs['dataset_split'] = "train"
# provided parameters. sample_kwargs["enable_multimodal_chat"] = True
dataset_cls = (VisionArenaDataset if args.dataset_path elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
== VisionArenaDataset.VISION_ARENA_DATASET_PATH dataset_cls = InstructCoderDataset
and args.hf_subset is None else HuggingFaceDataset) common_kwargs['dataset_split'] = "train"
common_kwargs['dataset_subset'] = args.hf_subset elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
common_kwargs['dataset_split'] = args.hf_split dataset_cls = ConversationDataset
sample_kwargs["enable_multimodal_chat"] = True common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
dataset_cls = AIMODataset
common_kwargs['dataset_subset'] = None
common_kwargs['dataset_split'] = "train"
else: else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}") raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values # Remove None values
...@@ -509,9 +516,17 @@ def validate_args(args): ...@@ -509,9 +516,17 @@ def validate_args(args):
warnings.warn("--hf-subset and --hf-split will be ignored \ warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.", since --dataset-name is not 'hf'.",
stacklevel=2) stacklevel=2)
elif args.dataset_name == "hf" and args.backend != "vllm-chat": elif args.dataset_name == "hf":
raise ValueError( if args.dataset_path in (
"When --dataset-name is 'hf', backend must be 'vllm-chat'") VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
| ConversationDataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm-chat", f"{args.dataset_path} needs to use vllm-chat as the backend." #noqa: E501
elif args.dataset_path in (InstructCoderDataset.SUPPORTED_DATASET_PATHS
| AIMODataset.SUPPORTED_DATASET_PATHS):
assert args.backend == "vllm", f"{args.dataset_path} needs to use vllm as the backend." #noqa: E501
else:
raise ValueError(
f"{args.dataset_path} is not supported by hf dataset.")
# --random-range-ratio: only used when dataset_name is 'random' # --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None: if args.dataset_name != 'random' and args.random_range_ratio is not None:
......
# SPDX-License-Identifier: Apache-2.0
import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES_MOE
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8,
fused_experts,
fused_topk)
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = [
"nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite",
"ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m"
]
DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1]
PER_ACT_TOKEN_OPTS = [False]
PER_OUT_CH_OPTS = [False]
def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def bench_run(results: list[benchmark.Measurement], model: str,
num_experts: int, topk: int, per_act_token: bool,
per_out_ch: bool, mkn: tuple[int, int, int]):
label = "Quant Matmul"
sub_label = (
"{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, "
"MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch,
mkn))
print(f"Testing: {sub_label}")
(m, k, n) = mkn
dtype = torch.half
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10
_, a_scale = ops.scaled_fp8_quant(a)
w1_q = torch.empty((num_experts, 2 * n, k),
device="cuda",
dtype=torch.float8_e4m3fn)
w2_q = torch.empty((num_experts, k, n),
device="cuda",
dtype=torch.float8_e4m3fn)
w1_scale = torch.empty((num_experts, 1, 1),
device="cuda",
dtype=torch.float32)
w2_scale = torch.empty((num_experts, 1, 1),
device="cuda",
dtype=torch.float32)
ab_strides1 = torch.full((num_experts, ),
k,
device="cuda",
dtype=torch.int64)
c_strides1 = torch.full((num_experts, ),
2 * n,
device="cuda",
dtype=torch.int64)
ab_strides2 = torch.full((num_experts, ),
n,
device="cuda",
dtype=torch.int64)
c_strides2 = torch.full((num_experts, ),
k,
device="cuda",
dtype=torch.int64)
for expert in range(num_experts):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
w1_q_notransp = w1_q.clone()
w2_q_notransp = w2_q.clone()
w1_q = w1_q.transpose(1, 2)
w2_q = w2_q.transpose(1, 2)
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
w1_scale: torch.Tensor, w2_scale: torch.Tensor,
a_scale: torch.Tensor, num_repeats: int):
for _ in range(num_repeats):
fused_experts(a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale)
def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor,
w1: torch.Tensor, w2: torch.Tensor,
w1_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
ab_strides2: torch.Tensor, c_strides2: torch.Tensor,
num_repeats: int):
for _ in range(num_repeats):
cutlass_moe_fp8(a,
w1,
w2,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale)
def run_cutlass_from_graph(
a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
topk_weights: torch.Tensor, topk_ids: torch.Tensor,
ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(
pipeline_parallel_size=1))):
return cutlass_moe_fp8(a,
w1_q,
w2_q,
w1_scale,
w2_scale,
topk_weights,
topk_ids,
ab_strides1,
c_strides1,
ab_strides2,
c_strides2,
a1_scale=a_scale)
def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor,
w2: torch.Tensor, topk_weights: torch.Tensor,
topk_ids: torch.Tensor, w1_scale: torch.Tensor,
w2_scale: torch.Tensor, a_scale: torch.Tensor):
with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(
pipeline_parallel_size=1))):
return fused_experts(a,
w1,
w2,
topk_weights,
topk_ids,
use_fp8_w8a8=True,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a_scale)
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.cuda.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale,
topk_weights, topk_ids, ab_strides1, c_strides1,
ab_strides2, c_strides2)
torch.cuda.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(triton_graph, stream=triton_stream):
run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights,
topk_ids, w1_scale, w2_scale, a_scale)
torch.cuda.synchronize()
min_run_time = 5
num_warmup = 5
num_runs = 25
globals = {
# Baseline params
"w1": w1,
"w2": w2,
"score": score,
"topk": topk,
"w1_q_notransp": w1_q_notransp,
"w2_q_notransp": w2_q_notransp,
# Cutlass params
"a_scale": a_scale,
"w1_q": w1_q,
"w2_q": w2_q,
"w1_scale": w1_scale,
"w2_scale": w2_scale,
"ab_strides1": ab_strides1,
"c_strides1": c_strides1,
"ab_strides2": ab_strides2,
"c_strides2": c_strides2,
# cuda graph params
"cutlass_graph": cutlass_graph,
"triton_graph": triton_graph,
# Gen params
"a": a,
"topk_weights": topk_weights,
"topk_ids": topk_ids,
"num_runs": num_runs,
# Kernels
"run_triton_moe": run_triton_moe,
"run_cutlass_moe": run_cutlass_moe,
"replay_graph": replay_graph,
}
# Warmup
run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids,
w1_scale, w2_scale, a_scale, num_warmup)
results.append(
benchmark.Timer(
stmt=
"run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="triton_moe",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
replay_graph(triton_graph, num_warmup)
results.append(
benchmark.Timer(
stmt="replay_graph(triton_graph, num_runs)",
globals=globals,
label=label,
sub_label=sub_label,
description="triton_moe_cuda_graphs",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights,
topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2,
num_warmup)
results.append(
benchmark.Timer(
stmt=
"run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="grouped_gemm_moe",
).blocked_autorange(min_run_time=min_run_time))
# Warmup
replay_graph(cutlass_graph, num_warmup)
results.append(
benchmark.Timer(
stmt="replay_graph(cutlass_graph, num_runs)",
globals=globals,
label=label,
sub_label=sub_label,
description="grouped_gemm_moe_cuda_graphs",
).blocked_autorange(min_run_time=min_run_time))
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: list[benchmark.Measurement] = []
for model in args.models:
for tp in args.tp_sizes:
for layer in WEIGHT_SHAPES_MOE[model]:
num_experts = layer[0]
topk = layer[1]
size_k = layer[2]
size_n = layer[3] // tp
if len(args.limit_k) > 0 and size_k not in args.limit_k:
continue
if len(args.limit_n) > 0 and size_n not in args.limit_n:
continue
for per_act_token in PER_ACT_TOKEN_OPTS:
for per_out_ch in PER_OUT_CH_OPTS:
for size_m in DEFAULT_BATCH_SIZES:
mkn = (size_m, size_k, size_n)
bench_run(results, model, num_experts, topk,
per_act_token, per_out_ch, mkn)
compare = benchmark.Compare(results)
compare.print()
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark Marlin across specified models/shapes/batches")
parser.add_argument(
"--models",
nargs="+",
type=str,
default=DEFAULT_MODELS,
choices=WEIGHT_SHAPES_MOE.keys(),
)
parser.add_argument("--tp-sizes",
nargs="+",
type=int,
default=DEFAULT_TP_SIZES)
parser.add_argument("--batch-sizes",
nargs="+",
type=int,
default=DEFAULT_BATCH_SIZES)
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
parser.add_argument("--limit-per-act-token",
nargs="+",
type=int,
default=[])
parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])
args = parser.parse_args()
main(args)
...@@ -31,20 +31,19 @@ class BenchmarkConfig(TypedDict): ...@@ -31,20 +31,19 @@ class BenchmarkConfig(TypedDict):
num_ldmatrixes: Optional[int] num_ldmatrixes: Optional[int]
def benchmark_config( def benchmark_config(config: BenchmarkConfig,
config: BenchmarkConfig, num_tokens: int,
num_tokens: int, num_experts: int,
num_experts: int, shard_intermediate_size: int,
shard_intermediate_size: int, hidden_size: int,
hidden_size: int, topk: int,
topk: int, dtype: torch.dtype,
dtype: torch.dtype, use_fp8_w8a8: bool,
use_fp8_w8a8: bool, use_int8_w8a16: bool,
use_int8_w8a16: bool, num_iters: int = 100,
num_iters: int = 100, block_quant_shape: List[int] = None,
block_quant_shape: List[int] = None, use_deep_gemm: bool = False,
nn_moe: Optional[bool] = False, nn_moe: Optional[bool] = False) -> float:
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype) x = torch.randn(num_tokens, hidden_size, dtype=dtype)
if use_int8_w8a16: if use_int8_w8a16:
...@@ -143,23 +142,43 @@ def benchmark_config( ...@@ -143,23 +142,43 @@ def benchmark_config(
def run(): def run():
from vllm.model_executor.layers.fused_moe import override_config from vllm.model_executor.layers.fused_moe import override_config
with override_config(config): with override_config(config):
fused_moe( if use_deep_gemm:
x, topk_weights, topk_ids = fused_topk(x, input_gating, topk,
w1, False)
w2, return fused_experts(
input_gating, x,
topk, w1,
renormalize=True, w2,
inplace=True, topk_weights,
use_fp8_w8a8=use_fp8_w8a8, topk_ids,
use_int8_w8a16=use_int8_w8a16, inplace=True,
w1_scale=w1_scale, use_fp8_w8a8=use_fp8_w8a8,
w2_scale=w2_scale, w1_scale=w1_scale,
a1_scale=a1_scale, w2_scale=w2_scale,
a2_scale=a2_scale, a1_scale=a1_scale,
block_shape=block_quant_shape, a2_scale=a2_scale,
use_nn_moe=nn_moe, block_shape=block_quant_shape,
) allow_deep_gemm=True,
use_nn_moe=nn_moe,
)
else:
fused_moe(
x,
w1,
w2,
input_gating,
topk,
renormalize=True,
inplace=True,
use_fp8_w8a8=use_fp8_w8a8,
use_int8_w8a16=use_int8_w8a16,
w1_scale=w1_scale,
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
use_nn_moe=nn_moe,
)
# JIT compilation & warmup # JIT compilation & warmup
run() run()
...@@ -405,6 +424,7 @@ class BenchmarkWorker: ...@@ -405,6 +424,7 @@ class BenchmarkWorker:
use_fp8_w8a8: bool, use_fp8_w8a8: bool,
use_int8_w8a16: bool, use_int8_w8a16: bool,
block_quant_shape: List[int] = None, block_quant_shape: List[int] = None,
use_deep_gemm: bool = False,
nn_moe: Optional[bool] = False, nn_moe: Optional[bool] = False,
) -> tuple[dict[str, int], float]: ) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed) current_platform.seed_everything(self.seed)
...@@ -438,7 +458,8 @@ class BenchmarkWorker: ...@@ -438,7 +458,8 @@ class BenchmarkWorker:
use_int8_w8a16, use_int8_w8a16,
num_iters=100, num_iters=100,
block_quant_shape=block_quant_shape, block_quant_shape=block_quant_shape,
nn_moe=nn_moe,) use_deep_gemm=use_deep_gemm,
nn_moe=nn_moe)
return config, kernel_time return config, kernel_time
def tune( def tune(
...@@ -453,6 +474,7 @@ class BenchmarkWorker: ...@@ -453,6 +474,7 @@ class BenchmarkWorker:
use_int8_w8a16: bool, use_int8_w8a16: bool,
search_space: list[dict[str, int]], search_space: list[dict[str, int]],
block_quant_shape: list[int], block_quant_shape: list[int],
use_deep_gemm: bool,
nn_moe: Optional[bool] = False, nn_moe: Optional[bool] = False,
) -> dict[str, int]: ) -> dict[str, int]:
best_config = None best_config = None
...@@ -480,7 +502,8 @@ class BenchmarkWorker: ...@@ -480,7 +502,8 @@ class BenchmarkWorker:
use_int8_w8a16, use_int8_w8a16,
num_iters=20, num_iters=20,
block_quant_shape=block_quant_shape, block_quant_shape=block_quant_shape,
nn_moe=nn_moe,) use_deep_gemm=use_deep_gemm,
nn_moe=nn_moe)
except triton.runtime.autotuner.OutOfResources: except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile. # Some configurations may be invalid and fail to compile.
continue continue
...@@ -588,6 +611,9 @@ def main(args: argparse.Namespace): ...@@ -588,6 +611,9 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size shard_intermediate_size = 2 * intermediate_size // args.tp_size
else: else:
if not hasattr(config, "hidden_size"):
# Support for llama4
config = config.text_config
# Default: Mixtral. # Default: Mixtral.
E = config.num_local_experts E = config.num_local_experts
topk = config.num_experts_per_tok topk = config.num_experts_per_tok
...@@ -607,9 +633,12 @@ def main(args: argparse.Namespace): ...@@ -607,9 +633,12 @@ def main(args: argparse.Namespace):
else: else:
batch_sizes = [args.batch_size] batch_sizes = [args.batch_size]
use_deep_gemm = bool(args.use_deep_gemm)
ray.init(address=None, ray.init(address=None,
ignore_reinit_error=True, ignore_reinit_error=True,
num_gpus=args.num_gpus) num_gpus=args.num_gpus)
num_gpus = int(ray.available_resources()["GPU"]) num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed, i) for i in range(num_gpus)] workers = [BenchmarkWorker.remote(args.seed, i) for i in range(num_gpus)]
...@@ -631,10 +660,10 @@ def main(args: argparse.Namespace): ...@@ -631,10 +660,10 @@ def main(args: argparse.Namespace):
start = time.time() start = time.time()
configs = _distribute( configs = _distribute(
"tune", "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space,
use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape, args.nn_moe) block_quant_shape, use_deep_gemm, args.nn_moe)
for batch_size in batch_sizes]) for batch_size in batch_sizes])
best_configs = { best_configs = {
M: sort_config(config) M: sort_config(config)
for M, config in zip(batch_sizes, configs) for M, config in zip(batch_sizes, configs)
...@@ -648,7 +677,7 @@ def main(args: argparse.Namespace): ...@@ -648,7 +677,7 @@ def main(args: argparse.Namespace):
outputs = _distribute( outputs = _distribute(
"benchmark", "benchmark",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype, [(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, block_quant_shape, args.nn_moe) use_fp8_w8a8, use_int8_w8a16, block_quant_shape, use_deep_gemm, args.nn_moe)
for batch_size in batch_sizes]) for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs): for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
...@@ -670,6 +699,7 @@ if __name__ == "__main__": ...@@ -670,6 +699,7 @@ if __name__ == "__main__":
type=str, type=str,
choices=["auto", "fp8_w8a8", "int8_w8a16"], choices=["auto", "fp8_w8a8", "int8_w8a16"],
default="auto") default="auto")
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument("--seed", type=int, default=0) parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False) parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true") parser.add_argument("--tune", action="store_true")
......
...@@ -7,12 +7,15 @@ from typing import Optional ...@@ -7,12 +7,15 @@ from typing import Optional
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random) create_kv_caches_with_random)
import vllm.envs as envs import vllm.envs as envs
logger = init_logger(__name__)
NUM_BLOCKS = 128 * 1024 NUM_BLOCKS = 128 * 1024
PARTITION_SIZE = 512 PARTITION_SIZE = 512
PARTITION_SIZE_ROCM = 256 PARTITION_SIZE_ROCM = 256
...@@ -269,6 +272,9 @@ def main( ...@@ -269,6 +272,9 @@ def main(
if __name__ == '__main__': if __name__ == '__main__':
logger.warning("This script benchmarks the paged attention kernel. "
"By default this is no longer used in vLLM inference.")
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the paged attention kernel.") description="Benchmark the paged attention kernel.")
parser.add_argument("--version", parser.add_argument("--version",
......
...@@ -75,3 +75,19 @@ WEIGHT_SHAPES = { ...@@ -75,3 +75,19 @@ WEIGHT_SHAPES = {
[7168, 8192], [7168, 8192],
], ],
} }
WEIGHT_SHAPES_MOE = {
"nm-testing/Mixtral-8x7B-Instruct-v0.1": [
[8, 2, 4096, 28672],
[8, 2, 14336, 4096],
],
"nm-testing/deepseekv2-lite": [
[64, 6, 2048, 1408],
],
"ibm-granite/granite-3.0-1b-a400m": [
[32, 8, 1024, 1024],
],
"ibm-granite/granite-3.0-3b-a800m": [
[40, 8, 1024, 1536],
],
}
#!/bin/bash
PORT=8000
MODEL=$1
TOKENS=$2
docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
-v "$PWD/data:/data" \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
--model-id "$MODEL" \
--sharded false \
--max-input-length 1024 \
--max-total-tokens 2048 \
--max-best-of 5 \
--max-concurrent-requests 5000 \
--max-batch-total-tokens "$TOKENS"
...@@ -190,12 +190,14 @@ set(VLLM_EXT_SRC ...@@ -190,12 +190,14 @@ set(VLLM_EXT_SRC
"csrc/cpu/cache.cpp" "csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp" "csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp" "csrc/cpu/layernorm.cpp"
"csrc/cpu/mla_decode.cpp"
"csrc/cpu/pos_encoding.cpp" "csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp") "csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED) if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC set(VLLM_EXT_SRC
"csrc/cpu/quant.cpp" "csrc/cpu/quant.cpp"
"csrc/cpu/shm.cpp"
${VLLM_EXT_SRC}) ${VLLM_EXT_SRC})
endif() endif()
......
...@@ -482,16 +482,28 @@ def get_pip_packages(run_lambda, patterns=None): ...@@ -482,16 +482,28 @@ def get_pip_packages(run_lambda, patterns=None):
if patterns is None: if patterns is None:
patterns = DEFAULT_PIP_PATTERNS patterns = DEFAULT_PIP_PATTERNS
# People generally have `pip` as `pip` or `pip3` def run_with_pip():
# But here it is invoked as `python -mpip` try:
def run_with_pip(pip): import importlib.util
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"]) pip_spec = importlib.util.find_spec('pip')
pip_available = pip_spec is not None
except ImportError:
pip_available = False
if pip_available:
cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
elif os.environ.get("UV") is not None:
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
raise RuntimeError("Could not collect pip list output (pip or uv module not available)")
out = run_and_read_all(run_lambda, cmd)
return "\n".join(line for line in out.splitlines() return "\n".join(line for line in out.splitlines()
if any(name in line for name in patterns)) if any(name in line for name in patterns))
pip_version = 'pip3' if sys.version[0] == '3' else 'pip' pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
out = run_with_pip([sys.executable, '-mpip']) out = run_with_pip()
return pip_version, out return pip_version, out
......
...@@ -88,6 +88,48 @@ void reshape_and_cache_cpu_impl( ...@@ -88,6 +88,48 @@ void reshape_and_cache_cpu_impl(
} }
}; // namespace }; // namespace
template <typename scalar_t>
void concat_and_cache_mla_cpu_impl(
const scalar_t* __restrict__ kv_c, // [num_tokens, kv_lora_rank]
const scalar_t* __restrict__ k_pe, // [num_tokens, pe_dim]
scalar_t* __restrict__ kv_cache, // [num_blocks, block_size, (kv_lora_rank
// + pe_dim)]
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int num_tokens, //
const int block_stride, //
const int entry_stride, //
const int kv_c_stride, //
const int k_pe_stride, //
const int kv_lora_rank, //
const int pe_dim, //
const int block_size //
) {
#pragma omp parallel for
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
const int64_t slot_idx = slot_mapping[token_idx];
// NOTE: slot_idx can be -1 if the token is padded
if (slot_idx < 0) {
continue;
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
auto copy = [&](const scalar_t* __restrict__ src,
scalar_t* __restrict__ dst, int src_stride, int dst_stride,
int size, int offset) {
for (int i = 0; i < size; i++) {
const int64_t src_idx = token_idx * src_stride + i;
const int64_t dst_idx =
block_idx * block_stride + block_offset * entry_stride + i + offset;
dst[dst_idx] = src[src_idx];
}
};
copy(kv_c, kv_cache, kv_c_stride, block_stride, kv_lora_rank, 0);
copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank);
}
}
// Note: the key_caches and value_caches vectors are constant but // Note: the key_caches and value_caches vectors are constant but
// not the Tensors they contain. The vectors need to be const refs // not the Tensors they contain. The vectors need to be const refs
// in order to satisfy pytorch's C++ operator registration code. // in order to satisfy pytorch's C++ operator registration code.
...@@ -134,6 +176,38 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, ...@@ -134,6 +176,38 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
}); });
} }
void concat_and_cache_mla(
torch::Tensor& kv_c, // [num_tokens, kv_lora_rank]
torch::Tensor& k_pe, // [num_tokens, pe_dim]
torch::Tensor& kv_cache, // [num_blocks, block_size, (kv_lora_rank +
// pe_dim)]
torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
const std::string& kv_cache_dtype, torch::Tensor& scale) {
int num_tokens = slot_mapping.size(0);
int kv_lora_rank = kv_c.size(1);
int pe_dim = k_pe.size(1);
int block_size = kv_cache.size(1);
TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim);
TORCH_CHECK(kv_cache_dtype != "fp8");
int kv_c_stride = kv_c.stride(0);
int k_pe_stride = k_pe.stride(0);
int block_stride = kv_cache.stride(0);
int entry_stride = kv_cache.stride(1);
VLLM_DISPATCH_FLOATING_TYPES(
kv_c.scalar_type(), "concat_and_cache_mla_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(concat_and_cache_mla_cpu_impl)
concat_and_cache_mla_cpu_impl<scalar_t>(
kv_c.data_ptr<scalar_t>(), k_pe.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), slot_mapping.data_ptr<int64_t>(),
num_tokens, block_stride, entry_stride, kv_c_stride, k_pe_stride,
kv_lora_rank, pe_dim, block_size);
CPU_KERNEL_GUARD_OUT(concat_and_cache_mla_cpu_impl)
});
}
void swap_blocks(torch::Tensor& src, torch::Tensor& dst, void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
const torch::Tensor& block_mapping) { const torch::Tensor& block_mapping) {
TORCH_CHECK(false, "swap_blocks is unsupported on CPU.") TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
......
...@@ -78,9 +78,14 @@ struct FP16Vec16 : public Vec<FP16Vec16> { ...@@ -78,9 +78,14 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
__m256i reg; __m256i reg;
// normal load
explicit FP16Vec16(const void* ptr) explicit FP16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit FP16Vec16(const FP32Vec16&); explicit FP16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
...@@ -110,9 +115,14 @@ struct BF16Vec16 : public Vec<BF16Vec16> { ...@@ -110,9 +115,14 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
__m256i reg; __m256i reg;
// normal load
explicit BF16Vec16(const void* ptr) explicit BF16Vec16(const void* ptr)
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
// non-temproal load
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
explicit BF16Vec16(const FP32Vec16&); explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; }
...@@ -130,6 +140,8 @@ struct BF16Vec32 : public Vec<BF16Vec32> { ...@@ -130,6 +140,8 @@ struct BF16Vec32 : public Vec<BF16Vec32> {
__m512i reg; __m512i reg;
explicit BF16Vec32() : reg(_mm512_setzero_si512()) {}
explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
explicit BF16Vec32(__m512i data) : reg(data) {} explicit BF16Vec32(__m512i data) : reg(data) {}
...@@ -311,8 +323,13 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -311,8 +323,13 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}
// normal load
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
// non-temproal load
explicit FP32Vec16(bool, void* ptr)
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
explicit FP32Vec16(__m512 data) : reg(data) {} explicit FP32Vec16(__m512 data) : reg(data) {}
explicit FP32Vec16(const FP32Vec4& data) explicit FP32Vec16(const FP32Vec4& data)
...@@ -545,6 +562,33 @@ struct INT8Vec16 : public Vec<INT8Vec16> { ...@@ -545,6 +562,33 @@ struct INT8Vec16 : public Vec<INT8Vec16> {
_mm_mask_storeu_epi8(ptr, mask, reg); _mm_mask_storeu_epi8(ptr, mask, reg);
} }
}; };
struct INT8Vec64 : public Vec<INT8Vec64> {
constexpr static int VEC_ELEM_NUM = 64;
union AliasReg {
__m512i reg;
int8_t values[VEC_ELEM_NUM];
};
__m512i reg;
// normal load
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
// non-temproal load
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
void save(int8_t* ptr, const int elem_num) const {
constexpr uint64_t M = 0xFFFFFFFFFFFFFFFF;
__mmask64 mask = _cvtu64_mask64(M >> (64 - elem_num));
_mm512_mask_storeu_epi8(ptr, mask, reg);
}
// non-temproal save
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
};
#endif #endif
template <typename T> template <typename T>
...@@ -655,6 +699,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { ...@@ -655,6 +699,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); }
#ifdef __AVX512F__
inline void non_temporal_save(FP16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec32& vec, void* ptr) {
_mm512_stream_si512((__m512i*)ptr, vec.reg);
}
inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
_mm256_stream_si256((__m256i*)ptr, vec.reg);
}
inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
_mm512_stream_ps((float*)ptr, vec.reg);
}
#endif
inline void mem_barrier() { _mm_mfence(); }
}; // namespace vec_op }; // namespace vec_op
#endif #endif
#include "cpu_types.hpp"
#include <float.h>
namespace {
template <typename scalar_t>
struct KernelVecType {
using qk_load_vec_type = void;
using qk_vec_type = void;
using v_load_vec_type = void;
};
template <>
struct KernelVecType<float> {
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::Half> {
#if defined(__powerpc64__) || defined(__s390x__)
// Power and s390x architecture-specific vector types
using qk_load_vec_type = vec_op::FP32Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;
#else
// Fallback for other architectures, including x86
using qk_load_vec_type = vec_op::FP16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP16Vec16;
#endif
};
#ifdef __AVX512BF16__
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec32;
using qk_vec_type = vec_op::BF16Vec32;
using v_load_vec_type = vec_op::BF16Vec16;
};
#elif defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
// pass
#else
template <>
struct KernelVecType<c10::BFloat16> {
using qk_load_vec_type = vec_op::BF16Vec16;
using qk_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::BF16Vec16;
};
#endif
template <int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE, int HEAD_UNROLL,
typename qk_vec_type>
void mla_decode_block_head(
const qk_vec_type* __restrict__ q_vecs, // [HEAD_UNROLL, head_dim]
const qk_vec_type* __restrict__ k_vecs, // [block_size, head_dim]
const vec_op::FP32Vec16* __restrict v_vecs_f32, // [block_size, v_head_dim]
float* __restrict__ acc_out, // [HEAD_UNROLL, v_head_dim]
float* __restrict__ acc_lse, // [HEAD_UNROLL]
const float scale, const int num_tokens) {
using f32_vec_type = vec_op::FP32Vec16;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = f32_vec_type::VEC_ELEM_NUM;
float logits[BLOCK_SIZE][HEAD_UNROLL] = {}; // initialize to zeros
float max_val[HEAD_UNROLL];
std::fill(max_val, max_val + HEAD_UNROLL, -FLT_MAX);
f32_vec_type acc_vec[BLOCK_SIZE][HEAD_UNROLL];
for (int i = 0; i < HEAD_DIM; i += QK_NUM_ELEM) {
// load to registers
qk_vec_type q_vec[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
q_vec[unroll] =
qk_vec_type{q_vecs[(i + unroll * HEAD_DIM) / QK_NUM_ELEM]};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
qk_vec_type k_vec(k_vecs[(block_offset * HEAD_DIM + i) / QK_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(acc_vec[block_offset][unroll], q_vec[unroll], k_vec);
}
}
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float acc = acc_vec[block_offset][unroll].reduce_sum() * scale;
logits[block_offset][unroll] = acc;
max_val[unroll] = std::max(max_val[unroll], acc);
}
}
float sum_exp[HEAD_UNROLL] = {};
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float val =
std::exp(logits[block_offset][unroll] - max_val[unroll]);
logits[block_offset][unroll] = val;
sum_exp[unroll] += val;
}
}
f32_vec_type this_out[V_HEAD_DIM / V_NUM_ELEM][HEAD_UNROLL];
for (int block_offset = 0; block_offset < num_tokens; ++block_offset) {
// load to registers
f32_vec_type scale_[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
scale_[unroll] =
f32_vec_type{logits[block_offset][unroll] / sum_exp[unroll]};
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
f32_vec_type v_vec(
v_vecs_f32[(block_offset * HEAD_DIM + i) / V_NUM_ELEM]);
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll)
vec_op::fma(this_out[i / V_NUM_ELEM][unroll], v_vec, scale_[unroll]);
}
}
// merge attention state
// section 2.2 in https://arxiv.org/pdf/2501.01005
f32_vec_type prev_scale[HEAD_UNROLL];
f32_vec_type curr_scale[HEAD_UNROLL];
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
const float prev_lse = acc_lse[unroll];
const float curr_lse = std::log(sum_exp[unroll]) +
max_val[unroll]; // add back max_val to get true lse
// softmax trick
const float max_lse = std::max(prev_lse, curr_lse);
const float prev_sum_exp = std::exp(prev_lse - max_lse);
const float curr_sum_exp = std::exp(curr_lse - max_lse);
const float new_sum_exp = prev_sum_exp + curr_sum_exp;
acc_lse[unroll] = std::log(new_sum_exp) + max_lse;
prev_scale[unroll] = f32_vec_type{prev_sum_exp / new_sum_exp};
curr_scale[unroll] = f32_vec_type{curr_sum_exp / new_sum_exp};
}
for (int i = 0; i < V_HEAD_DIM; i += V_NUM_ELEM) {
#pragma unroll
for (int unroll = 0; unroll < HEAD_UNROLL; ++unroll) {
f32_vec_type o_vec(acc_out + i + V_HEAD_DIM * unroll);
o_vec = o_vec * prev_scale[unroll] +
this_out[i / V_NUM_ELEM][unroll] * curr_scale[unroll];
o_vec.save(acc_out + i + V_HEAD_DIM * unroll);
}
}
q_vecs += HEAD_DIM / QK_NUM_ELEM * HEAD_UNROLL;
acc_out += V_HEAD_DIM * HEAD_UNROLL;
}
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE,
typename qk_vec_type>
void mla_decode_block(
const qk_vec_type* __restrict__ q_vecs, // [num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [block_size, head_dim]
float* __restrict__ acc_out, // [num_heads, v_head_dim]
float* __restrict__ acc_lse, // [num_heads]
const int num_heads, const float scale, const int num_tokens) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
static_assert(
std::is_same<qk_vec_type,
typename KernelVecType<scalar_t>::qk_vec_type>::value);
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
using f32_vec_type = vec_op::FP32Vec16;
static_assert(qk_load_vec_type::VEC_ELEM_NUM == qk_vec_type::VEC_ELEM_NUM);
static_assert(v_load_vec_type::VEC_ELEM_NUM == f32_vec_type::VEC_ELEM_NUM);
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
constexpr int V_NUM_ELEM = v_load_vec_type::VEC_ELEM_NUM;
const qk_vec_type* k_vecs;
const f32_vec_type* v_vecs_f32;
float* kv_cache_f32 = nullptr;
if constexpr (!std::is_same<scalar_t, float>::value) {
// convert KV cache block to FP32 to reuse it across query heads and
// attn @ V computation, since FP16/BF16->FP32 is expensive.
// TODO: move malloc outside of this fn to reuse across iterations.
const int nbytes = BLOCK_SIZE * HEAD_DIM * sizeof(float);
kv_cache_f32 = static_cast<float*>(std::aligned_alloc(64, nbytes));
for (int block_offset = 0; block_offset < num_tokens; ++block_offset)
for (int i = 0; i < HEAD_DIM; i += V_NUM_ELEM) {
v_load_vec_type kv_load_vec(kv_cache + block_offset * HEAD_DIM + i);
f32_vec_type kv_vec_f32(kv_load_vec);
kv_vec_f32.save(kv_cache_f32 + block_offset * HEAD_DIM + i);
}
if constexpr (std::is_same<qk_load_vec_type, qk_vec_type>::value) {
// for AVX512_BF16, Q @ K.T uses BF16 for K (no conversion)
// NOTE: in this case, we only need to convert the V section to FP32.
// But for simplicity, we will convert the whole KV block to FP32.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
} else {
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache_f32);
}
// attn @ V always use FP32 for V, since attn is FP32.
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache_f32);
} else {
// KV cache is FP32. don't need to do anything.
k_vecs = reinterpret_cast<const qk_vec_type*>(kv_cache);
v_vecs_f32 = reinterpret_cast<const f32_vec_type*>(kv_cache);
}
// compute 2 heads at the same time to improve ILP and
// take advantage of register cache for K and V.
constexpr int HEAD_UNROLL = 2;
for (int iter = 0; iter < num_heads / HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, HEAD_UNROLL>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_UNROLL * HEAD_DIM / QK_NUM_ELEM;
acc_out += HEAD_UNROLL * V_HEAD_DIM;
acc_lse += HEAD_UNROLL;
}
// take care of the remaining heads
for (int iter = 0; iter < num_heads % HEAD_UNROLL; ++iter) {
mla_decode_block_head<HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE, 1>(
q_vecs, k_vecs, v_vecs_f32, acc_out, acc_lse, scale, num_tokens);
q_vecs += HEAD_DIM / QK_NUM_ELEM;
acc_out += V_HEAD_DIM;
acc_lse += 1;
}
if (kv_cache_f32 != nullptr) {
std::free(kv_cache_f32);
}
}
} // namespace
template <typename scalar_t, int HEAD_DIM, int V_HEAD_DIM, int BLOCK_SIZE>
void mla_decode_kvcache_cpu_impl(
scalar_t* __restrict__ out, // [num_seqs, num_heads, v_head_dim]
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_dim]
const scalar_t* __restrict__ kv_cache, // [num_blocks, block_size,
// head_dim]
const int num_heads, const float scale,
const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
const int* __restrict__ seq_lens, // [num_seqs]
const int max_num_blocks_per_seq, const int o_stride, const int q_stride,
const int kv_stride, const int num_seqs) {
using qk_load_vec_type = typename KernelVecType<scalar_t>::qk_load_vec_type;
using qk_vec_type = typename KernelVecType<scalar_t>::qk_vec_type;
constexpr int QK_NUM_ELEM = qk_vec_type::VEC_ELEM_NUM;
// shared across threads
const int max_threads = omp_get_max_threads();
const int acc_out_nbytes =
max_threads * num_heads * V_HEAD_DIM * sizeof(float);
float* acc_out = static_cast<float*>(std::aligned_alloc(64, acc_out_nbytes));
std::vector<float> acc_lse(max_threads * num_heads);
// allocate memory to pre-convert query to FP32 later
float* q_f32;
constexpr bool PRE_CONVERT_QUERY =
!std::is_same<scalar_t, float>::value &&
std::is_same<qk_vec_type, vec_op::FP32Vec16>::value;
if constexpr (PRE_CONVERT_QUERY) {
const int q_f32_nbytes = num_heads * HEAD_DIM * sizeof(float);
q_f32 = static_cast<float*>(std::aligned_alloc(64, q_f32_nbytes));
}
#pragma omp parallel
{
const int num_threads = omp_get_num_threads();
const int thread_id = omp_get_thread_num();
float* __restrict__ acc_out_thread =
acc_out + thread_id * num_heads * V_HEAD_DIM;
float* __restrict__ acc_lse_thread = acc_lse.data() + thread_id * num_heads;
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
// reset accumulator
std::fill(acc_out_thread, acc_out_thread + num_heads * V_HEAD_DIM, 0.0f);
std::fill(acc_lse_thread, acc_lse_thread + num_heads, -FLT_MAX);
const int seq_len = seq_lens[seq_idx];
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
const int last_block_size = seq_len - (block_num - 1) * BLOCK_SIZE;
const qk_vec_type* q_vecs;
if constexpr (PRE_CONVERT_QUERY) {
// pre-convert query to FP32 since FP16/BF16->FP32 is slow.
#pragma omp for
for (int i = 0; i < num_heads * HEAD_DIM; i += QK_NUM_ELEM) {
qk_load_vec_type q_load_vec(q + seq_idx * q_stride + i);
qk_vec_type q_vec(q_load_vec);
q_vec.save(q_f32 + i);
}
q_vecs = reinterpret_cast<const qk_vec_type*>(q_f32);
} else {
q_vecs = reinterpret_cast<const qk_vec_type*>(q + seq_idx * q_stride);
}
#pragma omp for
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
const int physical_block_idx =
block_tables[seq_idx * max_num_blocks_per_seq + block_idx];
const int num_tokens =
block_idx < block_num - 1 ? BLOCK_SIZE : last_block_size;
mla_decode_block<scalar_t, HEAD_DIM, V_HEAD_DIM, BLOCK_SIZE>(
q_vecs, kv_cache + physical_block_idx * kv_stride, acc_out_thread,
acc_lse_thread, num_heads, scale, num_tokens);
}
// merge attention states across threads
// section 2.2 in https://arxiv.org/pdf/2501.01005
// each thread is responsible for 1 head
#pragma omp for
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
float* acc_lse_head = acc_lse.data() + head_idx;
float* acc_out_head = acc_out + head_idx * V_HEAD_DIM;
float max_val = -FLT_MAX;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
max_val = std::max(max_val, acc_lse_head[thread_id_ * num_heads]);
}
float sum_exp = 0.0f;
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float val = std::exp(acc_lse_head[thread_id_ * num_heads] - max_val);
acc_lse_head[thread_id_ * num_heads] = val;
sum_exp += val;
}
float inv_sum = 1.0f / sum_exp;
float out_head[V_HEAD_DIM] = {};
for (int thread_id_ = 0; thread_id_ < num_threads; ++thread_id_) {
float scale_ = acc_lse_head[thread_id_ * num_heads] * inv_sum;
for (int i = 0; i < V_HEAD_DIM; ++i) {
out_head[i] +=
acc_out_head[thread_id_ * num_heads * V_HEAD_DIM + i] * scale_;
}
}
for (int i = 0; i < V_HEAD_DIM; ++i) {
vec_op::storeFP32(out_head[i], out + seq_idx * o_stride +
head_idx * V_HEAD_DIM + i);
}
}
}
}
if (PRE_CONVERT_QUERY) {
std::free(q_f32);
}
std::free(acc_out);
}
void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query,
torch::Tensor& kv_cache, double scale,
torch::Tensor& block_tables, torch::Tensor& seq_lens) {
const int num_seqs = query.size(0);
const int num_heads = query.size(1);
const int head_dim = query.size(2);
const int block_size = kv_cache.size(1);
const int v_head_dim = out.size(2);
const int max_num_blocks_per_seq = block_tables.size(1);
const int o_stride = out.stride(0);
const int q_stride = query.stride(0);
const int kv_stride = kv_cache.stride(0);
VLLM_DISPATCH_FLOATING_TYPES(
query.scalar_type(), "mla_decode_kvcache_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(mla_decode_kvcache_cpu_impl)
if (head_dim == 576 && v_head_dim == 512 && block_size == 16)
mla_decode_kvcache_cpu_impl<scalar_t, 576, 512, 16>(
out.data_ptr<scalar_t>(), query.data_ptr<scalar_t>(),
kv_cache.data_ptr<scalar_t>(), num_heads, scale,
block_tables.data_ptr<int>(), seq_lens.data_ptr<int>(),
max_num_blocks_per_seq, o_stride, q_stride, kv_stride, num_seqs);
else
TORCH_CHECK(false, "Unsupported block size: ", block_size);
CPU_KERNEL_GUARD_OUT(mla_decode_kvcache_cpu_impl)
});
}
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment