Commit e7c1b7f3 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.5.4-dtk24.04.1'

parents 7462218e 04c62b93
ARG NIGHTLY_DATE="20240601" ARG NIGHTLY_DATE="20240726"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE FROM $BASE_IMAGE
WORKDIR /workspace WORKDIR /workspace
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
# Install aiohttp separately to avoid build errors. # Install aiohttp separately to avoid build errors.
RUN pip install aiohttp RUN pip install aiohttp
# Install NumPy 1 instead of NumPy 2.
RUN pip install "numpy<2"
# Install the TPU and Pallas dependencies. # Install the TPU and Pallas dependencies.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
# Fix FastAPI dependence
RUN pip install "starlette<0.38.0"
# Build vLLM. # Build vLLM.
COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python setup.py develop RUN cd /workspace/vllm && python setup.py develop
CMD ["/bin/bash"] CMD ["/bin/bash"]
# Intel XPU (GPU) build image for vLLM, based on the Intel oneAPI Base Toolkit.
# NOTE(review): the base tag says ubuntu20.04 while the graphics repository below
# targets "jammy" -- confirm the two are meant to be combined.
FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04

# Make a RUN step fail when any stage of a pipeline fails (for example the
# download half of "wget | gpg"), not only when the last command fails.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Register the Intel oneAPI and Intel graphics apt repositories together with
# their signing keys, replacing the pre-existing intel-graphics.list entry.
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
    rm /etc/apt/sources.list.d/intel-graphics.list && \
    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
    chmod 644 /usr/share/keyrings/intel-graphics.gpg

# Install OS-level tooling; the apt package lists are removed in the same layer
# so they do not add to the image size.
RUN apt-get update -y \
    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip \
    && rm -rf /var/lib/apt/lists/*

# Copy the vLLM sources from the build context and build from that directory.
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm

# Install the XPU Python requirements without keeping pip's download cache.
RUN pip install --no-cache-dir -v -r requirements-xpu.txt

# Build and install vLLM for the XPU target.
RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install

CMD ["/bin/bash"]
include LICENSE include LICENSE
include requirements-adag.txt
include requirements-common.txt include requirements-common.txt
include requirements-cuda.txt include requirements-cuda.txt
include requirements-rocm.txt include requirements-rocm.txt
......
...@@ -4,7 +4,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention ...@@ -4,7 +4,7 @@ vLLM是一个快速且易于使用的LLM推理和服务库,使用PageAttention
## 暂不支持的官方功能 ## 暂不支持的官方功能
- **量化推理**:目前支持fp16的推理和gptq,awq-int4推理,marlin的权重量化、kv-cache fp8推理方案暂不支持 - **量化推理**:目前支持fp16的推理和gptq,awq-int4推理,marlin的权重量化、kv-cache fp8推理方案暂不支持
- **模块支持**:目前不支持Sliding window attention、 moe kernel和lora模块 - **模块支持**:目前不支持Sliding window attention、 moe kernel模块
## 支持模型结构列表 ## 支持模型结构列表
...@@ -82,7 +82,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install ...@@ -82,7 +82,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.5.0 - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.5.4
## Known Issue ## Known Issue
- -
......
...@@ -16,33 +16,14 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -16,33 +16,14 @@ Easy, fast, and cheap LLM serving for everyone
--- ---
**Ray Summit CFP is Open (June 4th to June 20th)!**
There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
This will be a great chance for everyone in the community to get together and learn.
Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
We are thrilled to announce our fourth vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://lu.ma/agivllm) and join us!
---
*Latest News* 🔥 *Latest News* 🔥
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
- [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html).
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM. - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/12] Added ROCm 5.7 support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
--- ---
...@@ -58,14 +39,16 @@ vLLM is fast with: ...@@ -58,14 +39,16 @@ vLLM is fast with:
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels - Optimized CUDA kernels
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
vLLM is flexible and easy to use with: vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models - Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
- Tensor parallelism support for distributed inference - Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs - Streaming outputs
- OpenAI-compatible API server - OpenAI-compatible API server
- Support NVIDIA GPUs and AMD GPUs - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
- (Experimental) Prefix caching support - (Experimental) Prefix caching support
- (Experimental) Multi-lora support - (Experimental) Multi-lora support
...@@ -109,6 +92,7 @@ vLLM is a community project. Our compute resources for development and testing a ...@@ -109,6 +92,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Databricks - Databricks
- DeepInfra - DeepInfra
- Dropbox - Dropbox
- Google Cloud
- Lambda Lab - Lambda Lab
- NVIDIA - NVIDIA
- Replicate - Replicate
...@@ -118,6 +102,7 @@ vLLM is a community project. Our compute resources for development and testing a ...@@ -118,6 +102,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Trainy - Trainy
- UC Berkeley - UC Berkeley
- UC San Diego - UC San Diego
- ZhenFund
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM. We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
......
...@@ -4,10 +4,13 @@ import sys ...@@ -4,10 +4,13 @@ import sys
import time import time
import traceback import traceback
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import List, Optional from typing import List, Optional, Union
import aiohttp import aiohttp
import huggingface_hub.constants
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
...@@ -222,8 +225,8 @@ async def async_request_openai_completions( ...@@ -222,8 +225,8 @@ async def async_request_openai_completions(
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith( assert api_url.endswith(
"v1/completions" "completions"
), "OpenAI Completions API URL must end with 'v1/completions'." ), "OpenAI Completions API URL must end with 'completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search assert not request_func_input.use_beam_search
...@@ -262,6 +265,9 @@ async def async_request_openai_completions( ...@@ -262,6 +265,9 @@ async def async_request_openai_completions(
else: else:
data = json.loads(chunk) data = json.loads(chunk)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if data["choices"][0]["text"]: if data["choices"][0]["text"]:
timestamp = time.perf_counter() timestamp = time.perf_counter()
# First token # First token
...@@ -270,12 +276,8 @@ async def async_request_openai_completions( ...@@ -270,12 +276,8 @@ async def async_request_openai_completions(
output.ttft = ttft output.ttft = ttft
# Decoding phase # Decoding phase
# NOTE: Some completion API might have a last output.itl.append(timestamp -
# usage summary response without a token so we most_recent_timestamp)
# do not want to include as inter-token-latency
elif data.get("usage", None) is None:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"] generated_text += data["choices"][0]["text"]
...@@ -302,8 +304,8 @@ async def async_request_openai_chat_completions( ...@@ -302,8 +304,8 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith( assert api_url.endswith(
"v1/chat/completions" "chat/completions"
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'." ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search assert not request_func_input.use_beam_search
...@@ -388,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str: ...@@ -388,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str:
return text return text
def get_model(pretrained_model_name_or_path: str) -> str:
    """Resolve a model identifier to the location it should be loaded from.

    When the ``VLLM_USE_MODELSCOPE`` environment variable equals ``"true"``
    (compared case-insensitively), the model is fetched through ModelScope's
    ``snapshot_download`` and the path it returns is used. In every other
    case the identifier is returned unchanged.

    Args:
        pretrained_model_name_or_path: Model identifier to resolve.

    Returns:
        The path returned by ``snapshot_download`` when ModelScope is
        enabled, otherwise ``pretrained_model_name_or_path`` as given.
    """
    if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
        # Imported inside the branch, so modelscope is only needed when the
        # environment variable opts in.
        from modelscope import snapshot_download

        model_path = snapshot_download(
            model_id=pretrained_model_name_or_path,
            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
            # Patterns for weight-file extensions are passed as ignore
            # patterns.
            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
        return model_path
    return pretrained_model_name_or_path
def get_tokenizer(
    pretrained_model_name_or_path: str, trust_remote_code: bool
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    """Load a tokenizer through ``AutoTokenizer.from_pretrained``.

    If the given name is not ``None`` and does not exist as a local path, it
    is first passed through ``get_model`` and the result is used instead.

    Args:
        pretrained_model_name_or_path: Local path or model identifier.
        trust_remote_code: Forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for the resolved
        name.
    """
    if pretrained_model_name_or_path is not None and not os.path.exists(
            pretrained_model_name_or_path):
        # Not a local path: let get_model resolve the identifier.
        pretrained_model_name_or_path = get_model(
            pretrained_model_name_or_path)
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
                                         trust_remote_code=trust_remote_code)
ASYNC_REQUEST_FUNCS = { ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi, "tgi": async_request_tgi,
"vllm": async_request_openai_completions, "vllm": async_request_openai_completions,
...@@ -396,4 +422,5 @@ ASYNC_REQUEST_FUNCS = { ...@@ -396,4 +422,5 @@ ASYNC_REQUEST_FUNCS = {
"openai": async_request_openai_completions, "openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions, "openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm, "tensorrt-llm": async_request_trt_llm,
"scalellm": async_request_openai_completions,
} }
...@@ -10,8 +10,10 @@ import torch ...@@ -10,8 +10,10 @@ import torch
from tqdm import tqdm from tqdm import tqdm
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.inputs import PromptStrictInputs from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
...@@ -19,25 +21,33 @@ def main(args: argparse.Namespace): ...@@ -19,25 +21,33 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch, # NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches. # the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model, llm = LLM(
speculative_model=args.speculative_model, model=args.model,
num_speculative_tokens=args.num_speculative_tokens, speculative_model=args.speculative_model,
tokenizer=args.tokenizer, num_speculative_tokens=args.num_speculative_tokens,
quantization=args.quantization, speculative_draft_tensor_parallel_size=\
tensor_parallel_size=args.tensor_parallel_size, args.speculative_draft_tensor_parallel_size,
trust_remote_code=args.trust_remote_code, tokenizer=args.tokenizer,
dtype=args.dtype, quantization=args.quantization,
enforce_eager=args.enforce_eager, tensor_parallel_size=args.tensor_parallel_size,
kv_cache_dtype=args.kv_cache_dtype, trust_remote_code=args.trust_remote_code,
quantization_param_path=args.quantization_param_path, dtype=args.dtype,
device=args.device, max_model_len=args.max_model_len,
ray_workers_use_nsight=args.ray_workers_use_nsight, enforce_eager=args.enforce_eager,
use_v2_block_manager=args.use_v2_block_manager, kv_cache_dtype=args.kv_cache_dtype,
enable_chunked_prefill=args.enable_chunked_prefill, quantization_param_path=args.quantization_param_path,
download_dir=args.download_dir, device=args.device,
block_size=args.block_size, ray_workers_use_nsight=args.ray_workers_use_nsight,
gpu_memory_utilization=args.gpu_memory_utilization, use_v2_block_manager=args.use_v2_block_manager,
distributed_executor_backend=args.distributed_executor_backend) enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size,
gpu_memory_utilization=args.gpu_memory_utilization,
load_format=args.load_format,
distributed_executor_backend=args.distributed_executor_backend,
otlp_traces_endpoint=args.otlp_traces_endpoint,
enable_prefix_caching=args.enable_prefix_caching,
)
sampling_params = SamplingParams( sampling_params = SamplingParams(
n=args.n, n=args.n,
...@@ -51,7 +61,7 @@ def main(args: argparse.Namespace): ...@@ -51,7 +61,7 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000, dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size, size=(args.batch_size,
args.input_len)) args.input_len))
dummy_inputs: List[PromptStrictInputs] = [{ dummy_inputs: List[PromptInputs] = [{
"prompt_token_ids": batch "prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()] } for batch in dummy_prompt_token_ids.tolist()]
...@@ -96,7 +106,7 @@ def main(args: argparse.Namespace): ...@@ -96,7 +106,7 @@ def main(args: argparse.Namespace):
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None)) latencies.append(run_to_completion(profile_dir=None))
latencies = np.array(latencies) latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90] percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages) percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds') print(f'Avg latency: {np.mean(latencies)} seconds')
for percentage, percentile in zip(percentages, percentiles): for percentage, percentile in zip(percentages, percentiles):
...@@ -114,12 +124,16 @@ def main(args: argparse.Namespace): ...@@ -114,12 +124,16 @@ def main(args: argparse.Namespace):
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of ' description='Benchmark the latency of processing a single batch of '
'requests till completion.') 'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--speculative-model', type=str, default=None)
parser.add_argument('--num-speculative-tokens', type=int, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--speculative-draft-tensor-parallel-size',
'-spec-draft-tp',
type=int,
default=None)
parser.add_argument('--tokenizer', type=str, default=None) parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization', parser.add_argument('--quantization',
'-q', '-q',
...@@ -145,6 +159,12 @@ if __name__ == '__main__': ...@@ -145,6 +159,12 @@ if __name__ == '__main__':
parser.add_argument('--trust-remote-code', parser.add_argument('--trust-remote-code',
action='store_true', action='store_true',
help='trust remote code from huggingface') help='trust remote code from huggingface')
parser.add_argument(
'--max-model-len',
type=int,
default=None,
help='Maximum length of a sequence (including prompt and output). '
'If None, will be derived from the model.')
parser.add_argument( parser.add_argument(
'--dtype', '--dtype',
type=str, type=str,
...@@ -188,9 +208,10 @@ if __name__ == '__main__': ...@@ -188,9 +208,10 @@ if __name__ == '__main__':
parser.add_argument( parser.add_argument(
"--device", "--device",
type=str, type=str,
default="cuda", default="auto",
choices=["cuda", "cpu", "tpu"], choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA and CPU.') help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument('--block-size', parser.add_argument('--block-size',
type=int, type=int,
default=16, default=16,
...@@ -200,6 +221,9 @@ if __name__ == '__main__': ...@@ -200,6 +221,9 @@ if __name__ == '__main__':
action='store_true', action='store_true',
help='If True, the prefill requests can be chunked based on the ' help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens') 'max_num_batched_tokens')
parser.add_argument("--enable-prefix-caching",
action='store_true',
help="Enable automatic prefix caching")
parser.add_argument('--use-v2-block-manager', action='store_true') parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument( parser.add_argument(
"--ray-workers-use-nsight", "--ray-workers-use-nsight",
...@@ -222,6 +246,29 @@ if __name__ == '__main__': ...@@ -222,6 +246,29 @@ if __name__ == '__main__':
help='the fraction of GPU memory to be used for ' help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.' 'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.') 'If unspecified, will use the default value of 0.9.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=[
'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
'bitsandbytes'
],
help='The format of the model weights to load.\n\n'
'* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available.\n'
'* "pt" will load the weights in the pytorch bin format.\n'
'* "safetensors" will load the weights in the safetensors format.\n'
'* "npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading.\n'
'* "dummy" will initialize the weights with random values, '
'which is mainly for profiling.\n'
'* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples'
'section for more information.\n'
'* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n')
parser.add_argument( parser.add_argument(
'--distributed-executor-backend', '--distributed-executor-backend',
choices=['ray', 'mp'], choices=['ray', 'mp'],
...@@ -229,5 +276,10 @@ if __name__ == '__main__': ...@@ -229,5 +276,10 @@ if __name__ == '__main__':
help='Backend to use for distributed serving. When more than 1 GPU ' help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed ' 'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.') 'or "mp" (multiprocessing) otherwise.')
parser.add_argument(
'--otlp-traces-endpoint',
type=str,
default=None,
help='Target URL to which OpenTelemetry traces will be sent.')
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
import argparse
import time import time
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. 
Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
...@@ -44,7 +44,7 @@ def main(args): ...@@ -44,7 +44,7 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description='Benchmark the performance with or without automatic ' description='Benchmark the performance with or without automatic '
'prefix caching.') 'prefix caching.')
parser.add_argument('--model', parser.add_argument('--model',
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
On the server side, run one of the following commands: On the server side, run one of the following commands:
vLLM OpenAI API server vLLM OpenAI API server
python -m vllm.entrypoints.openai.api_server \ vllm serve <your_model> \
--model <your_model> --swap-space 16 \ --swap-space 16 \
--disable-log-requests --disable-log-requests
(TGI backend) (TGI backend)
...@@ -17,7 +17,7 @@ On the client side, run: ...@@ -17,7 +17,7 @@ On the client side, run:
--dataset-path <path to dataset> \ --dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf --request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000 --num-prompts <num_prompts> # By default <num_prompts> is 1000
when using tgi backend, add when using tgi backend, add
--endpoint /generate_stream --endpoint /generate_stream
to the end of the command above. to the end of the command above.
...@@ -31,7 +31,7 @@ import time ...@@ -31,7 +31,7 @@ import time
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from typing import AsyncGenerator, List, Optional, Tuple from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple
import numpy as np import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
...@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, ...@@ -39,7 +39,15 @@ from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer try:
from vllm.transformers_utils.tokenizer import get_tokenizer
except ImportError:
from backend_request_func import get_tokenizer
try:
from vllm.utils import FlexibleArgumentParser
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
@dataclass @dataclass
...@@ -52,12 +60,15 @@ class BenchmarkMetrics: ...@@ -52,12 +60,15 @@ class BenchmarkMetrics:
output_throughput: float output_throughput: float
mean_ttft_ms: float mean_ttft_ms: float
median_ttft_ms: float median_ttft_ms: float
std_ttft_ms: float
p99_ttft_ms: float p99_ttft_ms: float
mean_tpot_ms: float mean_tpot_ms: float
median_tpot_ms: float median_tpot_ms: float
std_tpot_ms: float
p99_tpot_ms: float p99_tpot_ms: float
mean_itl_ms: float mean_itl_ms: float
median_itl_ms: float median_itl_ms: float
std_itl_ms: float
p99_itl_ms: float p99_itl_ms: float
...@@ -69,7 +80,6 @@ def sample_sharegpt_requests( ...@@ -69,7 +80,6 @@ def sample_sharegpt_requests(
) -> List[Tuple[str, int, int]]: ) -> List[Tuple[str, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4: if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small") raise ValueError("output_len too small")
# Load the dataset. # Load the dataset.
with open(dataset_path) as f: with open(dataset_path) as f:
dataset = json.load(f) dataset = json.load(f)
...@@ -177,6 +187,31 @@ def sample_sonnet_requests( ...@@ -177,6 +187,31 @@ def sample_sonnet_requests(
return sampled_requests return sampled_requests
def sample_random_requests(
        input_len: int, output_len: int, num_prompts: int, range_ratio: float,
        tokenizer: PreTrainedTokenizerBase) -> List[Tuple[str, int, int]]:
    """Build synthetic benchmark requests from pseudo-random token ids.

    For each request an input length is drawn from
    ``[int(input_len * range_ratio), input_len]`` and an output length from
    ``[int(output_len * range_ratio), output_len]`` (both ends inclusive).
    The prompt is the decoding of ``input_len_i`` consecutive token ids that
    start at a random offset and wrap around ``tokenizer.vocab_size``.

    Args:
        input_len: Maximum number of prompt tokens per request.
        output_len: Maximum number of output tokens per request.
        num_prompts: Number of requests to generate.
        range_ratio: Lower bound of the sampled lengths, expressed as a
            fraction of ``input_len`` / ``output_len``. Must be in [0, 1].
        tokenizer: Tokenizer providing ``vocab_size`` and ``decode``.

    Returns:
        A list of ``(prompt, prompt_len, output_len)`` tuples. ``prompt_len``
        is the number of token ids that were decoded, not the length obtained
        by re-tokenizing the prompt text.

    Raises:
        ValueError: If ``range_ratio`` is outside [0, 1] or a length is
            negative.
    """
    # Fail early with a clear message; otherwise a ratio > 1 surfaces as an
    # opaque "low >= high" error from NumPy and a negative ratio silently
    # produces negative lengths and empty prompts.
    if not 0.0 <= range_ratio <= 1.0:
        raise ValueError(
            f"range_ratio must be in [0, 1], got {range_ratio}")
    if input_len < 0 or output_len < 0:
        raise ValueError(
            "input_len and output_len must be non-negative, got "
            f"{input_len} and {output_len}")

    # NOTE: the order of the three draws below is kept as-is so that seeded
    # runs remain reproducible.
    input_lens = np.random.randint(
        int(input_len * range_ratio),
        input_len + 1,  # upper bound of randint is exclusive
        size=num_prompts,
    )
    output_lens = np.random.randint(
        int(output_len * range_ratio),
        output_len + 1,
        size=num_prompts,
    )
    vocab_size = tokenizer.vocab_size
    offsets = np.random.randint(0, vocab_size, size=num_prompts)

    input_requests = []
    for i in range(num_prompts):
        prompt_len = int(input_lens[i])
        start = int(offsets[i]) + i
        # Plain Python ints keep the decode call independent of NumPy scalars.
        token_ids = [(start + j) % vocab_size for j in range(prompt_len)]
        prompt = tokenizer.decode(token_ids)
        input_requests.append((prompt, prompt_len, int(output_lens[i])))

    return input_requests
async def get_request( async def get_request(
input_requests: List[Tuple[str, int, int]], input_requests: List[Tuple[str, int, int]],
request_rate: float, request_rate: float,
...@@ -188,6 +223,7 @@ async def get_request( ...@@ -188,6 +223,7 @@ async def get_request(
if request_rate == float("inf"): if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait. # If the request rate is infinity, then we don't need to wait.
continue continue
# Sample the request interval from the exponential distribution. # Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate) interval = np.random.exponential(1.0 / request_rate)
# The next request will be sent after the interval. # The next request will be sent after the interval.
...@@ -200,18 +236,18 @@ def calculate_metrics( ...@@ -200,18 +236,18 @@ def calculate_metrics(
dur_s: float, dur_s: float,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
) -> Tuple[BenchmarkMetrics, List[int]]: ) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens = [] actual_output_lens: List[int] = []
total_input = 0 total_input = 0
completed = 0 completed = 0
itls = [] itls: List[float] = []
tpots = [] tpots: List[float] = []
ttfts = [] ttfts: List[float] = []
for i in range(len(outputs)): for i in range(len(outputs)):
if outputs[i].success: if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all # We use the tokenizer to count the number of output tokens for all
# serving backends instead of looking at len(outputs[i].itl) since # serving backends instead of looking at len(outputs[i].itl) since
# multiple output tokens may be bundled together # multiple output tokens may be bundled together
# Note: this may inflate the output token count slightly # Note : this may inflate the output token count slightly
output_len = len( output_len = len(
tokenizer(outputs[i].generated_text, tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids) add_special_tokens=False).input_ids)
...@@ -241,12 +277,15 @@ def calculate_metrics( ...@@ -241,12 +277,15 @@ def calculate_metrics(
mean_ttft_ms=np.mean(ttfts or 0) * mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend 1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000,
std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000, mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
mean_itl_ms=np.mean(itls or 0) * 1000, mean_itl_ms=np.mean(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000, p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
) )
...@@ -265,7 +304,7 @@ async def benchmark( ...@@ -265,7 +304,7 @@ async def benchmark(
disable_tqdm: bool, disable_tqdm: bool,
): ):
if backend in ASYNC_REQUEST_FUNCS: if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS.get(backend) request_func = ASYNC_REQUEST_FUNCS[backend]
else: else:
raise ValueError(f"Unknown backend: {backend}") raise ValueError(f"Unknown backend: {backend}")
...@@ -292,7 +331,7 @@ async def benchmark( ...@@ -292,7 +331,7 @@ async def benchmark(
pbar = None if disable_tqdm else tqdm(total=len(input_requests)) pbar = None if disable_tqdm else tqdm(total=len(input_requests))
benchmark_start_time = time.perf_counter() benchmark_start_time = time.perf_counter()
tasks = [] tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate): async for request in get_request(input_requests, request_rate):
prompt, prompt_len, output_len = request prompt, prompt_len, output_len = request
request_func_input = RequestFuncInput( request_func_input = RequestFuncInput(
...@@ -310,7 +349,7 @@ async def benchmark( ...@@ -310,7 +349,7 @@ async def benchmark(
pbar=pbar))) pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if not disable_tqdm: if pbar is not None:
pbar.close() pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time benchmark_duration = time.perf_counter() - benchmark_start_time
...@@ -363,12 +402,15 @@ async def benchmark( ...@@ -363,12 +402,15 @@ async def benchmark(
"output_throughput": metrics.output_throughput, "output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms, "mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms, "median_ttft_ms": metrics.median_ttft_ms,
"std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms, "p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms, "mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms, "median_tpot_ms": metrics.median_tpot_ms,
"std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms, "p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms, "mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms, "median_itl_ms": metrics.median_itl_ms,
"std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms, "p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs], "input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens, "output_lens": actual_output_lens,
...@@ -448,6 +490,15 @@ def main(args: argparse.Namespace): ...@@ -448,6 +490,15 @@ def main(args: argparse.Namespace):
for prompt, prompt_formatted, prompt_len, for prompt, prompt_formatted, prompt_len,
output_len in input_requests] output_len in input_requests]
elif args.dataset_name == "random":
input_requests = sample_random_requests(
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
)
else: else:
raise ValueError(f"Unknown dataset: {args.dataset_name}") raise ValueError(f"Unknown dataset: {args.dataset_name}")
...@@ -466,7 +517,7 @@ def main(args: argparse.Namespace): ...@@ -466,7 +517,7 @@ def main(args: argparse.Namespace):
# Save config and results to json # Save config and results to json
if args.save_result: if args.save_result:
result_json = {} result_json: Dict[str, Any] = {}
# Setup # Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
...@@ -499,6 +550,8 @@ def main(args: argparse.Namespace): ...@@ -499,6 +550,8 @@ def main(args: argparse.Namespace):
# Save to file # Save to file
base_model_id = model_id.split("/")[-1] base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
if args.result_filename:
file_name = args.result_filename
if args.result_dir: if args.result_dir:
file_name = os.path.join(args.result_dir, file_name) file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w") as outfile: with open(file_name, "w") as outfile:
...@@ -506,7 +559,7 @@ def main(args: argparse.Namespace): ...@@ -506,7 +559,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the online serving throughput.") description="Benchmark the online serving throughput.")
parser.add_argument( parser.add_argument(
"--backend", "--backend",
...@@ -539,7 +592,7 @@ if __name__ == "__main__": ...@@ -539,7 +592,7 @@ if __name__ == "__main__":
"--dataset-name", "--dataset-name",
type=str, type=str,
default="sharegpt", default="sharegpt",
choices=["sharegpt", "sonnet"], choices=["sharegpt", "sonnet", "random"],
help="Name of the dataset to benchmark on.", help="Name of the dataset to benchmark on.",
) )
parser.add_argument("--dataset-path", parser.add_argument("--dataset-path",
...@@ -556,7 +609,7 @@ if __name__ == "__main__": ...@@ -556,7 +609,7 @@ if __name__ == "__main__":
"--tokenizer", "--tokenizer",
type=str, type=str,
help= help=
"Name or path of the tokenizer, if not using the default tokenizer.", "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
) )
parser.add_argument( parser.add_argument(
"--best-of", "--best-of",
...@@ -599,6 +652,27 @@ if __name__ == "__main__": ...@@ -599,6 +652,27 @@ if __name__ == "__main__":
help= help=
"Number of prefix tokens per request, used only for sonnet dataset.", "Number of prefix tokens per request, used only for sonnet dataset.",
) )
parser.add_argument(
"--random-input-len",
type=int,
default=1024,
help=
"Number of input tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-output-len",
type=int,
default=128,
help=
"Number of output tokens per request, used only for random sampling.",
)
parser.add_argument(
"--random-range-ratio",
type=float,
default=1.0,
help="Range of sampled ratio of input/output length, "
"used only for random sampling.",
)
parser.add_argument( parser.add_argument(
"--request-rate", "--request-rate",
type=float, type=float,
...@@ -639,6 +713,15 @@ if __name__ == "__main__": ...@@ -639,6 +713,15 @@ if __name__ == "__main__":
help="Specify directory to save benchmark json results." help="Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory.", "If not specified, results are saved in the current directory.",
) )
# Optional override for the name of the JSON results file. The adjacent
# string literals below are concatenated, so each fragment must carry its own
# separating space.
parser.add_argument(
    "--result-filename",
    type=str,
    default=None,
    help="Specify the filename to save benchmark json results. "
    "If not specified, results will be saved in "
    "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
    " format.",
)
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
...@@ -11,8 +11,10 @@ from tqdm import tqdm ...@@ -11,8 +11,10 @@ from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
from vllm.inputs import PromptStrictInputs from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptInputs
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import FlexibleArgumentParser
def sample_requests( def sample_requests(
...@@ -84,6 +86,7 @@ def run_vllm( ...@@ -84,6 +86,7 @@ def run_vllm(
distributed_executor_backend: Optional[str], distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9, gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None, download_dir: Optional[str] = None,
load_format: str = EngineArgs.load_format,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM( llm = LLM(
...@@ -105,11 +108,12 @@ def run_vllm( ...@@ -105,11 +108,12 @@ def run_vllm(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
load_format=load_format,
) )
# Add the requests to the engine. # Add the requests to the engine.
prompts = [] prompts: List[str] = []
sampling_params = [] sampling_params: List[SamplingParams] = []
for prompt, _, output_len in requests: for prompt, _, output_len in requests:
prompts.append(prompt) prompts.append(prompt)
sampling_params.append( sampling_params.append(
...@@ -144,7 +148,7 @@ def run_vllm( ...@@ -144,7 +148,7 @@ def run_vllm(
# dummy_prompt_token_ids = np.random.randint(10000, # dummy_prompt_token_ids = np.random.randint(10000,
# size=(args.num_prompts, # size=(args.num_prompts,
# args.input_len)) # args.input_len))
# dummy_inputs: List[PromptStrictInputs] = [{ # dummy_inputs: List[PromptInputs] = [{
# "prompt_token_ids": batch # "prompt_token_ids": batch
# } for batch in dummy_prompt_token_ids.tolist()] # } for batch in dummy_prompt_token_ids.tolist()]
...@@ -270,7 +274,7 @@ def main(args: argparse.Namespace): ...@@ -270,7 +274,7 @@ def main(args: argparse.Namespace):
args.quantization_param_path, args.device, args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill, args.enable_prefix_caching, args.enable_chunked_prefill,
args.max_num_batched_tokens, args.distributed_executor_backend, args.max_num_batched_tokens, args.distributed_executor_backend,
args.gpu_memory_utilization, args.download_dir) args.gpu_memory_utilization, args.download_dir, args.load_format)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
...@@ -283,6 +287,7 @@ def main(args: argparse.Namespace): ...@@ -283,6 +287,7 @@ def main(args: argparse.Namespace):
raise ValueError(f"Unknown backend: {args.backend}") raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len total_num_tokens = sum(prompt_len + output_len
for _, prompt_len, output_len in requests) for _, prompt_len, output_len in requests)
if args.dataset is None: if args.dataset is None:
total_out_tokens = args.output_len * args.num_prompts total_out_tokens = args.output_len * args.num_prompts
else: else:
...@@ -307,7 +312,7 @@ def main(args: argparse.Namespace): ...@@ -307,7 +312,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.") parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend", parser.add_argument("--backend",
type=str, type=str,
choices=["vllm", "hf", "mii"], choices=["vllm", "hf", "mii"],
...@@ -398,9 +403,10 @@ if __name__ == "__main__": ...@@ -398,9 +403,10 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--device", "--device",
type=str, type=str,
default="cuda", default="auto",
choices=["cuda", "cpu", "tpu"], choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA and CPU.') help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument( parser.add_argument(
"--enable-prefix-caching", "--enable-prefix-caching",
action='store_true', action='store_true',
...@@ -430,6 +436,29 @@ if __name__ == "__main__": ...@@ -430,6 +436,29 @@ if __name__ == "__main__":
help='Backend to use for distributed serving. When more than 1 GPU ' help='Backend to use for distributed serving. When more than 1 GPU '
'is used, will be automatically set to "ray" if installed ' 'is used, will be automatically set to "ray" if installed '
'or "mp" (multiprocessing) otherwise.') 'or "mp" (multiprocessing) otherwise.')
# Weight-loading format forwarded to the engine. The help text is built from
# adjacent string literals, so every fragment needs its own trailing space or
# newline.
parser.add_argument(
    '--load-format',
    type=str,
    default=EngineArgs.load_format,
    choices=[
        'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
        'bitsandbytes'
    ],
    help='The format of the model weights to load.\n\n'
    '* "auto" will try to load the weights in the safetensors format '
    'and fall back to the pytorch bin format if safetensors format '
    'is not available.\n'
    '* "pt" will load the weights in the pytorch bin format.\n'
    '* "safetensors" will load the weights in the safetensors format.\n'
    '* "npcache" will load the weights in pytorch format and store '
    'a numpy cache to speed up the loading.\n'
    '* "dummy" will initialize the weights with random values, '
    'which is mainly for profiling.\n'
    '* "tensorizer" will load the weights using tensorizer from '
    'CoreWeave. See the Tensorize vLLM Model script in the Examples '
    'section for more information.\n'
    '* "bitsandbytes" will load the weights using bitsandbytes '
    'quantization.\n')
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
...@@ -461,4 +490,4 @@ if __name__ == "__main__": ...@@ -461,4 +490,4 @@ if __name__ == "__main__":
if args.tokenizer != args.model: if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII " raise ValueError("Tokenizer must be the same as the model for MII "
"backend.") "backend.")
main(args) main(args)
\ No newline at end of file
...@@ -11,26 +11,27 @@ from torch.utils.benchmark import Measurement as TMeasurement ...@@ -11,26 +11,27 @@ from torch.utils.benchmark import Measurement as TMeasurement
from weight_shapes import WEIGHT_SHAPES from weight_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_TP_SIZES = [1] DEFAULT_TP_SIZES = [1]
# helpers # helpers
def to_fp8(tensor: torch.tensor) -> torch.tensor: def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
finfo = torch.finfo(torch.float8_e4m3fn) finfo = torch.finfo(torch.float8_e4m3fn)
return torch.round(tensor.clamp( return torch.round(tensor.clamp(
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
def to_int8(tensor: torch.tensor) -> torch.tensor: def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def make_rand_tensors(dtype: torch.dtype, m: int, n: int, def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.tensor, torch.tensor]: k: int) -> Tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5 a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5 b = torch.randn((n, k), device='cuda').t() * 5
...@@ -46,15 +47,15 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, ...@@ -46,15 +47,15 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
# impl # impl
def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, def pytorch_mm_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.tensor: out_dtype: torch.dtype) -> torch.Tensor:
return torch.mm(a, b) return torch.mm(a, b)
def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, def pytorch_fp8_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.tensor: out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a, return torch._scaled_mm(a,
b, b,
scale_a=scale_a, scale_a=scale_a,
...@@ -62,9 +63,9 @@ def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, ...@@ -62,9 +63,9 @@ def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
out_dtype=out_dtype) out_dtype=out_dtype)
def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor, def pytorch_fp8_impl_fast_accum(a: torch.Tensor, b: torch.Tensor,
scale_a: torch.tensor, scale_b: torch.tensor, scale_a: torch.Tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.tensor: out_dtype: torch.dtype) -> torch.Tensor:
return torch._scaled_mm(a, return torch._scaled_mm(a,
b, b,
scale_a=scale_a, scale_a=scale_a,
...@@ -73,15 +74,15 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor, ...@@ -73,15 +74,15 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
use_fast_accum=True) use_fast_accum=True)
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, def cutlass_impl(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.tensor, scale_b: torch.Tensor,
out_dtype: torch.dtype) -> torch.tensor: out_dtype: torch.dtype) -> torch.Tensor:
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype) return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)
# bench # bench
def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, def bench_fn(a: torch.Tensor, b: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.tensor, out_dtype: torch.dtype, label: str, scale_b: torch.Tensor, out_dtype: torch.dtype, label: str,
sub_label: str, fn: Callable, description: str) -> TMeasurement: sub_label: str, fn: Callable, description: str) -> TMeasurement:
min_run_time = 1 min_run_time = 1
...@@ -111,18 +112,24 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, ...@@ -111,18 +112,24 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32) scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
timers = [] timers = []
# pytorch impl # pytorch impl - bfloat16
timers.append( timers.append(
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_i8_impl, torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales")) "pytorch_bf16_bf16_bf16_matmul-no-scales"))
# pytorch impl - float16
timers.append(
bench_fn(a.to(dtype=torch.float16, device="cuda"),
b.to(dtype=torch.float16, device="cuda"), scale_a, scale_b,
torch.float16, label, sub_label, pytorch_mm_impl,
"pytorch_fp16_fp16_fp16_matmul-no-scales"))
# cutlass impl # cutlass impl
timers.append( timers.append(
bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
torch.bfloat16, label, sub_label, cutlass_impl, cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
"cutlass_i8_i8_bf16_scaled_mm"))
return timers return timers
...@@ -136,6 +143,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, ...@@ -136,6 +143,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
timers = [] timers = []
# pytorch impl w. bf16
timers.append(
bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
torch.bfloat16, label, sub_label, pytorch_mm_impl,
"pytorch_bf16_bf16_bf16_matmul-no-scales"))
# pytorch impl: bf16 output, without fp8 fast accum # pytorch impl: bf16 output, without fp8 fast accum
timers.append( timers.append(
bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
...@@ -160,14 +174,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, ...@@ -160,14 +174,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
# cutlass impl: bf16 output # cutlass impl: bf16 output
timers.append( timers.append(
bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
torch.bfloat16, label, sub_label, cutlass_impl, cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
"cutlass_fp8_fp8_bf16_scaled_mm"))
# cutlass impl: fp16 output # cutlass impl: fp16 output
timers.append( timers.append(
bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
torch.float16, label, sub_label, cutlass_impl, cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
"cutlass_fp8_fp8_fp16_scaled_mm"))
return timers return timers
...@@ -289,7 +301,7 @@ if __name__ == '__main__': ...@@ -289,7 +301,7 @@ if __name__ == '__main__':
return torch.float8_e4m3fn return torch.float8_e4m3fn
raise ValueError("unsupported dtype") raise ValueError("unsupported dtype")
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description=""" description="""
Benchmark Cutlass GEMM. Benchmark Cutlass GEMM.
......
...@@ -22,6 +22,12 @@ WEIGHT_SHAPES = { ...@@ -22,6 +22,12 @@ WEIGHT_SHAPES = {
([4096, 22016], 1), ([4096, 22016], 1),
([11008, 4096], 0), ([11008, 4096], 0),
], ],
"meta-llama/Llama-3-8b": [
([4096, 6144], 1),
([4096, 4096], 0),
([4096, 28672], 1),
([14336, 4096], 0),
],
"meta-llama/Llama-2-13b-hf": [ "meta-llama/Llama-2-13b-hf": [
([5120, 15360], 1), ([5120, 15360], 1),
([5120, 5120], 0), ([5120, 5120], 0),
......
import argparse
import os import os
import sys import sys
from typing import Optional from typing import Optional
...@@ -10,6 +9,7 @@ from vllm import _custom_ops as ops ...@@ -10,6 +9,7 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import ( from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight, generic_dequantize_gemm, get_int_dtype, dequantize_weight, generic_dequantize_gemm, get_int_dtype,
optimized_dequantize_gemm) optimized_dequantize_gemm)
from vllm.utils import FlexibleArgumentParser
os.environ['CUDA_VISIBLE_DEVICES'] = '0' os.environ['CUDA_VISIBLE_DEVICES'] = '0'
...@@ -86,9 +86,9 @@ def dequant_no_scale( ...@@ -86,9 +86,9 @@ def dequant_no_scale(
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version. # the generic pytorch version.
# Just visual comparison. # Just visual comparison.
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
n = parts.sum().item() n = int(parts.sum().item())
device = torch.device('cuda:0') device = torch.device('cuda:0')
...@@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: ...@@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
def main(): def main():
parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
# Add arguments # Add arguments
parser.add_argument("--nbooks", parser.add_argument("--nbooks",
...@@ -204,7 +204,7 @@ def main(): ...@@ -204,7 +204,7 @@ def main():
sys.stdout = sys.__stdout__ sys.stdout = sys.__stdout__
def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int,
methods): methods):
# I didn't see visible improvements from increasing these, but feel free :) # I didn't see visible improvements from increasing these, but feel free :)
...@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, ...@@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
print('') print('')
def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor,
nbooks: int, bits: int, method) -> float: nbooks: int, bits: int, method) -> float:
n = parts.sum().item() n = int(parts.sum().item())
device = torch.device('cuda:0') device = torch.device('cuda:0')
......
import argparse from typing import List
import torch import torch
import torch.utils.benchmark as benchmark import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS) GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
from vllm.model_executor.layers.quantization.utils.marlin_utils import ( from vllm.model_executor.layers.quantization.utils.marlin_utils import (
MarlinWorkspace, marlin_24_quantize, marlin_quantize) GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
MarlinWorkspace, marlin_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights) gptq_pack, gptq_quantize_weights, sort_weights)
from vllm.scalar_type import ScalarType
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
...@@ -23,13 +27,15 @@ ACT_ORDER_OPTS = [False, True] ...@@ -23,13 +27,15 @@ ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS = [False, True] K_FULL_OPTS = [False, True]
def bench_run(results, model, act_order, is_k_full, num_bits, group_size, def bench_run(results: List[benchmark.Measurement], model: str,
size_m, size_k, size_n): act_order: bool, is_k_full: bool, quant_type: ScalarType,
group_size: int, size_m: int, size_k: int, size_n: int):
label = "Quant Matmul" label = "Quant Matmul"
sub_label = ("{}, act={} k_full={}, b={}, g={}, " sub_label = ("{}, act={} k_full={}, q={}, g={}, "
"MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits, "MKN=({}x{}x{})".format(model, act_order, is_k_full,
group_size, size_m, size_k, size_n)) str(quant_type), group_size, size_m,
size_k, size_n))
print(f"Testing: {sub_label}") print(f"Testing: {sub_label}")
...@@ -46,16 +52,18 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, ...@@ -46,16 +52,18 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
marlin_g_idx, marlin_g_idx,
marlin_sort_indices, marlin_sort_indices,
marlin_rand_perm, marlin_rand_perm,
) = marlin_quantize(b, num_bits, group_size, act_order) ) = marlin_quantize(b, quant_type, group_size, act_order)
# Marlin_24 quant # Marlin_24 quant
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
marlin_24_s) = marlin_24_quantize(b, num_bits, group_size) marlin_24_s) = marlin_24_quantize(b, quant_type, group_size)
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
# GPTQ quant # GPTQ quant
(w_ref, q_w, s, g_idx, (w_ref, q_w, s, g_idx,
rand_perm) = quantize_weights(b, num_bits, group_size, act_order) rand_perm) = gptq_quantize_weights(b, quant_type, group_size, act_order)
q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n) q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
# For act_order, sort the "weights" and "g_idx" # For act_order, sort the "weights" and "g_idx"
# so that group ids are increasing # so that group ids are increasing
...@@ -69,10 +77,11 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, ...@@ -69,10 +77,11 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N, marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_MAX_PARALLEL) GPTQ_MARLIN_24_MAX_PARALLEL)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
globals = { globals = {
# Gen params # Gen params
"num_bits": num_bits, "quant_type": quant_type,
"group_size": group_size, "group_size": group_size,
"size_m": size_m, "size_m": size_m,
"size_n": size_n, "size_n": size_n,
...@@ -83,6 +92,7 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, ...@@ -83,6 +92,7 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
"marlin_w_ref": marlin_w_ref, "marlin_w_ref": marlin_w_ref,
"marlin_q_w": marlin_q_w, "marlin_q_w": marlin_q_w,
"marlin_s": marlin_s, "marlin_s": marlin_s,
"marlin_zp": marlin_zp,
"marlin_g_idx": marlin_g_idx, "marlin_g_idx": marlin_g_idx,
"marlin_sort_indices": marlin_sort_indices, "marlin_sort_indices": marlin_sort_indices,
"marlin_rand_perm": marlin_rand_perm, "marlin_rand_perm": marlin_rand_perm,
...@@ -121,19 +131,29 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, ...@@ -121,19 +131,29 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt= stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501 "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="gptq_marlin_gemm_fp16",
).blocked_autorange(min_run_time=min_run_time))
results.append(
benchmark.Timer(
stmt=
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
description="gptq_marlin_gemm", description="gptq_marlin_gemm_fp32",
).blocked_autorange(min_run_time=min_run_time)) ).blocked_autorange(min_run_time=min_run_time))
if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS if (quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES): and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt= stmt=
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501 "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
...@@ -143,7 +163,7 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size, ...@@ -143,7 +163,7 @@ def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
results.append( results.append(
benchmark.Timer( benchmark.Timer(
stmt= stmt=
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501 "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
globals=globals, globals=globals,
label=label, label=label,
sub_label=sub_label, sub_label=sub_label,
...@@ -156,7 +176,7 @@ def main(args): ...@@ -156,7 +176,7 @@ def main(args):
for i, model in enumerate(args.models): for i, model in enumerate(args.models):
print(f"[{i}] {model}") print(f"[{i}] {model}")
results = [] results: List[benchmark.Measurement] = []
for model in args.models: for model in args.models:
for layer in WEIGHT_SHAPES[model]: for layer in WEIGHT_SHAPES[model]:
...@@ -179,12 +199,13 @@ def main(args): ...@@ -179,12 +199,13 @@ def main(args):
) > 0 and is_k_full not in args.limit_k_full: ) > 0 and is_k_full not in args.limit_k_full:
continue continue
for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS: for quant_type in query_marlin_supported_quant_types(
if len(args.limit_num_bits False):
) > 0 and num_bits not in args.limit_num_bits: if len(args.limit_num_bits) > 0 and \
quant_type.size_bits not in args.limit_num_bits:
continue continue
for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES: for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
if len( if len(
args.limit_group_size args.limit_group_size
) > 0 and group_size not in args.limit_group_size: ) > 0 and group_size not in args.limit_group_size:
...@@ -198,8 +219,8 @@ def main(args): ...@@ -198,8 +219,8 @@ def main(args):
for size_m in args.batch_sizes: for size_m in args.batch_sizes:
bench_run(results, model, act_order, is_k_full, bench_run(results, model, act_order, is_k_full,
num_bits, group_size, size_m, size_k, quant_type, group_size, size_m,
size_n) size_k, size_n)
compare = benchmark.Compare(results) compare = benchmark.Compare(results)
compare.print() compare.print()
...@@ -209,7 +230,7 @@ def main(args): ...@@ -209,7 +230,7 @@ def main(args):
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
# #
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark Marlin across specified models/shapes/batches") description="Benchmark Marlin across specified models/shapes/batches")
parser.add_argument( parser.add_argument(
"--models", "--models",
......
import argparse import argparse
import time import time
from datetime import datetime from datetime import datetime
from typing import Any, Dict, List, Tuple from typing import Any, Dict, List, Tuple, TypedDict
import ray import ray
import torch import torch
...@@ -10,10 +10,20 @@ from ray.experimental.tqdm_ray import tqdm ...@@ -10,10 +10,20 @@ from ray.experimental.tqdm_ray import tqdm
from transformers import AutoConfig from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.utils import FlexibleArgumentParser
class BenchmarkConfig(TypedDict):
    """Typed dictionary describing a single kernel tuning configuration.

    All six keys are required and hold integers. The key names are runtime
    dictionary keys, so they must not be renamed.
    """

    # NOTE(review): these look like tile / grouping sizes for the fused-MoE
    # kernel being tuned -- confirm against the kernel's parameters.
    BLOCK_SIZE_M: int
    BLOCK_SIZE_N: int
    BLOCK_SIZE_K: int
    GROUP_SIZE_M: int
    # NOTE(review): presumably kernel launch options -- confirm.
    num_warps: int
    num_stages: int
def benchmark_config( def benchmark_config(
config: Dict[str, int], config: BenchmarkConfig,
num_tokens: int, num_tokens: int,
num_experts: int, num_experts: int,
shard_intermediate_size: int, shard_intermediate_size: int,
...@@ -92,7 +102,7 @@ def benchmark_config( ...@@ -92,7 +102,7 @@ def benchmark_config(
start_event = torch.cuda.Event(enable_timing=True) start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True)
latencies = [] latencies: List[float] = []
for i in range(num_iters): for i in range(num_iters):
prepare(i) prepare(i)
torch.cuda.synchronize() torch.cuda.synchronize()
...@@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]: ...@@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]:
# Reduced search space for faster tuning. # Reduced search space for faster tuning.
# TODO(woosuk): Increase the search space and use a performance model to # TODO(woosuk): Increase the search space and use a performance model to
# prune the search space. # prune the search space.
configs = [] configs: List[BenchmarkConfig] = []
for num_stages in [2, 3, 4, 5]: for num_stages in [2, 3, 4, 5]:
for block_m in [16, 32, 64, 128, 256]: for block_m in [16, 32, 64, 128, 256]:
for block_k in [64, 128, 256]: for block_k in [64, 128, 256]:
...@@ -175,8 +185,8 @@ class BenchmarkWorker: ...@@ -175,8 +185,8 @@ class BenchmarkWorker:
topk: int, topk: int,
dtype: torch.dtype, dtype: torch.dtype,
use_fp8: bool, use_fp8: bool,
search_space: List[Dict[str, int]], search_space: List[BenchmarkConfig],
) -> Dict[str, int]: ) -> BenchmarkConfig:
best_config = None best_config = None
best_time = float("inf") best_time = float("inf")
for config in tqdm(search_space): for config in tqdm(search_space):
...@@ -199,10 +209,11 @@ class BenchmarkWorker: ...@@ -199,10 +209,11 @@ class BenchmarkWorker:
best_config = config best_config = config
now = datetime.now() now = datetime.now()
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
assert best_config is not None
return best_config return best_config
def sort_config(config: Dict[str, int]) -> Dict[str, int]: def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
return { return {
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"], "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"], "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
...@@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]: ...@@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]:
def save_configs( def save_configs(
configs: Dict[int, Dict[str, int]], configs: Dict[int, BenchmarkConfig],
num_experts: int, num_experts: int,
shard_intermediate_size: int, shard_intermediate_size: int,
hidden_size: int, hidden_size: int,
...@@ -305,7 +316,7 @@ def main(args: argparse.Namespace): ...@@ -305,7 +316,7 @@ def main(args: argparse.Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = FlexibleArgumentParser()
parser.add_argument("--model", parser.add_argument("--model",
type=str, type=str,
default="mistralai/Mixtral-8x7B-Instruct-v0.1") default="mistralai/Mixtral-8x7B-Instruct-v0.1")
......
import argparse
import random import random
import time import time
from typing import Optional from typing import List, Optional
import torch import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random)
NUM_BLOCKS = 1024 NUM_BLOCKS = 1024
PARTITION_SIZE = 512 PARTITION_SIZE = 512
...@@ -54,14 +54,17 @@ def main( ...@@ -54,14 +54,17 @@ def main(
# Create the block tables. # Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = [] block_tables_lst: List[List[int]] = []
for _ in range(num_seqs): for _ in range(num_seqs):
block_table = [ block_table = [
random.randint(0, NUM_BLOCKS - 1) random.randint(0, NUM_BLOCKS - 1)
for _ in range(max_num_blocks_per_seq) for _ in range(max_num_blocks_per_seq)
] ]
block_tables.append(block_table) block_tables_lst.append(block_table)
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
block_tables = torch.tensor(block_tables_lst,
dtype=torch.int,
device=device)
# Create the KV cache. # Create the KV cache.
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
...@@ -97,7 +100,7 @@ def main( ...@@ -97,7 +100,7 @@ def main(
start_time = time.perf_counter() start_time = time.perf_counter()
# Using default kv_scale # Using default kv_scale
kv_scale = 1.0 k_scale = v_scale = 1.0
for _ in range(num_iters): for _ in range(num_iters):
if version == "v1": if version == "v1":
...@@ -114,7 +117,8 @@ def main( ...@@ -114,7 +117,8 @@ def main(
max_seq_len, max_seq_len,
alibi_slopes, alibi_slopes,
kv_cache_dtype, kv_cache_dtype,
kv_scale, k_scale,
v_scale,
) )
elif version == "v2": elif version == "v2":
ops.paged_attention_v2( ops.paged_attention_v2(
...@@ -133,7 +137,8 @@ def main( ...@@ -133,7 +137,8 @@ def main(
max_seq_len, max_seq_len,
alibi_slopes, alibi_slopes,
kv_cache_dtype, kv_cache_dtype,
kv_scale, k_scale,
v_scale,
) )
else: else:
raise ValueError(f"Invalid version: {version}") raise ValueError(f"Invalid version: {version}")
...@@ -158,19 +163,19 @@ def main( ...@@ -158,19 +163,19 @@ def main(
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the paged attention kernel.") description="Benchmark the paged attention kernel.")
parser.add_argument("--version", parser.add_argument("--version",
type=str, type=str,
choices=["v1", "v2"], choices=["v1", "v2"],
default="v2") default="v2")
parser.add_argument("--batch-size", type=int, default=8) parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--seq_len", type=int, default=4096) parser.add_argument("--seq-len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64) parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size", parser.add_argument("--head-size",
type=int, type=int,
choices=[64, 80, 96, 112, 128, 192, 256], choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128) default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true") parser.add_argument("--use-alibi", action="store_true")
......
import argparse
from itertools import accumulate from itertools import accumulate
from typing import Optional from typing import List, Optional
import nvtx import nvtx
import torch import torch
from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
from vllm.utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora( def benchmark_rope_kernels_multi_lora(
...@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora( ...@@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora(
}) })
# non-batched RoPE takes only one scaling factor, we create multiple # non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior # instances to simulate the same behavior
non_batched_ropes = [] non_batched_ropes: List[RotaryEmbedding] = []
for scaling_factor in scaling_factors: for scaling_factor in scaling_factors:
non_batched_ropes.append( non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style, get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
...@@ -85,7 +86,7 @@ def benchmark_rope_kernels_multi_lora( ...@@ -85,7 +86,7 @@ def benchmark_rope_kernels_multi_lora(
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description="Benchmark the rotary embedding kernels.") description="Benchmark the rotary embedding kernels.")
parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--is-neox-style", type=bool, default=True)
parser.add_argument("--batch-size", type=int, default=16) parser.add_argument("--batch-size", type=int, default=16)
...@@ -93,7 +94,7 @@ if __name__ == '__main__': ...@@ -93,7 +94,7 @@ if __name__ == '__main__':
parser.add_argument("--num-heads", type=int, default=8) parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size", parser.add_argument("--head-size",
type=int, type=int,
choices=[64, 80, 96, 112, 128, 192, 256], choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128) default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype", parser.add_argument("--dtype",
......
import argparse
import cProfile import cProfile
import pstats import pstats
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.utils import FlexibleArgumentParser
# A very long prompt, total number of tokens is about 15k. # A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?" LONG_PROMPT = ["You are an expert in large language models, aren't you?"
...@@ -47,7 +47,7 @@ def main(args): ...@@ -47,7 +47,7 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description='Benchmark the performance of hashing function in' description='Benchmark the performance of hashing function in'
'automatic prefix caching.') 'automatic prefix caching.')
parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
......
...@@ -33,10 +33,23 @@ function (find_isa CPUINFO TARGET OUT) ...@@ -33,10 +33,23 @@ function (find_isa CPUINFO TARGET OUT)
endif() endif()
endfunction() endfunction()
#
# Report whether AVX512 code generation was switched off by the user.
#
# Reads the VLLM_CPU_DISABLE_AVX512 environment variable and stores ON in the
# caller's variable named by OUT only when that value is exactly "true";
# any other value (or an unset variable) yields OFF.
#
function (is_avx512_disabled OUT)
  set(_avx512_env $ENV{VLLM_CPU_DISABLE_AVX512})

  # Start from "not disabled" and flip only on an explicit "true".
  set(_avx512_off OFF)
  if(_avx512_env AND _avx512_env STREQUAL "true")
    set(_avx512_off ON)
  endif()

  # Single write-back into the caller's scope.
  set(${OUT} ${_avx512_off} PARENT_SCOPE)
endfunction()
is_avx512_disabled(AVX512_DISABLED)
find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
if (AVX512_FOUND) if (AVX512_FOUND AND NOT AVX512_DISABLED)
list(APPEND CXX_COMPILE_FLAGS list(APPEND CXX_COMPILE_FLAGS
"-mavx512f" "-mavx512f"
"-mavx512vl" "-mavx512vl"
...@@ -57,12 +70,21 @@ if (AVX512_FOUND) ...@@ -57,12 +70,21 @@ if (AVX512_FOUND)
elseif (AVX2_FOUND) elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2") list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA") message(WARNING "vLLM CPU backend using AVX2 ISA")
elseif (POWER9_FOUND OR POWER10_FOUND)
message(STATUS "PowerPC detected")
# Check for PowerPC VSX support
list(APPEND CXX_COMPILE_FLAGS
"-mvsx"
"-mcpu=native"
"-mtune=native")
else() else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.") message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif() endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS "numa")
# #
# Define extension targets # Define extension targets
...@@ -75,6 +97,7 @@ set(VLLM_EXT_SRC ...@@ -75,6 +97,7 @@ set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp" "csrc/cpu/activation.cpp"
"csrc/cpu/attention.cpp" "csrc/cpu/attention.cpp"
"csrc/cpu/cache.cpp" "csrc/cpu/cache.cpp"
"csrc/cpu/utils.cpp"
"csrc/cpu/layernorm.cpp" "csrc/cpu/layernorm.cpp"
"csrc/cpu/pos_encoding.cpp" "csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp") "csrc/cpu/torch_bindings.cpp")
...@@ -84,11 +107,11 @@ define_gpu_extension_target( ...@@ -84,11 +107,11 @@ define_gpu_extension_target(
DESTINATION vllm DESTINATION vllm
LANGUAGE CXX LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC} SOURCES ${VLLM_EXT_SRC}
LIBRARIES ${LIBS}
COMPILE_FLAGS ${CXX_COMPILE_FLAGS} COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
USE_SABI 3 USE_SABI 3
WITH_SOABI WITH_SOABI
) )
add_custom_target(default)
message(STATUS "Enabling C extension.") message(STATUS "Enabling C extension.")
add_dependencies(default _C) add_dependencies(default _C)
...@@ -128,7 +128,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG) ...@@ -128,7 +128,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"-U__HIP_NO_HALF_OPERATORS__" "-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc" "-fno-gpu-rdc"
"--gpu-max-threads-per-block=1024") "--gpu-max-threads-per-block=1024")
endif() endif()
set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE) set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
endfunction() endfunction()
...@@ -152,16 +152,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) ...@@ -152,16 +152,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
if (${GPU_LANG} STREQUAL "HIP") if (${GPU_LANG} STREQUAL "HIP")
# #
# `GPU_ARCHES` controls the `--offload-arch` flags. # `GPU_ARCHES` controls the `--offload-arch` flags.
# `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
# via the `PYTORCH_ROCM_ARCH` env variable.
# #
# If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list,
# if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling
# "rocm_agent_enumerator" in "enable_language(HIP)"
# (in file Modules/CMakeDetermineHIPCompiler.cmake)
#
if(DEFINED ENV{PYTORCH_ROCM_ARCH})
set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH})
else()
set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES})
endif()
# #
# Find the intersection of the supported + detected architectures to # Find the intersection of the supported + detected architectures to
# set the module architecture flags. # set the module architecture flags.
# #
set(${GPU_ARCHES}) set(${GPU_ARCHES})
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) foreach (_ARCH ${HIP_ARCHITECTURES})
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
list(APPEND ${GPU_ARCHES} ${_ARCH}) list(APPEND ${GPU_ARCHES} ${_ARCH})
endif() endif()
...@@ -169,7 +176,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) ...@@ -169,7 +176,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
if(NOT ${GPU_ARCHES}) if(NOT ${GPU_ARCHES})
message(FATAL_ERROR message(FATAL_ERROR
"None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is"
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
endif() endif()
...@@ -179,7 +186,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) ...@@ -179,7 +186,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
# #
# The torch cmake setup hardcodes the detected architecture flags in # The torch cmake setup hardcodes the detected architecture flags in
# `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
# can't be modified on a per-target basis, e.g. for the `punica` extension. # can't be modified on a per-target basis.
# So, all the `-gencode` flags need to be extracted and removed from # So, all the `-gencode` flags need to be extracted and removed from
# `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method. # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
# Since it's not possible to use `target_compiler_options` for adding target # Since it's not possible to use `target_compiler_options` for adding target
...@@ -361,4 +368,4 @@ function (define_gpu_extension_target GPU_MOD_NAME) ...@@ -361,4 +368,4 @@ function (define_gpu_extension_target GPU_MOD_NAME)
endif() endif()
install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION}) install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
endfunction() endfunction()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment