Commit 7c4f76e3 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.0

parents 2da0dd3e 51c31bc1
...@@ -70,16 +70,16 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ ...@@ -70,16 +70,16 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
&& cd ..; \ && cd ..; \
fi fi
COPY ./ /app/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue # Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
COPY ./ /app/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps
RUN cd /app \ RUN cd /app \
&& cd vllm \ && cd vllm \
&& pip install -U -r requirements-rocm.txt \ && pip install -U -r requirements-rocm.txt \
...@@ -90,6 +90,6 @@ RUN cd /app \ ...@@ -90,6 +90,6 @@ RUN cd /app \
&& cd .. && cd ..
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir ray[all] RUN python3 -m pip install --no-cache-dir ray[all]==2.9.3
CMD ["/bin/bash"] CMD ["/bin/bash"]
include LICENSE include LICENSE
include requirements.txt include requirements.txt
include CMakeLists.txt
recursive-include cmake *
recursive-include csrc * recursive-include csrc *
...@@ -42,7 +42,7 @@ python3 setup.py install ...@@ -42,7 +42,7 @@ python3 setup.py install
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.3.3 - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.4.0
## Known Issue ## Known Issue
- -
......
...@@ -16,12 +16,12 @@ Easy, fast, and cheap LLM serving for everyone ...@@ -16,12 +16,12 @@ Easy, fast, and cheap LLM serving for everyone
--- ---
**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)** **The Third vLLM Bay Area Meetup (April 2nd 6pm-8:30pm PT)**
We are thrilled to announce our second vLLM Meetup! We are thrilled to announce our third vLLM Meetup!
The vLLM team will share recent updates and roadmap. The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from IBM coming up to the stage to discuss their insights on LLM optimizations. We will also have vLLM collaborators from Roblox coming up to the stage to discuss their experience in deploying LLMs with vLLM.
Please register [here](https://lu.ma/ygxbpzhl) and join us! Please register [here](https://robloxandvllmmeetup2024.splashthat.com/) and join us!
--- ---
...@@ -67,6 +67,8 @@ vLLM seamlessly supports many Hugging Face models, including the following archi ...@@ -67,6 +67,8 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.) - Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.) - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.) - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.) - DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.) - Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
...@@ -76,6 +78,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi ...@@ -76,6 +78,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.) - InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.) - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
...@@ -86,8 +89,10 @@ vLLM seamlessly supports many Hugging Face models, including the following archi ...@@ -86,8 +89,10 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.) - Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.) - Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.) - StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.) - Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.) - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
......
import json import json
import os import os
import sys
import time import time
from dataclasses import dataclass import traceback
from typing import Optional from dataclasses import dataclass, field
from typing import List, Optional
import aiohttp import aiohttp
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
...@@ -26,8 +28,11 @@ class RequestFuncOutput: ...@@ -26,8 +28,11 @@ class RequestFuncOutput:
generated_text: str = "" generated_text: str = ""
success: bool = False success: bool = False
latency: float = 0 latency: float = 0
ttft: float = 0 ttft: float = 0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
prompt_len: int = 0 prompt_len: int = 0
error: str = ""
async def async_request_tgi( async def async_request_tgi(
...@@ -55,71 +60,38 @@ async def async_request_tgi( ...@@ -55,71 +60,38 @@ async def async_request_tgi(
ttft = 0 ttft = 0
st = time.perf_counter() st = time.perf_counter()
most_recent_timestamp = st
try: try:
async with session.post(url=api_url, json=payload) as response: async with session.post(url=api_url, json=payload) as response:
if response.status == 200: if response.status == 200:
async for data in response.content.iter_any(): async for chunk in response.content:
if ttft == 0: chunk = chunk.strip()
ttft = time.perf_counter() - st if not chunk:
output.ttft = ttft continue
output.latency = time.perf_counter() - st
body = data.decode("utf-8").lstrip("data:")
output.generated_text = json.loads(body)["generated_text"]
output.success = True
else:
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
output.success = False
if pbar:
pbar.update(1)
return output
async def async_request_vllm( chunk = remove_prefix(chunk.decode("utf-8"), "data:")
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith("generate")
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: data = json.loads(chunk)
payload = { timestamp = time.perf_counter()
"prompt": request_func_input.prompt, # First token
"n": 1,
"best_of": request_func_input.best_of,
"use_beam_search": request_func_input.use_beam_search,
"temperature": 0.0 if request_func_input.use_beam_search else 1.0,
"top_p": 1.0,
"max_tokens": request_func_input.output_len,
"ignore_eos": True,
"stream": True,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
ttft = 0
st = time.perf_counter()
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
async for data in response.content.iter_any():
if ttft == 0: if ttft == 0:
ttft = time.perf_counter() - st ttft = time.perf_counter() - st
output.ttft = ttft output.ttft = ttft
output.latency = time.perf_counter() - st
# When streaming, '\0' is appended to the end of the response. # Decoding phase
body = data.decode("utf-8").strip("\0") else:
output.generated_text = json.loads( output.itl.append(timestamp -
body)["text"][0][len(request_func_input.prompt):] most_recent_timestamp)
output.success = True
else: most_recent_timestamp = timestamp
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
except Exception:
output.success = False output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar: if pbar:
pbar.update(1) pbar.update(1)
...@@ -146,26 +118,45 @@ async def async_request_trt_llm( ...@@ -146,26 +118,45 @@ async def async_request_trt_llm(
} }
output = RequestFuncOutput() output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len output.prompt_len = request_func_input.prompt_len
ttft = 0
ttft = 0
st = time.perf_counter() st = time.perf_counter()
most_recent_timestamp = st
try: try:
async with session.post(url=api_url, json=payload) as resp: async with session.post(url=api_url, json=payload) as response:
if resp.status == 200: if response.status == 200:
async for data in resp.content.iter_any(): async for chunk in response.content:
chunk = chunk.strip()
if not chunk:
continue
chunk = remove_prefix(chunk.decode("utf-8"), "data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
if ttft == 0: if ttft == 0:
ttft = time.perf_counter() - st ttft = time.perf_counter() - st
output.ttft = ttft output.ttft = ttft
output.latency = time.perf_counter() - st
body = data.decode("utf-8").lstrip("data:") # Decoding phase
output.generated_text = json.loads(body)["text_output"] else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
output.generated_text = json.loads(data)["text_output"]
output.success = True output.success = True
else: else:
output.error = response.reason
output.success = False output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): except Exception:
output.success = False output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar: if pbar:
pbar.update(1) pbar.update(1)
...@@ -181,34 +172,35 @@ async def async_request_deepspeed_mii( ...@@ -181,34 +172,35 @@ async def async_request_deepspeed_mii(
assert not request_func_input.use_beam_search assert not request_func_input.use_beam_search
payload = { payload = {
"prompts": request_func_input.prompt, "prompt": request_func_input.prompt,
"max_new_tokens": request_func_input.output_len, "max_tokens": request_func_input.output_len,
"ignore_eos": True, "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"do_sample": True,
"temperature":
0.01, # deepspeed-mii does not accept 0.0 temperature.
"top_p": 1.0, "top_p": 1.0,
} }
output = RequestFuncOutput() output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len output.prompt_len = request_func_input.prompt_len
# DeepSpeed-MII doesn't support streaming as of Jan 28 2024, will use 0 as placeholder. # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# https://github.com/microsoft/DeepSpeed-MII/pull/311 # will use 0 as placeholder.
# See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0 output.ttft = 0
st = time.perf_counter() st = time.perf_counter()
try: try:
async with session.post(url=request_func_input.api_url, async with session.post(url=request_func_input.api_url,
json=payload) as resp: json=payload) as response:
if resp.status == 200: if response.status == 200:
parsed_resp = await resp.json() parsed_resp = await response.json()
output.latency = time.perf_counter() - st output.latency = time.perf_counter() - st
output.generated_text = parsed_resp[0]["generated_text"] output.generated_text = parsed_resp["text"][0]
output.success = True output.success = True
else: else:
output.error = response.reason
output.success = False output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): except Exception:
output.success = False output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar: if pbar:
pbar.update(1) pbar.update(1)
...@@ -220,7 +212,9 @@ async def async_request_openai_completions( ...@@ -220,7 +212,9 @@ async def async_request_openai_completions(
pbar: Optional[tqdm] = None, pbar: Optional[tqdm] = None,
) -> RequestFuncOutput: ) -> RequestFuncOutput:
api_url = request_func_input.api_url api_url = request_func_input.api_url
assert api_url.endswith("v1/completions") assert api_url.endswith(
"v1/completions"
), "OpenAI Completions API URL must end with 'v1/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search assert not request_func_input.use_beam_search
...@@ -242,43 +236,150 @@ async def async_request_openai_completions( ...@@ -242,43 +236,150 @@ async def async_request_openai_completions(
generated_text = "" generated_text = ""
ttft = 0 ttft = 0
st = time.perf_counter() st = time.perf_counter()
most_recent_timestamp = st
try: try:
async with session.post(url=api_url, json=payload, async with session.post(url=api_url, json=payload,
headers=headers) as response: headers=headers) as response:
if response.status == 200: if response.status == 200:
async for chunk in response.content: async for chunk in response.content:
if ttft == 0: chunk = chunk.strip()
ttft = time.perf_counter() - st if not chunk:
output.ttft = ttft continue
chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
data = json.loads(chunk)
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
if ttft == 0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# do not want to include as inter-token-latency
elif data.get("usage", None) is None:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += data["choices"][0]["text"]
output.generated_text = generated_text
output.success = True
output.latency = latency
except Exception:
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
return output
async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"v1/chat/completions"
), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
payload = {
"model": request_func_input.model,
"messages": [
{
"role": "user",
"content": request_func_input.prompt,
},
],
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
ttft = 0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk in response.content:
chunk = chunk.strip() chunk = chunk.strip()
if not chunk: if not chunk:
continue continue
chunk = chunk.decode("utf-8").lstrip("data: ") chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
if chunk == "[DONE]": if chunk == "[DONE]":
latency = time.perf_counter() - st latency = time.perf_counter() - st
else: else:
body = json.loads(chunk) timestamp = time.perf_counter()
generated_text += body["choices"][0]["text"] data = json.loads(chunk)
if "content" in data["choices"][0]["delta"]:
# First token
if ttft == 0:
ttft = time.perf_counter() - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
generated_text += data["choices"][0]["delta"][
"content"]
most_recent_timestamp = timestamp
output.generated_text = generated_text output.generated_text = generated_text
output.success = True output.success = True
output.latency = latency output.latency = latency
else: else:
output.error = response.reason
output.success = False output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError): except Exception:
output.success = False output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
if pbar: if pbar:
pbar.update(1) pbar.update(1)
return output return output
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
if text.startswith(prefix):
return text[len(prefix):]
return text
ASYNC_REQUEST_FUNCS = { ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi, "tgi": async_request_tgi,
"vllm": async_request_vllm, "vllm": async_request_openai_completions,
"lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii, "deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions, "openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm, "tensorrt-llm": async_request_trt_llm,
} }
...@@ -16,17 +16,19 @@ def main(args: argparse.Namespace): ...@@ -16,17 +16,19 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch, # NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches. # the engine will automatically process the request in multiple batches.
llm = LLM( llm = LLM(model=args.model,
model=args.model, tokenizer=args.tokenizer,
tokenizer=args.tokenizer, quantization=args.quantization,
quantization=args.quantization, tensor_parallel_size=args.tensor_parallel_size,
tensor_parallel_size=args.tensor_parallel_size, trust_remote_code=args.trust_remote_code,
trust_remote_code=args.trust_remote_code, dtype=args.dtype,
dtype=args.dtype, enforce_eager=args.enforce_eager,
enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype,
kv_cache_dtype=args.kv_cache_dtype, device=args.device,
device=args.device, ray_workers_use_nsight=args.ray_workers_use_nsight,
) enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
block_size=args.block_size)
sampling_params = SamplingParams( sampling_params = SamplingParams(
n=args.n, n=args.n,
...@@ -145,5 +147,25 @@ if __name__ == '__main__': ...@@ -145,5 +147,25 @@ if __name__ == '__main__':
default="cuda", default="cuda",
choices=["cuda"], choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.') help='device type for vLLM execution, supporting CUDA only currently.')
parser.add_argument('--block-size',
type=int,
default=16,
help='block size of key/value cache')
parser.add_argument(
'--enable-chunked-prefill',
type=bool,
default=False,
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
help="If specified, use nsight to profile ray workers",
)
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
import argparse
import time
from vllm import LLM, SamplingParams
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
def test_prefix(llm=None, sampling_params=None, prompts=None):
start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params)
end_time = time.time()
print(f"cost time {end_time - start_time}")
def main(args):
llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
sampling_params = SamplingParams(temperature=0, max_tokens=100)
print("------warm up------")
test_prefix(
llm=llm,
prompts=prompts[:1],
sampling_params=sampling_params,
)
print("------start generating------")
test_prefix(
llm=llm,
prompts=prompts,
sampling_params=sampling_params,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
args = parser.parse_args()
main(args)
"""Benchmark online serving throughput. """Benchmark online serving throughput.
On the server side, run one of the following commands: On the server side, run one of the following commands:
(vLLM backend) vLLM OpenAI API server
python -m vllm.entrypoints.api_server \ python -m vllm.entrypoints.openai.api_server \
--model <your_model> --swap-space 16 \ --model <your_model> --swap-space 16 \
--disable-log-requests --disable-log-requests
...@@ -12,28 +12,30 @@ On the server side, run one of the following commands: ...@@ -12,28 +12,30 @@ On the server side, run one of the following commands:
On the client side, run: On the client side, run:
python benchmarks/benchmark_serving.py \ python benchmarks/benchmark_serving.py \
--backend <backend> \ --backend <backend> \
--tokenizer <your_model> --dataset <target_dataset> \ --model <your_model> \
--request-rate <request_rate> --dataset-name sharegpt \
--dataset-path <path to dataset> \
--request-rate <request_rate> \ # By default <request_rate> is inf
--num-prompts <num_prompts> # By default <num_prompts> is 1000
""" """
import argparse import argparse
import asyncio import asyncio
import json import json
import os
import random import random
import time import time
import warnings
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
from typing import AsyncGenerator, List, Tuple from typing import AsyncGenerator, List, Tuple
import numpy as np import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
from backend_request_func import ( from vllm.transformers_utils.tokenizer import get_tokenizer
ASYNC_REQUEST_FUNCS,
RequestFuncInput,
RequestFuncOutput,
)
@dataclass @dataclass
...@@ -52,7 +54,7 @@ class BenchmarkMetrics: ...@@ -52,7 +54,7 @@ class BenchmarkMetrics:
p99_tpot_ms: float p99_tpot_ms: float
def sample_requests( def sample_sharegpt_requests(
dataset_path: str, dataset_path: str,
num_requests: int, num_requests: int,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
...@@ -100,6 +102,73 @@ def sample_requests( ...@@ -100,6 +102,73 @@ def sample_requests(
return sampled_requests return sampled_requests
def sample_sonnet_requests(
dataset_path: str,
num_requests: int,
input_len: int,
output_len: int,
prefix_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int]]:
assert input_len > prefix_len, "input_len must be greater than prefix_len."
# Load the dataset.
with open(dataset_path) as f:
poem_lines = f.readlines()
# Tokenize the poem lines.
poem_token_ids = tokenizer(poem_lines).input_ids
average_poem_len = sum(
len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
# Base prefix for all requests.
base_prompt = "Pick as many lines as you can from these poem lines:\n"
base_message = [{
"role": "user",
"content": base_prompt,
}]
base_prompt_formatted = tokenizer.apply_chat_template(
base_message, add_generation_prompt=True, tokenize=False)
base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
assert (input_len > base_prompt_offset
), f"Please set 'args.input-len' higher than {base_prompt_offset}."
num_input_lines = round(
(input_len - base_prompt_offset) / average_poem_len)
# First approximately `prefix_len` number of tokens in the
# prompt are fixed poem lines.
assert (
prefix_len > base_prompt_offset
), f"Please set 'args.prefix-len' higher than {base_prompt_offset}."
num_prefix_lines = round(
(prefix_len - base_prompt_offset) / average_poem_len)
prefix_lines = poem_lines[:num_prefix_lines]
# Sample the rest of lines per request.
sampled_requests: List[Tuple[str, int, int]] = []
for _ in range(num_requests):
sampled_lines = "".join(
prefix_lines +
random.sample(poem_lines, num_input_lines - num_prefix_lines))
prompt = f"{base_prompt}{sampled_lines}"
message = [
{
"role": "user",
"content": prompt,
},
]
prompt_formatted = tokenizer.apply_chat_template(
message, add_generation_prompt=True, tokenize=False)
prompt_len = len(tokenizer(prompt_formatted).input_ids)
sampled_requests.append(
(prompt, prompt_formatted, prompt_len, output_len))
return sampled_requests
async def get_request( async def get_request(
input_requests: List[Tuple[str, int, int]], input_requests: List[Tuple[str, int, int]],
request_rate: float, request_rate: float,
...@@ -122,37 +191,42 @@ def calculate_metrics( ...@@ -122,37 +191,42 @@ def calculate_metrics(
outputs: List[RequestFuncOutput], outputs: List[RequestFuncOutput],
dur_s: float, dur_s: float,
tokenizer: PreTrainedTokenizerBase, tokenizer: PreTrainedTokenizerBase,
) -> BenchmarkMetrics: ) -> Tuple[BenchmarkMetrics, List[int]]:
total_output = 0 actual_output_lens = []
total_input = 0 total_input = 0
completed = 0 completed = 0
per_token_latencies = [] tpots = []
ttfts = [] ttfts = []
for i in range(len(outputs)): for i in range(len(outputs)):
if outputs[i].success: if outputs[i].success:
output_len = len(tokenizer.encode(outputs[i].generated_text)) output_len = len(tokenizer(outputs[i].generated_text).input_ids)
total_output += output_len actual_output_lens.append(output_len)
total_input += input_requests[i][1] total_input += input_requests[i][1]
per_token_latencies.append(outputs[i].latency / output_len) if output_len > 1:
tpots.append(
(outputs[i].latency - outputs[i].ttft) / (output_len - 1))
ttfts.append(outputs[i].ttft) ttfts.append(outputs[i].ttft)
completed += 1 completed += 1
else:
actual_output_lens.append(0)
metrics = BenchmarkMetrics( metrics = BenchmarkMetrics(
completed=completed, completed=completed,
total_input=total_input, total_input=total_input,
total_output=total_output, total_output=sum(actual_output_lens),
request_throughput=completed / dur_s, request_throughput=completed / dur_s,
input_throughput=total_input / dur_s, input_throughput=total_input / dur_s,
output_throughput=total_output / dur_s, output_throughput=sum(actual_output_lens) / dur_s,
mean_ttft_ms=np.mean(ttfts) * 1000, mean_ttft_ms=np.mean(ttfts or 0) *
median_ttft_ms=np.median(ttfts) * 1000, 1000, # ttfts is empty if streaming is not supported by backend
p99_ttft_ms=np.percentile(ttfts, 99) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000,
mean_tpot_ms=np.mean(per_token_latencies) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
median_tpot_ms=np.median(per_token_latencies) * 1000, mean_tpot_ms=np.mean(tpots) * 1000,
p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000, median_tpot_ms=np.median(tpots) * 1000,
p99_tpot_ms=np.percentile(tpots, 99) * 1000,
) )
return metrics return metrics, actual_output_lens
async def benchmark( async def benchmark(
...@@ -171,10 +245,10 @@ async def benchmark( ...@@ -171,10 +245,10 @@ async def benchmark(
else: else:
raise ValueError(f"Unknown backend: {backend}") raise ValueError(f"Unknown backend: {backend}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
print(f"Traffic request rate: {request_rate}") print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
benchmark_start_time = time.perf_counter() benchmark_start_time = time.perf_counter()
tasks = [] tasks = []
async for request in get_request(input_requests, request_rate): async for request in get_request(input_requests, request_rate):
...@@ -192,40 +266,53 @@ async def benchmark( ...@@ -192,40 +266,53 @@ async def benchmark(
asyncio.create_task( asyncio.create_task(
request_func(request_func_input=request_func_input, request_func(request_func_input=request_func_input,
pbar=pbar))) pbar=pbar)))
outputs = await asyncio.gather(*tasks) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if not disable_tqdm: if not disable_tqdm:
pbar.close() pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time benchmark_duration = time.perf_counter() - benchmark_start_time
metrics = calculate_metrics( metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests, input_requests=input_requests,
outputs=outputs, outputs=outputs,
dur_s=benchmark_duration, dur_s=benchmark_duration,
tokenizer=tokenizer, tokenizer=tokenizer,
) )
print(f"Successful requests: {metrics.completed}") print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
print(f"Benchmark duration: {benchmark_duration:2f} s") print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
print(f"Total input tokens: {metrics.total_input}") print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
print(f"Total generated tokens: {metrics.total_output}") benchmark_duration))
print(f"Request throughput: {metrics.request_throughput:.2f} requests/s") print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s") print("{:<40} {:<10}".format("Total generated tokens:",
print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s") metrics.total_output))
print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms") print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms") metrics.request_throughput))
print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms") print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms") metrics.input_throughput))
print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms") print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms") metrics.output_throughput))
print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
metrics.median_ttft_ms))
print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
n=50,
c='-'))
print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
metrics.median_tpot_ms))
print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
print("=" * 50)
result = { result = {
"duration": benchmark_duration, "duration": benchmark_duration,
"completed": metrics.completed, "completed": metrics.completed,
"total_input_tokens": metrics.total_input, "total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output, "total_output_tokens": metrics.total_output,
"request_inthroughput": metrics.request_throughput, "request_throughput": metrics.request_throughput,
"input_throughput": metrics.input_throughput, "input_throughput": metrics.input_throughput,
"output_throughput": metrics.output_throughput, "output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms, "mean_ttft_ms": metrics.mean_ttft_ms,
...@@ -233,7 +320,13 @@ async def benchmark( ...@@ -233,7 +320,13 @@ async def benchmark(
"p99_ttft_ms": metrics.p99_ttft_ms, "p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms, "mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms, "median_tpot_ms": metrics.median_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms "p99_tpot_ms": metrics.p99_tpot_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
"itls": [output.itl for output in outputs],
"generated_texts": [output.generated_text for output in outputs],
"errors": [output.error for output in outputs],
} }
return result return result
...@@ -254,7 +347,58 @@ def main(args: argparse.Namespace): ...@@ -254,7 +347,58 @@ def main(args: argparse.Namespace):
tokenizer = get_tokenizer(tokenizer_id, tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code) trust_remote_code=args.trust_remote_code)
input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next "
"release. Please use '--dataset-name' and "
"'--dataset-path' in the future runs.",
stacklevel=2)
input_requests = sample_sharegpt_requests(
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
)
elif args.dataset_name == "sharegpt":
input_requests = sample_sharegpt_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
)
elif args.dataset_name == "sonnet":
# Do not format the prompt, pass to message directly
if args.backend == "openai-chat":
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
prefix_len=args.prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt, prompt_len, output_len)
for prompt, prompt_formatted, prompt_len,
output_len in input_requests]
else:
assert (
tokenizer.chat_template or tokenizer.default_chat_template
), "Tokenizer/model must have chat template for sonnet dataset."
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.input_len,
output_len=args.output_len,
prefix_len=args.prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt_formatted, prompt_len, output_len)
for prompt, prompt_formatted, prompt_len,
output_len in input_requests]
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
benchmark_result = asyncio.run( benchmark_result = asyncio.run(
benchmark( benchmark(
...@@ -277,13 +421,23 @@ def main(args: argparse.Namespace): ...@@ -277,13 +421,23 @@ def main(args: argparse.Namespace):
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json["date"] = current_dt result_json["date"] = current_dt
result_json["backend"] = backend result_json["backend"] = backend
result_json["version"] = args.version
result_json["model_id"] = model_id result_json["model_id"] = model_id
result_json["tokenizer_id"] = tokenizer_id result_json["tokenizer_id"] = tokenizer_id
result_json["best_of"] = args.best_of result_json["best_of"] = args.best_of
result_json["use_beam_search"] = args.use_beam_search result_json["use_beam_search"] = args.use_beam_search
result_json["num_prompts"] = args.num_prompts result_json["num_prompts"] = args.num_prompts
# Metadata
if args.metadata:
for item in args.metadata:
if "=" in item:
kvstring = item.split("=")
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
raise ValueError(
"Invalid metadata format. Please use KEY=VALUE format."
)
# Traffic # Traffic
result_json["request_rate"] = ( result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf") args.request_rate if args.request_rate < float("inf") else "inf")
...@@ -293,7 +447,9 @@ def main(args: argparse.Namespace): ...@@ -293,7 +447,9 @@ def main(args: argparse.Namespace):
# Save to file # Save to file
base_model_id = model_id.split("/")[-1] base_model_id = model_id.split("/")[-1]
file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w") as outfile: with open(file_name, "w") as outfile:
json.dump(result_json, outfile) json.dump(result_json, outfile)
...@@ -307,12 +463,6 @@ if __name__ == "__main__": ...@@ -307,12 +463,6 @@ if __name__ == "__main__":
default="vllm", default="vllm",
choices=list(ASYNC_REQUEST_FUNCS.keys()), choices=list(ASYNC_REQUEST_FUNCS.keys()),
) )
parser.add_argument(
"--version",
type=str,
default="N/A",
help="Version of the serving backend/engine.",
)
parser.add_argument( parser.add_argument(
"--base-url", "--base-url",
type=str, type=str,
...@@ -324,12 +474,26 @@ if __name__ == "__main__": ...@@ -324,12 +474,26 @@ if __name__ == "__main__":
parser.add_argument( parser.add_argument(
"--endpoint", "--endpoint",
type=str, type=str,
default="/generate", default="/v1/completions",
help="API endpoint.", help="API endpoint.",
) )
parser.add_argument("--dataset", parser.add_argument(
"--dataset",
type=str,
default=None,
help="Path to the ShareGPT dataset, will be deprecated in the "
"next release.",
)
parser.add_argument(
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
type=str, type=str,
required=True, default=None,
help="Path to the dataset.") help="Path to the dataset.")
parser.add_argument( parser.add_argument(
"--model", "--model",
...@@ -341,7 +505,7 @@ if __name__ == "__main__": ...@@ -341,7 +505,7 @@ if __name__ == "__main__":
"--tokenizer", "--tokenizer",
type=str, type=str,
help= help=
"Name or path of the tokenizer, if not using the default model tokenizer.", "Name or path of the tokenizer, if not using the default tokenizer.",
) )
parser.add_argument( parser.add_argument(
"--best-of", "--best-of",
...@@ -357,6 +521,27 @@ if __name__ == "__main__": ...@@ -357,6 +521,27 @@ if __name__ == "__main__":
default=1000, default=1000,
help="Number of prompts to process.", help="Number of prompts to process.",
) )
parser.add_argument(
"--sonnet-input-len",
type=int,
default=550,
help=
"Number of input tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--sonnet-output-len",
type=int,
default=150,
help=
"Number of output tokens per request, used only for sonnet dataset.",
)
parser.add_argument(
"--sonnet-prefix-len",
type=int,
default=200,
help=
"Number of prefix tokens per request, used only for sonnet dataset.",
)
parser.add_argument( parser.add_argument(
"--request-rate", "--request-rate",
type=float, type=float,
...@@ -382,6 +567,21 @@ if __name__ == "__main__": ...@@ -382,6 +567,21 @@ if __name__ == "__main__":
action="store_true", action="store_true",
help="Specify to save benchmark results to a json file", help="Specify to save benchmark results to a json file",
) )
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
nargs="*",
help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
"for metadata of this run to be saved in the result JSON file "
"for record keeping purposes.",
)
parser.add_argument(
"--result-dir",
type=str,
default=None,
help="Specify directory to save benchmark json results."
"If not specified, results are saved in the current directory.",
)
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
...@@ -6,9 +6,9 @@ import time ...@@ -6,9 +6,9 @@ import time
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import torch import torch
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
from tqdm import tqdm
def sample_requests( def sample_requests(
...@@ -73,21 +73,25 @@ def run_vllm( ...@@ -73,21 +73,25 @@ def run_vllm(
enforce_eager: bool, enforce_eager: bool,
kv_cache_dtype: str, kv_cache_dtype: str,
device: str, device: str,
enable_prefix_caching: bool,
gpu_memory_utilization: float = 0.9,
download_dir: Optional[str] = None,
) -> float: ) -> float:
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
llm = LLM( llm = LLM(model=model,
model=model, tokenizer=tokenizer,
tokenizer=tokenizer, quantization=quantization,
quantization=quantization, tensor_parallel_size=tensor_parallel_size,
tensor_parallel_size=tensor_parallel_size, seed=seed,
seed=seed, trust_remote_code=trust_remote_code,
trust_remote_code=trust_remote_code, dtype=dtype,
dtype=dtype, max_model_len=max_model_len,
max_model_len=max_model_len, gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
device=device, device=device,
) enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir)
# Add the requests to the engine. # Add the requests to the engine.
for prompt, _, output_len in requests: for prompt, _, output_len in requests:
...@@ -179,13 +183,15 @@ def run_mii( ...@@ -179,13 +183,15 @@ def run_mii(
tensor_parallel_size: int, tensor_parallel_size: int,
output_len: int, output_len: int,
) -> float: ) -> float:
from mii import pipeline from mii import client, serve
llm = pipeline(model, tensor_parallel=tensor_parallel_size) llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests] prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter() start = time.perf_counter()
llm(prompts, max_new_tokens=output_len) llm.generate(prompts, max_new_tokens=output_len)
end = time.perf_counter() end = time.perf_counter()
client = client(model)
client.terminate_server()
return end - start return end - start
...@@ -211,7 +217,9 @@ def main(args: argparse.Namespace): ...@@ -211,7 +217,9 @@ def main(args: argparse.Namespace):
args.seed, args.n, args.use_beam_search, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.trust_remote_code, args.dtype,
args.max_model_len, args.enforce_eager, args.max_model_len, args.enforce_eager,
args.kv_cache_dtype, args.device) args.kv_cache_dtype, args.device,
args.enable_prefix_caching,
args.gpu_memory_utilization, args.download_dir)
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n, elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
...@@ -286,6 +294,12 @@ if __name__ == "__main__": ...@@ -286,6 +294,12 @@ if __name__ == "__main__":
'The "auto" option will use FP16 precision ' 'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision ' 'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.') 'for BF16 models.')
parser.add_argument('--gpu-memory-utilization',
type=float,
default=0.9,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
parser.add_argument("--enforce-eager", parser.add_argument("--enforce-eager",
action="store_true", action="store_true",
help="enforce eager execution") help="enforce eager execution")
...@@ -302,6 +316,15 @@ if __name__ == "__main__": ...@@ -302,6 +316,15 @@ if __name__ == "__main__":
default="cuda", default="cuda",
choices=["cuda"], choices=["cuda"],
help='device type for vLLM execution, supporting CUDA only currently.') help='device type for vLLM execution, supporting CUDA only currently.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
args.tokenizer = args.model args.tokenizer = args.model
......
...@@ -2,13 +2,15 @@ import json ...@@ -2,13 +2,15 @@ import json
import os import os
import sys import sys
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from vllm.model_executor.layers.fused_moe import fused_moe
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import triton import triton
from vllm.model_executor.layers.fused_moe import (fused_moe,
get_config_file_name)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def main(): def main():
method = fused_moe method = fused_moe
...@@ -64,7 +66,7 @@ def run_grid(bs, method): ...@@ -64,7 +66,7 @@ def run_grid(bs, method):
print(f'{tp_size=} {bs=}') print(f'{tp_size=} {bs=}')
print(f'{config}') print(f'{config}')
# warmup # warmup
print(f'warming up') print('warming up')
try: try:
for _ in range(num_warmup_trials): for _ in range(num_warmup_trials):
run_timing( run_timing(
...@@ -82,7 +84,7 @@ def run_grid(bs, method): ...@@ -82,7 +84,7 @@ def run_grid(bs, method):
continue continue
# trial # trial
print(f'benchmarking') print('benchmarking')
for _ in range(num_trials): for _ in range(num_trials):
kernel_dur_ms = run_timing( kernel_dur_ms = run_timing(
num_calls=num_calls, num_calls=num_calls,
...@@ -103,17 +105,25 @@ def run_grid(bs, method): ...@@ -103,17 +105,25 @@ def run_grid(bs, method):
best_config = config best_config = config
best_time_us = kernel_dur_us best_time_us = kernel_dur_us
print( print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}' f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
) f'{d_model=} {model_intermediate_size=} {num_layers=}')
print("best_time_us", best_time_us) print("best_time_us", best_time_us)
print("best_config", best_config) print("best_config", best_config)
filename = "/tmp/config.jsonl" # holds Dict[str, Dict[str, int]]
filename = get_config_file_name(num_total_experts,
model_intermediate_size // tp_size)
print(f"writing config to file {filename}") print(f"writing config to file {filename}")
with open(filename, "a") as f: existing_content = {}
f.write(json.dumps({str(bs): best_config}) + "\n") if os.path.exists(filename):
with open(filename, "r") as f:
existing_content = json.load(f)
existing_content[str(bs)] = best_config
with open(filename, "w") as f:
json.dump(existing_content, f, indent=4)
f.write("\n")
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
......
from typing import Optional
import argparse import argparse
import random import random
import time import time
from typing import Optional
import torch import torch
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
from vllm._C import ops from vllm._C import ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
NUM_BLOCKS = 1024 NUM_BLOCKS = 1024
PARTITION_SIZE = 512 PARTITION_SIZE = 512
......
import argparse
from itertools import accumulate
from typing import Optional
import nvtx
import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
def benchmark_rope_kernels_multi_lora(
is_neox_style: bool,
batch_size: int,
seq_len: int,
num_heads: int,
head_size: int,
rotary_dim: Optional[int],
dtype: torch.dtype,
seed: int,
device: str,
max_position: int = 8192,
base: int = 10000,
) -> None:
torch.random.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
# silulating serving 4 LoRAs
scaling_factors = [1, 2, 4, 8]
# batched RoPE can take multiple scaling factors
batched_rope = get_rope(head_size, rotary_dim, max_position, base,
is_neox_style, {
"type": "linear",
"factor": tuple(scaling_factors)
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
{
"type": "linear",
"factor": (scaling_factor, )
}))
positions = torch.randint(0, max_position, (batch_size, seq_len))
query = torch.randn(batch_size,
seq_len,
num_heads * head_size,
dtype=dtype)
key = torch.randn_like(query)
# create query offsets for batched RoPE, we concat multiple kv cache
# together and each query needs to find the right kv cache of its type
offset_map = torch.tensor(
list(
accumulate([0] + [
max_position * scaling_factor * 2
for scaling_factor in scaling_factors[:-1]
])))
query_types = torch.randint(0,
len(scaling_factors), (batch_size, seq_len),
device=device)
# map query types to offsets
query_offsets = offset_map[query_types]
# the kernel takes flattened offsets
flatten_offsets = query_offsets.flatten()
# batched queries of the same type together for non-batched RoPE
queries = [query[query_types == i] for i in range(len(scaling_factors))]
keys = [key[query_types == i] for i in range(len(scaling_factors))]
packed_qkr = zip(queries, keys, non_batched_ropes)
# synchronize before start timing
torch.cuda.synchronize()
with nvtx.annotate("non-batched", color="yellow"):
for q, k, r in packed_qkr:
r.forward(positions, q, k)
torch.cuda.synchronize()
with nvtx.annotate("batched", color="green"):
batched_rope.forward(positions, query, key, flatten_offsets)
torch.cuda.synchronize()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Benchmark the rotary embedding kernels.")
parser.add_argument("--is-neox-style", type=bool, default=True)
parser.add_argument("--batch-size", type=int, default=16)
parser.add_argument("--seq-len", type=int, default=512)
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
choices=[64, 80, 96, 112, 128, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",
type=str,
choices=["bfloat16", "float"],
default="float")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--device",
type=str,
choices=["cuda:0", "cuda:1"],
default="cuda:0")
args = parser.parse_args()
print(args)
benchmark_rope_kernels_multi_lora(
is_neox_style=args.is_neox_style,
batch_size=args.batch_size,
seq_len=args.seq_len,
num_heads=args.num_heads,
head_size=args.head_size,
rotary_dim=args.rotary_dim,
dtype=getattr(torch, args.dtype),
seed=args.seed,
device=args.device,
)
This diff is collapsed.
#!/usr/bin/env python3
#
# A command line tool for running pytorch's hipify preprocessor on CUDA
# source files.
#
# See https://github.com/ROCm/hipify_torch
# and <torch install dir>/utils/hipify/hipify_python.py
#
import argparse
import os
import shutil
from torch.utils.hipify.hipify_python import hipify
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# Project directory where all the source + include files live.
parser.add_argument(
"-p",
"--project_dir",
help="The project directory.",
)
# Directory where hipified files are written.
parser.add_argument(
"-o",
"--output_dir",
help="The output directory.",
)
# Source files to convert.
parser.add_argument("sources",
help="Source files to hipify.",
nargs="*",
default=[])
args = parser.parse_args()
# Limit include scope to project_dir only
includes = [os.path.join(args.project_dir, '*')]
# Get absolute path for all source files.
extra_files = [os.path.abspath(s) for s in args.sources]
# Copy sources from project directory to output directory.
# The directory might already exist to hold object files so we ignore that.
shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)
hipify_result = hipify(project_directory=args.project_dir,
output_directory=args.output_dir,
header_include_dirs=[],
includes=includes,
extra_files=extra_files,
show_detailed=True,
is_pytorch_extension=True,
hipify_extra_files_only=True)
hipified_sources = []
for source in args.sources:
s_abs = os.path.abspath(source)
hipified_s_abs = (hipify_result[s_abs].hipified_path if
(s_abs in hipify_result
and hipify_result[s_abs].hipified_path is not None)
else s_abs)
hipified_sources.append(hipified_s_abs)
assert (len(hipified_sources) == len(args.sources))
# Print hipified source files.
print("\n".join(hipified_sources))
#
# Attempt to find the python package that uses the same python executable as
# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
#
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
set(Python_EXECUTABLE ${EXECUTABLE})
find_package(Python COMPONENTS Interpreter Development.Module)
if (NOT Python_FOUND)
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
endif()
set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
message(FATAL_ERROR
"Python version (${_VER}) is not one of the supported versions: "
"${_SUPPORTED_VERSIONS_LIST}.")
endif()
message(STATUS "Found python matching: ${EXECUTABLE}.")
endmacro()
#
# Run `EXPR` in python. The standard output of python is stored in `OUT` and
# has trailing whitespace stripped. If an error is encountered when running
# python, a fatal message `ERR_MSG` is issued.
#
function (run_python OUT EXPR ERR_MSG)
execute_process(
COMMAND
"${Python_EXECUTABLE}" "-c" "${EXPR}"
OUTPUT_VARIABLE PYTHON_OUT
RESULT_VARIABLE PYTHON_ERROR_CODE
ERROR_VARIABLE PYTHON_STDERR
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT PYTHON_ERROR_CODE EQUAL 0)
message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
endif()
set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
endfunction()
# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
macro (append_cmake_prefix_path PKG EXPR)
run_python(_PREFIX_PATH
"import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
endmacro()
#
# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
# of CUDA source files. The names of the corresponding "hipified" sources are
# stored in `OUT_SRCS`.
#
function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
#
# Split into C++ and non-C++ (i.e. CUDA) sources.
#
set(SRCS ${ORIG_SRCS})
set(CXX_SRCS ${ORIG_SRCS})
list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")
#
# Generate ROCm/HIP source file names from CUDA file names.
# Since HIP files are generated code, they will appear in the build area
# `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
#
set(HIP_SRCS)
foreach (SRC ${SRCS})
string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC})
string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
endforeach()
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
add_custom_target(
hipify${NAME}
COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
BYPRODUCTS ${HIP_SRCS}
COMMENT "Running hipify on ${NAME} extension source files.")
# Swap out original extension sources with hipified sources.
list(APPEND HIP_SRCS ${CXX_SRCS})
set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
endfunction()
#
# Get additional GPU compiler flags from torch.
#
function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
if (${GPU_LANG} STREQUAL "CUDA")
#
# Get common NVCC flags from torch.
#
run_python(GPU_FLAGS
"from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
"Failed to determine torch nvcc compiler flags")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
endif()
elseif(${GPU_LANG} STREQUAL "HIP")
#
# Get common HIP/HIPCC flags from torch.
#
run_python(GPU_FLAGS
"import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
"Failed to determine torch nvcc compiler flags")
list(APPEND GPU_FLAGS
"-DUSE_ROCM"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc")
endif()
set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
endfunction()
# Macro for converting a `gencode` version number to a cmake version number.
macro(string_to_ver OUT_VER IN_STR)
string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
endmacro()
#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
#
macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")
if (${GPU_LANG} STREQUAL "HIP")
#
# `GPU_ARCHES` controls the `--offload-arch` flags.
# `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
# via the `PYTORCH_ROCM_ARCH` env variable.
#
#
# Find the intersection of the supported + detected architectures to
# set the module architecture flags.
#
set(${GPU_ARCHES})
foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
list(APPEND ${GPU_ARCHES} ${_ARCH})
endif()
endforeach()
if(NOT ${GPU_ARCHES})
message(FATAL_ERROR
"None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
" supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
endif()
elseif(${GPU_LANG} STREQUAL "CUDA")
#
# Setup/process CUDA arch flags.
#
# The torch cmake setup hardcodes the detected architecture flags in
# `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
# can't modified on a per-target basis, e.g. for the `punica` extension.
# So, all the `-gencode` flags need to be extracted and removed from
# `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
# Since it's not possible to use `target_compiler_options` for adding target
# specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
# must be used instead. This requires repackaging the architecture flags
# into a format that cmake expects for `CUDA_ARCHITECTURES`.
#
# This is a bit fragile in that it depends on torch using `-gencode` as opposed
# to one of the other nvcc options to specify architectures.
#
# Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
# detected architectures.
#
message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
# Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
${CMAKE_CUDA_FLAGS})
# Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
# and passed back via the `CUDA_ARCHITECTURES` property.
string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
${CMAKE_CUDA_FLAGS})
# If this error is triggered, it might mean that torch has changed how it sets
# up nvcc architecture code generation flags.
if (NOT _CUDA_ARCH_FLAGS)
message(FATAL_ERROR
"Could not find any architecture related code generation flags in "
"CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
endif()
message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")
# Initialize the architecture lists to empty.
set(${GPU_ARCHES})
# Process each `gencode` flag.
foreach(_ARCH ${_CUDA_ARCH_FLAGS})
# For each flag, extract the version number and whether it refers to PTX
# or native code.
# Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
# for that match.
string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
if (_COMPUTE)
set(_COMPUTE ${CMAKE_MATCH_1})
endif()
string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
if (_SM)
set(_SM ${CMAKE_MATCH_1})
endif()
string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
if (_CODE)
set(_CODE ${CMAKE_MATCH_1})
endif()
# Make sure the virtual architecture can be matched.
if (NOT _COMPUTE)
message(FATAL_ERROR
"Could not determine virtual architecture from: ${_ARCH}.")
endif()
# One of sm_ or compute_ must exist.
if ((NOT _SM) AND (NOT _CODE))
message(FATAL_ERROR
"Could not determine a codegen architecture from: ${_ARCH}.")
endif()
if (_SM)
# -real suffix let CMake to only generate elf code for the kernels.
# we want this, otherwise the added ptx (default) will increase binary size.
set(_VIRT "-real")
set(_CODE_ARCH ${_SM})
else()
# -virtual suffix let CMake to generate ptx code for the kernels.
set(_VIRT "-virtual")
set(_CODE_ARCH ${_CODE})
endif()
# Check if the current version is in the supported arch list.
string_to_ver(_CODE_VER ${_CODE_ARCH})
if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
message(STATUS "discarding unsupported CUDA arch ${_VER}.")
continue()
endif()
# Add it to the arch list.
list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
endforeach()
endif()
message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
endmacro()
#
# Define a target named `GPU_MOD_NAME` for a single extension. The
# arguments are:
#
# DESTINATION <dest> - Module destination directory.
# LANGUAGE <lang> - The GPU language for this module, e.g CUDA, HIP,
# etc.
# SOURCES <sources> - List of source files relative to CMakeLists.txt
# directory.
#
# Optional arguments:
#
# ARCHITECTURES <arches> - A list of target GPU architectures in cmake
# format.
# Refer `CMAKE_CUDA_ARCHITECTURES` documentation
# and `CMAKE_HIP_ARCHITECTURES` for more info.
# ARCHITECTURES will use cmake's defaults if
# not provided.
# COMPILE_FLAGS <flags> - Extra compiler flags passed to NVCC/hip.
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# LIBRARIES <libraries> - Extra link libraries.
# WITH_SOABI - Generate library with python SOABI suffix name.
#
# Note: optimization level/debug info is set via cmake build type.
#
function (define_gpu_extension_target GPU_MOD_NAME)
cmake_parse_arguments(PARSE_ARGV 1
GPU
"WITH_SOABI"
"DESTINATION;LANGUAGE"
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
# Add hipify preprocessing step when building with HIP/ROCm.
if (GPU_LANGUAGE STREQUAL "HIP")
hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
endif()
if (GPU_WITH_SOABI)
set(GPU_WITH_SOABI WITH_SOABI)
else()
set(GPU_WITH_SOABI)
endif()
Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
if (GPU_LANGUAGE STREQUAL "HIP")
# Make this target dependent on the hipify preprocessor step.
add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
endif()
if (GPU_ARCHITECTURES)
set_target_properties(${GPU_MOD_NAME} PROPERTIES
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
endif()
set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
target_compile_options(${GPU_MOD_NAME} PRIVATE
$<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
target_compile_definitions(${GPU_MOD_NAME} PRIVATE
"-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
${GPU_INCLUDE_DIRECTORIES})
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
${GPU_LIBRARIES})
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
# dependencies that are not necessary and may not be installed.
if (GPU_LANGUAGE STREQUAL "CUDA")
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
${CUDA_LIBRARIES})
else()
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
endif()
install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
endfunction()
This diff is collapsed.
...@@ -33,12 +33,25 @@ template<typename T> ...@@ -33,12 +33,25 @@ template<typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) { __device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation. // Equivalent to PyTorch GELU with 'none' approximation.
// Refer to: // Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
const float f = (float) x; const float f = (float) x;
constexpr float ALPHA = M_SQRT1_2; constexpr float ALPHA = M_SQRT1_2;
return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
} }
template<typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
const float f = (float) x;
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = f * f * f;
float inner = BETA * (f + KAPPA * x_cube);
return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
}
} // namespace vllm } // namespace vllm
// Launch activation and gating kernel. // Launch activation and gating kernel.
...@@ -73,6 +86,13 @@ void gelu_and_mul( ...@@ -73,6 +86,13 @@ void gelu_and_mul(
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
} }
void gelu_tanh_and_mul(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
}
namespace vllm { namespace vllm {
// Element-wise activation kernel template. // Element-wise activation kernel template.
......
...@@ -15,9 +15,6 @@ ...@@ -15,9 +15,6 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif
#include <torch/extension.h> #include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
...@@ -31,11 +28,6 @@ ...@@ -31,11 +28,6 @@
#include <algorithm> #include <algorithm>
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
......
#pragma once #pragma once
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif
#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#ifndef USE_ROCM #ifndef USE_ROCM
#define VLLM_LDG(arg) __ldg(arg) #define VLLM_LDG(arg) __ldg(arg)
#else #else
......
...@@ -29,7 +29,7 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data, ...@@ -29,7 +29,7 @@ fptr_t init_custom_ar(torch::Tensor &meta, torch::Tensor &rank_data,
std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t));
} }
return (fptr_t) new vllm::CustomAllreduce( return (fptr_t) new vllm::CustomAllreduce(
reinterpret_cast<vllm::Metadata *>(meta.data_ptr()), rank_data.data_ptr(), reinterpret_cast<vllm::Signal *>(meta.data_ptr()), rank_data.data_ptr(),
rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
} }
...@@ -62,9 +62,9 @@ bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size, ...@@ -62,9 +62,9 @@ bool should_custom_ar(torch::Tensor &inp, int max_size, int world_size,
if (inp_size % 16 != 0) return false; if (inp_size % 16 != 0) return false;
if (!_is_weak_contiguous(inp)) return false; if (!_is_weak_contiguous(inp)) return false;
if (world_size == 2 || full_nvlink) return inp_size <= max_size; if (world_size == 2 || full_nvlink) return inp_size <= max_size;
// 4 PCIE GPUs use 2 stage allreduce, and is only faster than NCCL when size // for 4 or more non NVLink-capable GPUs, custom allreduce provides little
// <= 512k // performance improvement over NCCL.
return world_size <= 4 && inp_size <= 512 * 1024; return false;
} }
void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out, void _all_reduce(fptr_t _fa, torch::Tensor &inp, torch::Tensor &out,
...@@ -126,7 +126,7 @@ void dispose(fptr_t _fa) { ...@@ -126,7 +126,7 @@ void dispose(fptr_t _fa) {
delete fa; delete fa;
} }
int meta_size() { return sizeof(vllm::Metadata); } int meta_size() { return sizeof(vllm::Signal); }
void register_buffer(fptr_t _fa, torch::Tensor &t, void register_buffer(fptr_t _fa, torch::Tensor &t,
const std::vector<std::string> &handles, const std::vector<std::string> &handles,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment