"vllm/vscode:/vscode.git/clone" did not exist on "baee28c46c242b72f90d6b1211ab9d7872ab05d3"
Commit 7c4f76e3 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.0

parents 2da0dd3e 51c31bc1
from abc import ABC, abstractmethod
from typing import List, Optional
from transformers import PreTrainedTokenizer
from vllm.lora.request import LoRARequest
class BaseTokenizerGroup(ABC):
"""A group of tokenizers that can be used for LoRA adapters."""
@abstractmethod
def ping(self) -> bool:
"""Check if the tokenizer group is alive."""
pass
@abstractmethod
def get_max_input_len(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
"""Get the maximum input length for the LoRA request."""
pass
@abstractmethod
def encode(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
"""Encode a prompt using the tokenizer group."""
pass
@abstractmethod
async def encode_async(
self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
"""Encode a prompt using the tokenizer group."""
pass
@abstractmethod
def get_lora_tokenizer(
self,
lora_request: Optional[LoRARequest] = None
) -> "PreTrainedTokenizer":
"""Get a tokenizer for a LoRA request."""
pass
@abstractmethod
async def get_lora_tokenizer_async(
self,
lora_request: Optional[LoRARequest] = None
) -> "PreTrainedTokenizer":
"""Get a tokenizer for a LoRA request."""
pass
import asyncio
import os
from typing import List, Optional
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
from transformers import PreTrainedTokenizer
from vllm.config import TokenizerPoolConfig
from vllm.engine.ray_utils import ray
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
TokenizerGroup)
class RayTokenizerGroupPool(BaseTokenizerGroup):
"""A Ray-based pool of TokenizerGroups for async tokenization."""
# Class to use for workers making up the pool.
_worker_cls = TokenizerGroup
@classmethod
def from_config(cls, tokenizer_pool_config: TokenizerPoolConfig,
**init_kwargs) -> "RayTokenizerGroupPool":
ray_actor_options = (tokenizer_pool_config.extra_config or {
"num_cpus": 0
})
ray_actor_options.setdefault(
"scheduling_strategy",
NodeAffinitySchedulingStrategy(
node_id=ray.get_runtime_context().get_node_id(), soft=True))
# Carry over the env vars to the actors.
# This is necessary for API keys and such.
ray_actor_options.setdefault("runtime_env", {})
_carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])
init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
init_kwargs["ray_actor_options"] = ray_actor_options
return cls(**init_kwargs)
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
max_input_length: Optional[int], num_actors: int,
ray_actor_options: dict, **tokenizer_config):
# Store a local copy of the TokenizerGroup for quick access
# to underlying HF tokenizers.
self._local_tokenizer_group = self._worker_cls(
tokenizer_id=tokenizer_id,
enable_lora=enable_lora,
max_num_seqs=max_num_seqs,
max_input_length=max_input_length,
)
ray_tokenizer_group_cls = ray.remote(
self._worker_cls).options(**ray_actor_options)
self.tokenizer_actors = [
ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora,
max_num_seqs, max_input_length,
**tokenizer_config)
for _ in range(num_actors)
]
self._idle_actors: Optional[asyncio.Queue] = None
@property
def pool_size(self) -> int:
return len(self.tokenizer_actors)
def ping(self):
return ray.get(
[actor.ping.remote() for actor in self.tokenizer_actors])
def _ensure_queue_initialized(self):
if self._idle_actors is None:
self._idle_actors = asyncio.Queue()
for actor in self.tokenizer_actors:
self._idle_actors.put_nowait(actor)
def encode(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
"""Encode a prompt using the tokenizer group.
We pick an idle actor and use it to encode the prompt.
The actor is then put back in the queue for future use.
This is blocking.
"""
self._ensure_queue_initialized()
if self._idle_actors.empty():
raise RuntimeError("No idle actors available.")
actor = self._idle_actors.get_nowait()
try:
ret = ray.get(
actor.encode.remote(request_id=request_id,
prompt=prompt,
lora_request=lora_request))
finally:
# Put the actor back in the queue.
# This is done in a finally block to ensure that the actor is
# always put back in the queue, even if an exception/cancellation
# is raised.
self._idle_actors.put_nowait(actor)
return ret
async def encode_async(
self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
"""Encode a prompt using the tokenizer group.
We pick an idle actor and use it to encode the prompt.
If there are no idle actors, we wait until one becomes
available.
The actor is then put back in the queue for future use.
This is non-blocking.
"""
self._ensure_queue_initialized()
actor = await self._idle_actors.get()
try:
ret = await actor.encode.remote(request_id=request_id,
prompt=prompt,
lora_request=lora_request)
finally:
# Put the actor back in the queue.
# This is done in a finally block to ensure that the actor is
# always put back in the queue, even if an exception/cancellation
# is raised.
self._idle_actors.put_nowait(actor)
return ret
def get_max_input_len(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
"""Get the maximum input length for the LoRA request."""
return self._local_tokenizer_group.get_max_input_len(lora_request)
def get_lora_tokenizer(
self,
lora_request: Optional[LoRARequest] = None
) -> "PreTrainedTokenizer":
return self._local_tokenizer_group.get_lora_tokenizer(lora_request)
async def get_lora_tokenizer_async(
self,
lora_request: Optional[LoRARequest] = None
) -> "PreTrainedTokenizer":
return await self._local_tokenizer_group.get_lora_tokenizer_async(
lora_request)
def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
"""Copy over all current process environment variables to the runtime_env.
The variables in runtime_env will take precedence over the current process
environment variables.
runtime_env will be modified in place."""
env_vars = os.environ.copy()
runtime_env.setdefault("env_vars", {})
env_vars.update(runtime_env["env_vars"])
runtime_env["env_vars"] = env_vars
from typing import List, Optional
from transformers import PreTrainedTokenizer
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (get_lora_tokenizer,
get_lora_tokenizer_async,
get_tokenizer)
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.utils import LRUCache
class TokenizerGroup(BaseTokenizerGroup):
"""A group of tokenizers that can be used for LoRA adapters."""
def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
max_input_length: Optional[int], **tokenizer_config):
self.tokenizer_id = tokenizer_id
self.tokenizer_config = tokenizer_config
self.enable_lora = enable_lora
self.max_input_length = max_input_length
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
self.lora_tokenizers = LRUCache[PreTrainedTokenizer](
capacity=max_num_seqs) if enable_lora else None
def ping(self) -> bool:
"""Check if the tokenizer group is alive."""
return True
def get_max_input_len(self,
lora_request: Optional[LoRARequest] = None
) -> Optional[int]:
"""Get the maximum input length for the LoRA request."""
return self.max_input_length
def encode(self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
tokenizer = self.get_lora_tokenizer(lora_request)
return tokenizer.encode(prompt)
async def encode_async(
self,
prompt: str,
request_id: Optional[str] = None,
lora_request: Optional[LoRARequest] = None) -> List[int]:
tokenizer = await self.get_lora_tokenizer_async(lora_request)
return tokenizer.encode(prompt)
def get_lora_tokenizer(
self,
lora_request: Optional[LoRARequest] = None
) -> "PreTrainedTokenizer":
if not lora_request or not self.enable_lora:
return self.tokenizer
if lora_request.lora_int_id not in self.lora_tokenizers:
tokenizer = (get_lora_tokenizer(
lora_request, **self.tokenizer_config) or self.tokenizer)
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
return tokenizer
else:
return self.lora_tokenizers.get(lora_request.lora_int_id)
async def get_lora_tokenizer_async(
self,
lora_request: Optional[LoRARequest] = None
) -> "PreTrainedTokenizer":
if not lora_request or not self.enable_lora:
return self.tokenizer
if lora_request.lora_int_id not in self.lora_tokenizers:
tokenizer = (await get_lora_tokenizer_async(
lora_request, **self.tokenizer_config) or self.tokenizer)
self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
return tokenizer
else:
return self.lora_tokenizers.get(lora_request.lora_int_id)
# yapf: disable
# Adapted from
# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
# This includes a fix suggested in
......@@ -13,7 +12,6 @@ import sentencepiece as spm
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
......@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
clean_up_tokenization_spaces=False,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str)
else bos_token
)
eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False)
if isinstance(eos_token, str)
else eos_token
)
unk_token = (
AddedToken(unk_token, lstrip=False, rstrip=False)
if isinstance(unk_token, str)
else unk_token
)
pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str)
else pad_token
)
self.sp_model_kwargs = ({} if sp_model_kwargs is None else
sp_model_kwargs)
bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str) else bos_token)
eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
if isinstance(eos_token, str) else eos_token)
unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
if isinstance(unk_token, str) else unk_token)
pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str) else pad_token)
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
......@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer):
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
......@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# make sure that special tokens are not decoded using sentencepiece model
# make sure that special tokens are not decoded using
# sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
......@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
def save_vocabulary(
self, save_directory, filename_prefix: Optional[str] = None
) -> Tuple[str]:
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
......@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer):
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
logger.error(f"Vocabulary path ({save_directory}) "
"should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "")
+ VOCAB_FILES_NAMES["vocab_file"],
(filename_prefix + "-" if filename_prefix else "") +
VOCAB_FILES_NAMES["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file
) and os.path.isfile(self.vocab_file):
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
......@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
already_has_special_tokens: bool = False,
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
Retrieve sequence ids from a token list that has no special tokens
added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
......@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
already_has_special_tokens (`bool`, *optional*, defaults to
`False`):
Whether or not the token list is already formatted with
special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[int]`: A list of integers in the range [0, 1]:
1 for a special token, 0 for a sequence token.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
......@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (
bos_token_id
+ ([0] * len(token_ids_0))
+ eos_token_id
+ bos_token_id
+ ([0] * len(token_ids_1))
+ eos_token_id
)
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
Creates a mask from the two sequences passed to be used in a
sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
......@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
`List[int]`: List of [token type IDs](../glossary#token-type-ids)
according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
......
import datetime
import json
import logging
import os
import platform
import time
from enum import Enum
from pathlib import Path
from threading import Thread
from typing import Dict, Optional
from uuid import uuid4
import cpuinfo
import psutil
import requests
import torch
_config_home = os.getenv("XDG_CONFIG_HOME", os.path.expanduser("~/.config"))
_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
"vllm/do_not_track")
_USAGE_STATS_ENABLED = None
_USAGE_STATS_SERVER = os.environ.get("VLLM_USAGE_STATS_SERVER",
"https://stats.vllm.ai")
def is_usage_stats_enabled():
"""Determine whether or not we can send usage stats to the server.
The logic is as follows:
- By default, it should be enabled.
- Two environment variables can disable it:
- DO_NOT_TRACK=1
- VLLM_NO_USAGE_STATS=1
- A file in the home directory can disable it if it exists:
- $HOME/.config/vllm/do_not_track
"""
global _USAGE_STATS_ENABLED
if _USAGE_STATS_ENABLED is None:
do_not_track = os.environ.get("DO_NOT_TRACK", "0") == "1"
no_usage_stats = os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1"
do_not_track_file = os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH)
_USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats
or do_not_track_file)
return _USAGE_STATS_ENABLED
def _get_current_timestamp_ns() -> int:
return int(datetime.datetime.now(datetime.timezone.utc).timestamp() * 1e9)
def _detect_cloud_provider() -> str:
# Try detecting through vendor file
vendor_files = [
"/sys/class/dmi/id/product_version", "/sys/class/dmi/id/bios_vendor",
"/sys/class/dmi/id/product_name",
"/sys/class/dmi/id/chassis_asset_tag", "/sys/class/dmi/id/sys_vendor"
]
# Mapping of identifiable strings to cloud providers
cloud_identifiers = {
"amazon": "AWS",
"microsoft corporation": "AZURE",
"google": "GCP",
"oraclecloud": "OCI",
}
for vendor_file in vendor_files:
path = Path(vendor_file)
if path.is_file():
file_content = path.read_text().lower()
for identifier, provider in cloud_identifiers.items():
if identifier in file_content:
return provider
# Try detecting through environment variables
env_to_cloud_provider = {
"RUNPOD_DC_ID": "RUNPOD",
}
for env_var, provider in env_to_cloud_provider.items():
if os.environ.get(env_var):
return provider
return "UNKNOWN"
class UsageContext(str, Enum):
UNKNOWN_CONTEXT = "UNKNOWN_CONTEXT"
LLM_CLASS = "LLM_CLASS"
API_SERVER = "API_SERVER"
OPENAI_API_SERVER = "OPENAI_API_SERVER"
ENGINE_CONTEXT = "ENGINE_CONTEXT"
class UsageMessage:
"""Collect platform information and send it to the usage stats server."""
def __init__(self) -> None:
# NOTE: vLLM's server _only_ support flat KV pair.
# Do not use nested fields.
self.uuid = str(uuid4())
# Environment Information
self.provider: Optional[str] = None
self.num_cpu: Optional[int] = None
self.cpu_type: Optional[str] = None
self.cpu_family_model_stepping: Optional[str] = None
self.total_memory: Optional[int] = None
self.architecture: Optional[str] = None
self.platform: Optional[str] = None
self.gpu_count: Optional[int] = None
self.gpu_type: Optional[str] = None
self.gpu_memory_per_device: Optional[int] = None
# vLLM Information
self.model_architecture: Optional[str] = None
self.vllm_version: Optional[str] = None
self.context: Optional[str] = None
# Metadata
self.log_time: Optional[int] = None
self.source: Optional[str] = None
def report_usage(self,
model_architecture: str,
usage_context: UsageContext,
extra_kvs: Dict[str, any] = None) -> None:
t = Thread(target=self._report_usage_worker,
args=(model_architecture, usage_context, extra_kvs or {}),
daemon=True)
t.start()
def _report_usage_worker(self, model_architecture: str,
usage_context: UsageContext,
extra_kvs: Dict[str, any]) -> None:
self._report_usage_once(model_architecture, usage_context, extra_kvs)
self._report_continous_usage()
def _report_usage_once(self, model_architecture: str,
usage_context: UsageContext,
extra_kvs: Dict[str, any]) -> None:
# Platform information
if torch.cuda.is_available():
device_property = torch.cuda.get_device_properties(0)
self.gpu_count = torch.cuda.device_count()
self.gpu_type = device_property.name
self.gpu_memory_per_device = device_property.total_memory
self.provider = _detect_cloud_provider()
self.architecture = platform.machine()
self.platform = platform.platform()
self.total_memory = psutil.virtual_memory().total
info = cpuinfo.get_cpu_info()
self.num_cpu = info.get("count", None)
self.cpu_type = info.get("brand_raw", "")
self.cpu_family_model_stepping = ",".join([
str(info.get("family", "")),
str(info.get("model", "")),
str(info.get("stepping", ""))
])
# vLLM information
import vllm # delayed import to prevent circular import
self.context = usage_context.value
self.vllm_version = vllm.__version__
self.model_architecture = model_architecture
# Metadata
self.log_time = _get_current_timestamp_ns()
self.source = os.environ.get("VLLM_USAGE_SOURCE", "production")
data = vars(self)
if extra_kvs:
data.update(extra_kvs)
self._write_to_file(data)
self._send_to_server(data)
def _report_continous_usage(self):
"""Report usage every 10 minutes.
This helps us to collect more data points for uptime of vLLM usages.
This function can also help send over performance metrics over time.
"""
while True:
time.sleep(600)
data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}
self._write_to_file(data)
self._send_to_server(data)
def _send_to_server(self, data):
try:
requests.post(_USAGE_STATS_SERVER, json=data)
except requests.exceptions.RequestException:
# silently ignore unless we are using debug log
logging.debug("Failed to send usage data to server")
def _write_to_file(self, data):
os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True)
Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True)
with open(_USAGE_STATS_JSON_PATH, "a") as f:
json.dump(data, f)
f.write("\n")
usage_message = UsageMessage()
import asyncio
import enum
import gc
import os
import socket
import subprocess
import uuid
import warnings
from collections import OrderedDict
from functools import lru_cache, partial
from platform import uname
from typing import List, Tuple, Union
from packaging.version import parse, Version
from typing import (Any, Awaitable, Callable, Generic, Hashable, List,
Optional, Tuple, TypeVar, Union)
import psutil
import torch
import asyncio
from functools import partial
from typing import (
Awaitable,
Callable,
TypeVar,
)
from collections import OrderedDict
from typing import Any, Hashable, Optional
from packaging.version import Version, parse
from vllm.logger import init_logger
......@@ -51,10 +48,10 @@ class Counter:
self.counter = 0
class LRUCache:
class LRUCache(Generic[T]):
def __init__(self, capacity: int):
self.cache = OrderedDict()
self.cache = OrderedDict[Hashable, T]()
self.capacity = capacity
def __contains__(self, key: Hashable) -> bool:
......@@ -63,10 +60,10 @@ class LRUCache:
def __len__(self) -> int:
return len(self.cache)
def __getitem__(self, key: Hashable) -> Any:
def __getitem__(self, key: Hashable) -> T:
return self.get(key)
def __setitem__(self, key: Hashable, value: Any) -> None:
def __setitem__(self, key: Hashable, value: T) -> None:
self.put(key, value)
def __delitem__(self, key: Hashable) -> None:
......@@ -75,7 +72,9 @@ class LRUCache:
def touch(self, key: Hashable) -> None:
self.cache.move_to_end(key)
def get(self, key: Hashable, default_value: Optional[Any] = None) -> int:
def get(self,
key: Hashable,
default_value: Optional[T] = None) -> Optional[T]:
if key in self.cache:
value = self.cache[key]
self.cache.move_to_end(key)
......@@ -83,12 +82,12 @@ class LRUCache:
value = default_value
return value
def put(self, key: Hashable, value: Any) -> None:
def put(self, key: Hashable, value: T) -> None:
self.cache[key] = value
self.cache.move_to_end(key)
self._remove_old_if_needed()
def _on_remove(self, key: Hashable, value: Any):
def _on_remove(self, key: Hashable, value: T):
pass
def remove_oldest(self):
......@@ -101,7 +100,7 @@ class LRUCache:
while len(self.cache) > self.capacity:
self.remove_oldest()
def pop(self, key: int, default_value: Optional[Any] = None) -> Any:
def pop(self, key: Hashable, default_value: Optional[Any] = None) -> T:
run_on_remove = key in self.cache
value = self.cache.pop(key, default_value)
if run_on_remove:
......@@ -118,6 +117,7 @@ def is_hip() -> bool:
return torch.version.hip is not None
@lru_cache(maxsize=None)
def is_neuron() -> bool:
try:
import transformers_neuronx
......@@ -126,15 +126,17 @@ def is_neuron() -> bool:
return transformers_neuronx is not None
@lru_cache(maxsize=None)
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
# NOTE: This import statement should be executed lazily since
# the Neuron-X backend does not have the `cuda_utils` module.
from vllm._C import cuda_utils
max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute(
gpu)
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail
max_shared_mem = (
cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
# value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
# will fail
assert max_shared_mem > 0, "max_shared_mem can not be zero"
return int(max_shared_mem)
......@@ -148,6 +150,7 @@ def random_uuid() -> str:
return str(uuid.uuid4().hex)
@lru_cache(maxsize=None)
def in_wsl() -> bool:
# Reference: https://github.com/microsoft/WSL/issues/4071
return "microsoft" in " ".join(uname()).lower()
......@@ -170,20 +173,41 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
def get_ip() -> str:
host_ip = os.environ.get("HOST_IP")
if host_ip:
return host_ip
# IP is not set, try to get it from the network interface
# try ipv4
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
s.connect(("dns.google", 80)) # Doesn't need to be reachable
s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable
return s.getsockname()[0]
except OSError:
except Exception:
pass
# try ipv6
try:
s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
s.connect(("dns.google", 80))
# Google's public DNS server, see
# https://developers.google.com/speed/public-dns/docs/using#addresses
s.connect(("2001:4860:4860::8888", 80)) # Doesn't need to be reachable
return s.getsockname()[0]
except Exception:
pass
warnings.warn(
"Failed to get the IP address, using 0.0.0.0 by default."
"The value can be set by the environment variable HOST_IP.",
stacklevel=2)
return "0.0.0.0"
def get_distributed_init_method(ip: str, port: int) -> str:
return f"tcp://{ip}:{port}"
# Brackets are not permitted in ipv4 addresses,
# see https://github.com/python/cpython/issues/103848
return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}"
def get_open_port() -> int:
......@@ -203,14 +227,24 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None:
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids))
def chunk_list(lst, chunk_size):
"""Yield successive chunk_size chunks from lst."""
return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
def cdiv(a: int, b: int) -> int:
"""Ceiling division."""
return -(a // -b)
@lru_cache(maxsize=None)
def get_nvcc_cuda_version() -> Optional[Version]:
cuda_home = os.environ.get('CUDA_HOME')
if not cuda_home:
cuda_home = '/usr/local/cuda'
if os.path.isfile(cuda_home + '/bin/nvcc'):
logger.info(
f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.'
)
logger.info(f'CUDA_HOME is not found in the environment. '
f'Using {cuda_home} as CUDA_HOME.')
else:
logger.warning(
f'Not found nvcc in {cuda_home}. Skip cuda version check!')
......@@ -309,3 +343,98 @@ def create_kv_caches_with_random(
f"Does not support value cache of type {cache_dtype}")
value_caches.append(value_cache)
return key_caches, value_caches
@lru_cache
def print_warning_once(msg: str) -> None:
logger.warning(msg)
@lru_cache(maxsize=None)
def is_pin_memory_available() -> bool:
if in_wsl():
# Pinning memory in WSL is not supported.
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
print_warning_once("Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance.")
return False
elif is_neuron():
print_warning_once("Pin memory is not supported on Neuron.")
return False
return True
class CudaMemoryProfiler:
def __init__(self, device=None):
self.device = device
def current_memory_usage(self) -> float:
# Return the memory usage in bytes.
torch.cuda.reset_peak_memory_stats(self.device)
mem = torch.cuda.max_memory_allocated(self.device)
return mem
def __enter__(self):
self.initial_memory = self.current_memory_usage()
# This allows us to call methods of the context manager if needed
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.final_memory = self.current_memory_usage()
self.consumed_memory = self.final_memory - self.initial_memory
# Force garbage collection
gc.collect()
def str_to_int_tuple(s: str) -> Tuple[int]:
"""Convert a string to a tuple of integers."""
try:
return tuple(map(int, s.split(",")))
except ValueError as e:
raise ValueError(
"String must be a series of integers separated by commas "
f"(e.g., 1, 2, 3). Given input: {s}") from e
def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
assert len(x) <= max_len
return x + [pad] * (max_len - len(x))
def make_tensor_with_pad(
x: List[List[int]],
max_len: int,
pad: int,
dtype: torch.dtype,
device: Optional[Union[str, torch.device]],
) -> torch.Tensor:
"""Make a padded tensor of a 2D inputs.
The padding is applied to the end of each inner list until it reaches
`max_len`.
"""
padded_x = [pad_to_max_length(x_i, max_len, pad) for x_i in x]
return torch.tensor(padded_x, dtype=dtype, device=device)
def async_tensor_h2d(
data: list,
dtype: torch.dtype,
target_device: Union[str, torch.device],
pin_memory: bool,
) -> torch.Tensor:
"""Asynchronously create a tensor and copy it from host to device."""
t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
return t.to(device=target_device, non_blocking=True)
def maybe_expand_dim(tensor: torch.Tensor,
target_dims: int,
size: int = 1) -> torch.Tensor:
"""Expand the tensor to the target_dims."""
if tensor.ndim < target_dims:
tensor = tensor.view(-1, *([size] * (target_dims - tensor.ndim)))
return tensor
"""CacheEngine class for managing the KV cache."""
from typing import Dict, List, Tuple
from typing import Dict, List
import torch
from vllm.attention import get_attn_backend
from vllm.config import CacheConfig, ModelConfig, ParallelConfig
from vllm.logger import init_logger
from vllm.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available
logger = init_logger(__name__)
KVCache = Tuple[torch.Tensor, torch.Tensor]
class CacheEngine:
"""Manages the KV cache.
......@@ -38,119 +37,48 @@ class CacheEngine:
self.num_gpu_blocks = cache_config.num_gpu_blocks
self.num_cpu_blocks = cache_config.num_cpu_blocks
# Skip initializing CUDA stream and buffer for Neuron backend.
if is_neuron():
return
if cache_config.cache_dtype == "auto":
self.dtype = model_config.dtype
else:
self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
# Get attention backend.
self.attn_backend = get_attn_backend(model_config.dtype)
# Initialize the cache.
self.gpu_cache = self.allocate_gpu_cache()
self.cpu_cache = self.allocate_cpu_cache()
# Initialize the stream for caching operations.
self.cache_stream = torch.cuda.Stream()
assert self.cache_stream != torch.cuda.current_stream()
# Initialize the events for stream synchronization.
self.events = [torch.cuda.Event() for _ in range(self.num_layers)]
def get_key_block_shape(self) -> Tuple[int, int, int, int]:
element_size = torch.tensor([], dtype=self.dtype).element_size()
x = 16 // element_size
return (
self.num_heads,
self.head_size // x,
self.block_size,
x,
)
def get_value_block_shape(self) -> Tuple[int, int, int]:
return (
self.num_heads,
self.head_size,
self.block_size,
)
def allocate_gpu_cache(self) -> List[KVCache]:
gpu_cache: List[KVCache] = []
key_block_shape = self.get_key_block_shape()
value_block_shape = self.get_value_block_shape()
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_gpu_blocks, *key_block_shape),
dtype=self.dtype,
device="cuda",
)
value_blocks = torch.empty(
size=(self.num_gpu_blocks, *value_block_shape),
dtype=self.dtype,
device="cuda",
)
gpu_cache.append((key_blocks, value_blocks))
return gpu_cache
def allocate_cpu_cache(self) -> List[KVCache]:
cpu_cache: List[KVCache] = []
key_block_shape = self.get_key_block_shape()
value_block_shape = self.get_value_block_shape()
pin_memory = not in_wsl()
if not pin_memory:
# Pinning memory in WSL is not supported.
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
logger.warning("Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance.")
self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda")
self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu")
def _allocate_kv_cache(
self,
num_blocks: int,
device: str,
) -> List[torch.Tensor]:
"""Allocates KV cache on the specified device."""
kv_cache_shape = self.attn_backend.get_kv_cache_shape(
num_blocks, self.block_size, self.num_heads, self.head_size)
pin_memory = is_pin_memory_available() if device == "cpu" else False
kv_cache: List[torch.Tensor] = []
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_cpu_blocks, *key_block_shape),
dtype=self.dtype,
pin_memory=pin_memory,
device="cpu",
)
value_blocks = torch.empty(
size=(self.num_cpu_blocks, *value_block_shape),
kv_cache.append(
torch.empty(kv_cache_shape,
dtype=self.dtype,
pin_memory=pin_memory,
device="cpu",
)
cpu_cache.append((key_blocks, value_blocks))
return cpu_cache
device=device))
return kv_cache
def _swap(
self,
src: List[KVCache],
dst: List[KVCache],
src_to_dst: Dict[int, int],
) -> None:
from vllm._C import cache_ops
with torch.cuda.stream(self.cache_stream):
def swap_in(self, src_to_dst: Dict[int, int]) -> None:
for i in range(self.num_layers):
src_key_cache, src_value_cache = src[i]
dst_key_cache, dst_value_cache = dst[i]
# Copy the key blocks.
cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
# Copy the value blocks.
cache_ops.swap_blocks(src_value_cache, dst_value_cache,
self.attn_backend.swap_blocks(self.cpu_cache[i], self.gpu_cache[i],
src_to_dst)
event = self.events[i]
event.record(stream=self.cache_stream)
def swap_in(self, src_to_dst: Dict[int, int]) -> None:
self._swap(self.cpu_cache, self.gpu_cache, src_to_dst)
def swap_out(self, src_to_dst: Dict[int, int]) -> None:
self._swap(self.gpu_cache, self.cpu_cache, src_to_dst)
for i in range(self.num_layers):
self.attn_backend.swap_blocks(self.gpu_cache[i], self.cpu_cache[i],
src_to_dst)
def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
from vllm._C import cache_ops
key_caches = [key_cache for key_cache, _ in self.gpu_cache]
value_caches = [value_cache for _, value_cache in self.gpu_cache]
# NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU.
cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts)
@staticmethod
def get_cache_block_size(
......
import contextlib
import time
from typing import Dict, List, Optional, Tuple, Set, Union
from typing import Dict, List, Optional, Set, Tuple
import numpy as np
import torch
import torch.nn as nn
from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig,
SchedulerConfig)
from vllm.attention import AttentionMetadata, get_attn_backend
from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
SchedulerConfig, VisionLanguageConfig)
from vllm.logger import init_logger
from vllm.model_executor import get_model, InputMetadata, SamplingMetadata
from vllm.model_executor.parallel_utils import cupy_utils
from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.parallel_utils import custom_all_reduce, pynccl_utils
from vllm.model_executor.parallel_utils.communication_op import (
broadcast_tensor_dict)
from vllm.model_executor.parallel_utils.parallel_state import (
with_cupy_nccl_for_all_reduce)
from vllm.model_executor.parallel_utils import custom_all_reduce
with_pynccl_for_all_reduce)
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.utils import in_wsl
from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData,
SequenceGroupMetadata)
from vllm.utils import (CudaMemoryProfiler, async_tensor_h2d,
is_pin_memory_available, make_tensor_with_pad,
maybe_expand_dim)
logger = init_logger(__name__)
KVCache = Tuple[torch.Tensor, torch.Tensor]
_PAD_SLOT_ID = -1
LORA_WARMUP_RANK = 8
# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
_BATCH_SIZE_ALIGNMENT = 8
# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
_BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
]
class ModelRunner:
......@@ -44,6 +50,7 @@ class ModelRunner:
lora_config: Optional[LoRAConfig],
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
vision_language_config: Optional[VisionLanguageConfig] = None,
):
self.model_config = model_config
self.parallel_config = parallel_config
......@@ -76,27 +83,31 @@ class ModelRunner:
# The shape of the cached block table will be
# (max batch size to capture, max context len to capture / block size).
self.graph_block_tables = None # Set after initial profiling.
# cache in_wsl result
self.in_wsl = in_wsl()
self.pin_memory = is_pin_memory_available()
self.kv_cache_dtype = kv_cache_dtype
self.vision_language_config = vision_language_config
# Set enforce_eager to True for Neuron backend, to avoid capturing graph
if self.device_config.is_neuron:
self.model_config.enforce_eager = True
self.attn_backend = get_attn_backend(
self.model_config.dtype if model_config is not None else None)
def load_model(self) -> None:
self.model = get_model(self.model_config,
with CudaMemoryProfiler() as m:
self.model = get_model(
self.model_config,
self.device_config,
lora_config=self.lora_config,
vision_language_config=self.vision_language_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config)
vocab_size = self.model.config.vocab_size
self.model_memory_usage = m.consumed_memory
logger.info(f"Loading model weights took "
f"{self.model_memory_usage / float(2**30):.4f} GB")
if self.lora_config:
assert hasattr(
self.model, "supported_lora_modules"
) and self.model.supported_lora_modules, "Model does not support LoRA"
assert hasattr(self.model, "supported_lora_modules"
) and self.model.supported_lora_modules, (
"Model does not support LoRA")
assert hasattr(
self.model,
"embedding_modules"), "Model does not have embedding_modules"
......@@ -104,8 +115,7 @@ class ModelRunner:
), "Model does not have embedding_padding_modules"
self.lora_manager = LRUCacheWorkerLoRAManager(
self.scheduler_config.max_num_seqs,
self.scheduler_config.max_num_batched_tokens +
self.scheduler_config.max_paddings, vocab_size,
self.scheduler_config.max_num_batched_tokens, self.vocab_size,
self.lora_config, self.device, self.model.embedding_modules,
self.model.embedding_padding_modules)
self.model = self.lora_manager.create_lora_manager(self.model)
......@@ -113,20 +123,24 @@ class ModelRunner:
def set_block_size(self, block_size: int) -> None:
self.block_size = block_size
max_num_blocks = (self.max_context_len_to_capture + block_size -
1) // block_size
self.graph_block_tables = np.zeros(
(max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32)
(max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()),
dtype=np.int32)
def get_max_block_per_batch(self) -> int:
block_size = self.block_size
return (self.max_context_len_to_capture + block_size - 1) // block_size
def _prepare_prompt(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int],
List[int], List[int], Set[LoRARequest]]:
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
List[int], List[int], List[int], Set[LoRARequest],
torch.Tensor]:
assert len(seq_group_metadata_list) > 0
input_tokens: List[List[int]] = []
input_positions: List[List[int]] = []
slot_mapping: List[List[int]] = []
input_tokens: List[int] = []
input_positions: List[int] = []
slot_mapping: List[int] = []
lora_index_mapping: List[int] = []
lora_prompt_mapping: List[int] = []
lora_requests: Set[LoRARequest] = set()
......@@ -135,53 +149,82 @@ class ModelRunner:
context_lens: List[int] = []
subquery_lens: List[int] = []
prefix_block_tables: List[List[int]] = []
multi_modal_input_list: List[torch.Tensor] = []
for seq_group_metadata in seq_group_metadata_list:
assert seq_group_metadata.is_prompt
seq_ids = list(seq_group_metadata.seq_data.keys())
assert len(seq_ids) == 1
seq_id = seq_ids[0]
computed_block_nums = seq_group_metadata.computed_block_nums
if (self.scheduler_config is not None
and self.scheduler_config.chunked_prefill_enabled
and computed_block_nums is not None):
raise RuntimeError(
"chunked prefill cannot be used with prefix caching "
"now.")
token_chunk_size = seq_group_metadata.token_chunk_size
seq_data = seq_group_metadata.seq_data[seq_id]
prompt_tokens = seq_data.get_token_ids()
computed_len = seq_data.get_num_computed_tokens()
# We should use get_len here because in case of preemption
# it contains output tokens.
prefill_end = min(seq_data.get_len(),
computed_len + token_chunk_size)
# TODO(sang): Rename it after chunked prefill is introduced.
prompt_tokens = seq_data.get_token_ids()[computed_len:prefill_end]
prompt_len = len(prompt_tokens)
# Right now, the prefill_end is always same as the length of
# sequence. However, once chunked prefill is introduced, this
# assumption can be changed.
assert prefill_end == seq_data.get_len()
prompt_lens.append(prompt_len)
prefix_len = 0
prefix = seq_group_metadata.prefix
if prefix is not None and prefix.computed:
prefix_len = prefix.get_length()
prompt_tokens = prompt_tokens[prefix_len:]
prefix_block_tables.append(prefix.get_block_numbers())
# NOTE: This only works for oooooooxxx style attention.
if computed_block_nums is not None and len(
computed_block_nums) > 0 and self.sliding_window is None:
# Prefix is not supported with sliding_window
computed_len = len(computed_block_nums) * self.block_size
prompt_tokens = prompt_tokens[computed_len:]
prefix_block_tables.append(computed_block_nums)
else:
prefix_block_tables.append([])
# Right now, prefill start is always 0. However, this
# assumption can be changed once chunked prefill is introduced.
assert computed_len == 0
# actual prompt lens
context_lens.append(prefix_len)
subquery_lens.append(prompt_len - prefix_len)
context_lens.append(computed_len)
subquery_lens.append(prompt_len - computed_len)
input_tokens.append(prompt_tokens)
input_tokens.extend(prompt_tokens)
# NOTE(woosuk): Here we assume that the first token in the prompt
# is always the first token in the sequence.
input_positions.append(
list(range(prefix_len, prefix_len + len(prompt_tokens))))
input_positions.extend(list(range(computed_len, prefill_end)))
lora_id = seq_group_metadata.lora_int_id
if lora_id > 0:
lora_requests.add(seq_group_metadata.lora_request)
lora_index_mapping.append([lora_id] * (prompt_len - prefix_len))
lora_index_mapping += [lora_id] * (prompt_len - computed_len)
lora_prompt_mapping.extend(
[lora_id] *
(prompt_len - prefix_len
(prompt_len - computed_len
if seq_group_metadata.sampling_params.prompt_logprobs else 1))
if seq_group_metadata.multi_modal_data:
multi_modal_input_list.append(
seq_group_metadata.multi_modal_data.data)
if seq_group_metadata.block_tables is None:
# During memory profiling, the block tables are not initialized
# yet. In this case, we just use a dummy slot mapping.
slot_mapping.append([_PAD_SLOT_ID] * prompt_len)
slot_mapping.extend([_PAD_SLOT_ID] * prompt_len)
continue
# Compute the slot mapping.
slot_mapping.append([])
block_table = seq_group_metadata.block_tables[seq_id]
# Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
# where start_idx is max(0, prompt_len - sliding_window).
......@@ -190,86 +233,116 @@ class ModelRunner:
# mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
start_idx = 0
if self.sliding_window is not None:
assert prefix_len == 0, (
assert computed_len == 0, (
"Prefix caching is currently not supported with "
"sliding window attention")
start_idx = max(0, prompt_len - self.sliding_window)
for i in range(prefix_len, prompt_len):
for i in range(computed_len, prefill_end):
if i < start_idx:
slot_mapping[-1].append(_PAD_SLOT_ID)
slot_mapping.append(_PAD_SLOT_ID)
continue
block_number = block_table[i // self.block_size]
block_offset = i % self.block_size
slot = block_number * self.block_size + block_offset
slot_mapping[-1].append(slot)
slot_mapping.append(slot)
max_prompt_len = max(subquery_lens)
input_tokens = _make_tensor_with_pad(input_tokens,
max_prompt_len,
pad=0,
max_subquery_len = max(subquery_lens)
max_prompt_len = max(prompt_lens)
num_prompt_tokens = len(input_tokens)
assert max_subquery_len > 0
input_tokens = torch.tensor(input_tokens,
dtype=torch.long,
device=self.device)
input_positions = _make_tensor_with_pad(input_positions,
max_prompt_len,
pad=0,
input_positions = torch.tensor(input_positions,
dtype=torch.long,
device=self.device)
slot_mapping = _make_tensor_with_pad(slot_mapping,
max_prompt_len,
pad=_PAD_SLOT_ID,
slot_mapping = torch.tensor(slot_mapping,
dtype=torch.long,
device=self.device)
lora_index_mapping = [
_pad_to_max(mapping, max_prompt_len, pad=0)
for mapping in lora_index_mapping
]
lora_index_mapping = lora_index_mapping
context_lens_tensor = torch.tensor(context_lens,
dtype=torch.int,
device=self.device)
if multi_modal_input_list:
assert self.vision_language_config, (
"Multi-modal inputs are only supported by "
"vision language models.")
multi_modal_input = torch.cat(multi_modal_input_list,
dim=0).to(self.device)
else:
multi_modal_input = None
# Prepare prefix block tables
max_prompt_block_table_len = max(len(t) for t in prefix_block_tables)
block_tables = _make_tensor_with_pad(
block_tables = make_tensor_with_pad(
prefix_block_tables,
max_len=max_prompt_block_table_len,
pad=0,
dtype=torch.int,
device=self.device,
)
start_loc_tensor = torch.arange(0,
len(prompt_lens) * max_prompt_len,
max_prompt_len,
# Query length can be shorter than key (i.e., prompt) when prefill
# is chunked or prefix cached.
subquery_lens_tensor = torch.tensor(subquery_lens,
dtype=torch.long,
device=self.device)
subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1,
dtype=torch.int32,
device=self.device)
prompt_lens_tensor = torch.tensor(prompt_lens,
dtype=torch.long,
device=self.device)
seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1,
dtype=torch.int32,
device=self.device)
torch.cumsum(subquery_lens_tensor,
dim=0,
dtype=subquery_start_loc.dtype,
out=subquery_start_loc[1:])
input_metadata = InputMetadata(
torch.cumsum(prompt_lens_tensor,
dim=0,
dtype=seq_start_loc.dtype,
out=seq_start_loc[1:])
attn_metadata = self.attn_backend.make_metadata(
is_prompt=True,
slot_mapping=slot_mapping,
prompt_lens=prompt_lens_tensor,
max_seq_len=max_prompt_len,
start_loc=start_loc_tensor,
prompt_lens=prompt_lens,
prompt_lens_tensor=prompt_lens_tensor,
num_prompt_tokens=num_prompt_tokens,
num_generation_tokens=0,
max_subquery_len=max_subquery_len,
max_context_len=None,
max_prompt_len=max_prompt_len,
subquery_start_loc=subquery_start_loc,
seq_start_loc=seq_start_loc,
context_lens=context_lens_tensor,
block_tables=block_tables,
use_cuda_graph=False,
kv_cache_dtype=self.kv_cache_dtype,
)
return (input_tokens, input_positions, input_metadata, prompt_lens,
return (input_tokens, input_positions, attn_metadata, prompt_lens,
subquery_lens, lora_index_mapping, lora_prompt_mapping,
lora_requests)
lora_requests, multi_modal_input)
def _prepare_decode(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int],
Set[LoRARequest]]:
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
List[int], Set[LoRARequest]]:
assert len(seq_group_metadata_list) > 0
input_tokens: List[List[int]] = []
input_positions: List[List[int]] = []
slot_mapping: List[List[int]] = []
input_tokens: List[int] = []
input_positions: List[int] = []
slot_mapping: List[int] = []
context_lens: List[int] = []
block_tables: List[List[int]] = []
lora_index_mapping: List[int] = []
......@@ -278,6 +351,7 @@ class ModelRunner:
for seq_group_metadata in seq_group_metadata_list:
assert not seq_group_metadata.is_prompt
assert seq_group_metadata.token_chunk_size == 1
seq_ids = list(seq_group_metadata.seq_data.keys())
lora_id = seq_group_metadata.lora_int_id
......@@ -288,11 +362,11 @@ class ModelRunner:
for seq_id in seq_ids:
seq_data = seq_group_metadata.seq_data[seq_id]
generation_token = seq_data.get_last_token_id()
input_tokens.append([generation_token])
input_tokens.append(generation_token)
seq_len = seq_data.get_len()
position = seq_len - 1
input_positions.append([position])
input_positions.append(position)
context_len = seq_len if self.sliding_window is None else min(
seq_len, self.sliding_window)
......@@ -302,8 +376,8 @@ class ModelRunner:
block_number = block_table[position // self.block_size]
block_offset = position % self.block_size
slot = block_number * self.block_size + block_offset
slot_mapping.append([slot])
lora_index_mapping.append([lora_id])
slot_mapping.append(slot)
lora_index_mapping.append(lora_id)
lora_prompt_mapping.append(lora_id)
if self.sliding_window is not None:
......@@ -312,6 +386,9 @@ class ModelRunner:
block_table = block_table[-sliding_window_blocks:]
block_tables.append(block_table)
# vLLM uses cuda graph only for decoding requests.
# See `capture_model` API for more details.
# For decoding requests, batch_size == input_tokens.
batch_size = len(input_tokens)
max_context_len = max(context_lens)
use_captured_graph = (
......@@ -319,31 +396,24 @@ class ModelRunner:
and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
and max_context_len <= self.max_context_len_to_capture)
if use_captured_graph:
# Pad the input tokens, positions, and slot mapping to match the
# batch size of the captured graph.
graph_batch_size = _get_graph_batch_size(batch_size)
assert graph_batch_size >= batch_size
for _ in range(graph_batch_size - batch_size):
input_tokens.append([])
input_positions.append([])
slot_mapping.append([])
input_tokens.append(0)
input_positions.append(0)
slot_mapping.append(_PAD_SLOT_ID)
context_lens.append(1)
block_tables.append([])
lora_index_mapping.append(0)
batch_size = graph_batch_size
input_tokens = _make_tensor_with_pad(input_tokens,
max_len=1,
pad=0,
input_tokens = torch.tensor(input_tokens,
dtype=torch.long,
device=self.device)
input_positions = _make_tensor_with_pad(input_positions,
max_len=1,
pad=0,
input_positions = torch.tensor(input_positions,
dtype=torch.long,
device=self.device)
slot_mapping = _make_tensor_with_pad(slot_mapping,
max_len=1,
pad=_PAD_SLOT_ID,
slot_mapping = torch.tensor(slot_mapping,
dtype=torch.long,
device=self.device)
context_lens = torch.tensor(context_lens,
......@@ -351,6 +421,12 @@ class ModelRunner:
device=self.device)
if use_captured_graph:
# When using cuda-graph all these tensors should be
# padded.
assert context_lens.shape[0] == input_tokens.shape[0]
assert context_lens.shape[0] == input_positions.shape[0]
assert context_lens.shape[0] == slot_mapping.shape[0]
# The shape of graph_block_tables is
# [max batch size, max context len // block size].
input_block_tables = self.graph_block_tables[:batch_size]
......@@ -361,7 +437,7 @@ class ModelRunner:
else:
max_block_table_len = max(
len(block_table) for block_table in block_tables)
block_tables = _make_tensor_with_pad(
block_tables = make_tensor_with_pad(
block_tables,
max_len=max_block_table_len,
pad=0,
......@@ -369,23 +445,24 @@ class ModelRunner:
device=self.device,
)
lora_index_mapping = [
_pad_to_max(mapping, 1, pad=0) for mapping in lora_index_mapping
]
input_metadata = InputMetadata(
attn_metadata = self.attn_backend.make_metadata(
is_prompt=False,
slot_mapping=slot_mapping,
prompt_lens=None,
max_seq_len=None,
start_loc=None,
prompt_lens_tensor=None,
num_prompt_tokens=0,
num_generation_tokens=len(input_tokens),
max_subquery_len=None,
max_context_len=max_context_len,
max_prompt_len=None,
subquery_start_loc=None,
seq_start_loc=None,
context_lens=context_lens,
block_tables=block_tables,
use_cuda_graph=use_captured_graph,
kv_cache_dtype=self.kv_cache_dtype,
)
return (input_tokens, input_positions, input_metadata,
return (input_tokens, input_positions, attn_metadata,
lora_index_mapping, lora_prompt_mapping, lora_requests)
def _prepare_sample(
......@@ -400,9 +477,8 @@ class ModelRunner:
selected_token_start_idx = 0
categorized_sample_indices = {t: [] for t in SamplingType}
categorized_sample_indices_start_idx = 0
pin_memory = not self.in_wsl and not self.device_config.is_neuron
categorized_sampled_token_indices_start_idx = 0
max_subquery_len = max(subquery_lens) if subquery_lens else 1
for i, seq_group_metadata in enumerate(seq_group_metadata_list):
seq_ids = list(seq_group_metadata.seq_data.keys())
sampling_params = seq_group_metadata.sampling_params
......@@ -417,9 +493,12 @@ class ModelRunner:
categorized_sample_indices_start_idx += subquery_len - 1
categorized_sample_indices[
sampling_params.sampling_type].append(
categorized_sample_indices_start_idx)
sampling_params.sampling_type].append([
categorized_sample_indices_start_idx,
categorized_sampled_token_indices_start_idx
])
categorized_sample_indices_start_idx += 1
categorized_sampled_token_indices_start_idx += 1
if sampling_params.prompt_logprobs is not None:
selected_token_indices.extend(
......@@ -427,11 +506,11 @@ class ModelRunner:
selected_token_start_idx + subquery_len - 1))
selected_token_indices.append(selected_token_start_idx +
subquery_len - 1)
selected_token_start_idx += max_subquery_len
selected_token_start_idx += subquery_len
if sampling_params.seed is not None:
seq_group_metadata.state.generator = torch.Generator(
device="cuda").manual_seed(sampling_params.seed)
device=self.device).manual_seed(sampling_params.seed)
else:
num_seqs = len(seq_ids)
selected_token_indices.extend(
......@@ -441,22 +520,32 @@ class ModelRunner:
categorized_sample_indices[
sampling_params.sampling_type].extend(
range(categorized_sample_indices_start_idx,
categorized_sample_indices_start_idx + num_seqs))
zip(
range(
categorized_sample_indices_start_idx,
categorized_sample_indices_start_idx +
num_seqs),
range(
categorized_sampled_token_indices_start_idx,
categorized_sampled_token_indices_start_idx +
num_seqs)))
categorized_sample_indices_start_idx += num_seqs
categorized_sampled_token_indices_start_idx += num_seqs
if sampling_params.seed is not None:
generators.append(seq_group_metadata.state.generator)
selected_token_indices = _async_h2d(selected_token_indices,
selected_token_indices = async_tensor_h2d(selected_token_indices,
dtype=torch.long,
target_device=self.device,
pin_memory=pin_memory)
pin_memory=self.pin_memory)
categorized_sample_indices = {
t: _async_h2d(seq_ids,
t: maybe_expand_dim(
async_tensor_h2d(seq_ids,
dtype=torch.int,
target_device=self.device,
pin_memory=pin_memory)
pin_memory=self.pin_memory), 2, 2)
for t, seq_ids in categorized_sample_indices.items()
}
......@@ -477,33 +566,32 @@ class ModelRunner:
def prepare_input_tensors(
self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata,
Set[int], LoRAMapping]:
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
Set[int], LoRAMapping, torch.Tensor]:
if self.is_driver_worker:
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt = seq_group_metadata_list[0].is_prompt
# Prepare input tensors.
if is_prompt:
(input_tokens, input_positions, input_metadata, prompt_lens,
(input_tokens, input_positions, attn_metadata, prompt_lens,
subquery_lens, lora_index_mapping, lora_prompt_mapping,
lora_requests) = self._prepare_prompt(seq_group_metadata_list)
lora_requests, multi_modal_input
) = self._prepare_prompt(seq_group_metadata_list)
else:
(input_tokens, input_positions, input_metadata,
(input_tokens, input_positions, attn_metadata,
lora_index_mapping, lora_prompt_mapping,
lora_requests) = self._prepare_decode(seq_group_metadata_list)
prompt_lens = []
subquery_lens = None
multi_modal_input = None
sampling_metadata = self._prepare_sample(seq_group_metadata_list,
prompt_lens,
subquery_lens)
if self.lora_config:
flat_lora_index_mapping = [
item for sublist in lora_index_mapping for item in sublist
]
lora_mapping = LoRAMapping(
flat_lora_index_mapping,
lora_index_mapping,
lora_prompt_mapping,
)
else:
......@@ -513,82 +601,77 @@ class ModelRunner:
metadata_dict = {
"input_tokens": input_tokens,
"input_positions": input_positions,
"is_prompt": input_metadata.is_prompt,
"slot_mapping": input_metadata.slot_mapping,
"prompt_lens": input_metadata.prompt_lens,
"max_seq_len": input_metadata.max_seq_len,
"start_loc": input_metadata.start_loc,
"max_context_len": input_metadata.max_context_len,
"context_lens": input_metadata.context_lens,
"block_tables": input_metadata.block_tables,
"use_cuda_graph": input_metadata.use_cuda_graph,
"kv_cache_dtype": input_metadata.kv_cache_dtype,
"selected_token_indices":
sampling_metadata.selected_token_indices,
"lora_requests": lora_requests,
"lora_mapping": lora_mapping,
"multi_modal_input": multi_modal_input,
}
metadata_dict.update(attn_metadata.asdict_zerocopy())
broadcast_tensor_dict(metadata_dict, src=0)
else:
metadata_dict = broadcast_tensor_dict(src=0)
input_tokens = metadata_dict["input_tokens"]
input_positions = metadata_dict["input_positions"]
lora_mapping = metadata_dict["lora_mapping"]
lora_requests = metadata_dict["lora_requests"]
input_metadata = InputMetadata(
is_prompt=metadata_dict["is_prompt"],
slot_mapping=metadata_dict["slot_mapping"],
prompt_lens=metadata_dict["prompt_lens"],
max_seq_len=metadata_dict["max_seq_len"],
start_loc=metadata_dict["start_loc"],
max_context_len=metadata_dict["max_context_len"],
context_lens=metadata_dict["context_lens"],
block_tables=metadata_dict["block_tables"],
use_cuda_graph=metadata_dict["use_cuda_graph"],
kv_cache_dtype=metadata_dict["kv_cache_dtype"],
)
input_tokens = metadata_dict.pop("input_tokens")
input_positions = metadata_dict.pop("input_positions")
selected_token_indices = metadata_dict.pop(
"selected_token_indices")
lora_mapping = metadata_dict.pop("lora_mapping")
lora_requests = metadata_dict.pop("lora_requests")
multi_modal_input = metadata_dict.pop("multi_modal_input")
attn_metadata = self.attn_backend.make_metadata(**metadata_dict)
sampling_metadata = SamplingMetadata(
seq_groups=None,
seq_data=None,
prompt_lens=None,
selected_token_indices=metadata_dict["selected_token_indices"],
selected_token_indices=selected_token_indices,
categorized_sample_indices=None,
generators=None,
perform_sampling=False,
)
return (input_tokens, input_positions, input_metadata,
sampling_metadata, lora_requests, lora_mapping)
return (input_tokens, input_positions, attn_metadata,
sampling_metadata, lora_requests, lora_mapping,
multi_modal_input)
@torch.inference_mode()
def execute_model(
self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
kv_caches: List[torch.Tensor],
) -> Optional[SamplerOutput]:
(input_tokens, input_positions, input_metadata, sampling_metadata,
lora_requests,
lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list)
(input_tokens, input_positions, attn_metadata, sampling_metadata,
lora_requests, lora_mapping, multi_modal_input
) = self.prepare_input_tensors(seq_group_metadata_list)
if self.lora_config:
self.set_active_loras(lora_requests, lora_mapping)
# Execute the model.
if input_metadata.use_cuda_graph:
if attn_metadata.use_cuda_graph:
graph_batch_size = input_tokens.shape[0]
model_executable = self.graph_runners[graph_batch_size]
else:
model_executable = self.model
hidden_states = model_executable(
input_ids=input_tokens,
positions=input_positions,
kv_caches=kv_caches,
input_metadata=input_metadata,
)
execute_model_kwargs = {
"input_ids": input_tokens,
"positions": input_positions,
"kv_caches": kv_caches,
"attn_metadata": attn_metadata,
}
if self.vision_language_config:
execute_model_kwargs.update({"image_input": multi_modal_input})
hidden_states = model_executable(**execute_model_kwargs)
# Compute the logits.
logits = self.model.compute_logits(hidden_states, sampling_metadata)
# Only perform sampling in the driver worker.
if not sampling_metadata.perform_sampling:
return None
# Sample the next token.
output = self.model.sample(
hidden_states=hidden_states,
logits=logits,
sampling_metadata=sampling_metadata,
)
return output
......@@ -596,8 +679,7 @@ class ModelRunner:
@torch.inference_mode()
def profile_run(self) -> None:
# Enable top-k sampling to reflect the accurate memory usage.
vocab_size = self.model_config.get_vocab_size()
sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
max_num_seqs = self.scheduler_config.max_num_seqs
......@@ -626,10 +708,22 @@ class ModelRunner:
# Profile memory usage with max_num_sequences sequences and the total
# number of tokens equal to max_num_batched_tokens.
seqs: List[SequenceGroupMetadata] = []
# Additional GPU memory may be needed for vision encoding, which needs
# to be accounted for when calculating the GPU blocks for
# vLLM blocker manager.
# To exercise the worst scenario for GPU memory consumption,
# the number of seqs (batch_size) is chosen to maximize the number
# of images processed.
if self.vision_language_config:
max_num_seqs = min(
max_num_seqs,
int(max_num_batched_tokens /
self.vision_language_config.image_feature_size))
for group_id in range(max_num_seqs):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))
seq_data = SequenceData([0] * seq_len)
seq_data, fake_multi_modal_input = _prepare_fake_inputs(
seq_len, self.vision_language_config)
seq = SequenceGroupMetadata(
request_id=str(group_id),
is_prompt=True,
......@@ -638,12 +732,13 @@ class ModelRunner:
block_tables=None,
lora_request=dummy_lora_requests_per_seq[group_id]
if dummy_lora_requests_per_seq else None,
multi_modal_data=fake_multi_modal_input,
)
seqs.append(seq)
# Run the model with the dummy inputs.
num_layers = self.model_config.get_num_layers(self.parallel_config)
kv_caches = [(None, None)] * num_layers
kv_caches = [None] * num_layers
self.execute_model(seqs, kv_caches)
torch.cuda.synchronize()
return
......@@ -675,10 +770,22 @@ class ModelRunner:
return self.lora_manager.list_loras()
@torch.inference_mode()
def capture_model(self, kv_caches: List[KVCache]) -> None:
def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
"""Cuda graph capture a model.
Note that CUDA graph's performance gain is negligible if number
of batched tokens are larger than 200. And since CUDA graph
requires fixed sized tensors, supporting large/variable batch
size requires high GPU memory overhead. Thus, vLLM only captures
decoding requests. Mixed batch (chunked prefill + decoding) or
prefill requests are not captured.
Since it is used for decoding-only, it assumes there's only 1 token
per sequence in the batch.
"""
# NOTE(woosuk): This is a hack to ensure that the NCCL backend is never
# deleted before the CUDA graphs.
self.cupy_nccl_backend = cupy_utils.get_nccl_backend()
self.pynccl_backend = pynccl_utils.get_nccl_backend()
assert not self.model_config.enforce_eager
logger.info("Capturing the model for CUDA graphs. This may lead to "
......@@ -694,10 +801,9 @@ class ModelRunner:
# Prepare dummy inputs. These will be reused for all batch sizes.
max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda()
input_positions = torch.zeros(max_batch_size, 1,
dtype=torch.long).cuda()
slot_mapping = torch.empty(max_batch_size, 1, dtype=torch.long).cuda()
input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
slot_mapping = torch.empty(max_batch_size, dtype=torch.long).cuda()
slot_mapping.fill_(_PAD_SLOT_ID)
context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
block_tables = torch.from_numpy(self.graph_block_tables).cuda()
......@@ -709,23 +815,28 @@ class ModelRunner:
]
# NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce
# kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use
# either custom all-reduce kernel or CuPy NCCL. When not using CUDA
# kernel, pynccl, and PyTorch NCCL. When using CUDA graph, we use
# either custom all-reduce kernel or pynccl. When not using CUDA
# graph, we use either custom all-reduce kernel or PyTorch NCCL.
# We always prioritize using custom all-reduce kernel but fall back
# to PyTorch or CuPy NCCL if it is disabled or not supported.
# to PyTorch or pynccl if it is disabled or not supported.
with custom_all_reduce.capture():
# NOTE: Capturing the largest batch size first may help reduce the
# memory usage of CUDA graph.
for batch_size in reversed(batch_size_capture_list):
# Create dummy input_metadata.
input_metadata = InputMetadata(
# Create dummy attn_metadata.
attn_metadata = self.attn_backend.make_metadata(
is_prompt=False,
slot_mapping=slot_mapping[:batch_size],
prompt_lens=None,
max_seq_len=None,
start_loc=None,
prompt_lens_tensor=None,
num_prompt_tokens=0,
num_generation_tokens=batch_size,
max_subquery_len=None,
max_context_len=self.max_context_len_to_capture,
max_prompt_len=None,
subquery_start_loc=None,
seq_start_loc=None,
context_lens=context_lens[:batch_size],
block_tables=block_tables[:batch_size],
use_cuda_graph=True,
......@@ -744,7 +855,7 @@ class ModelRunner:
input_tokens[:batch_size],
input_positions[:batch_size],
kv_caches,
input_metadata,
attn_metadata,
memory_pool=self.graph_memory_pool,
)
self.graph_memory_pool = graph_runner.graph.pool()
......@@ -756,12 +867,18 @@ class ModelRunner:
logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.")
def __del__(self) -> None:
# Delete the CUDA graphs before deleting the CuPy NCCL communicator.
# Delete the CUDA graphs before deleting the pynccl communicator.
# NOTE(woosuk): This is necessary because otherwise deadlocks can
# happen.
# FIXME(woosuk): This is a bit hacky. Find a more robust solution.
# TODO(youkaichao): when we get enough user feedback that pynccl is
# more stable than cupy, we can remove this, e.g. in v0.4.1.
self.graph_runners.clear()
self.cupy_nccl_backend = None
self.pynccl_backend = None
@property
def vocab_size(self) -> int:
return self.model_config.get_vocab_size()
class CUDAGraphRunner:
......@@ -776,20 +893,22 @@ class CUDAGraphRunner:
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[KVCache],
input_metadata: InputMetadata,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
memory_pool,
**kwargs,
) -> None:
assert self.graph is None
# Run the model once without capturing the graph.
# This is to make sure that the captured graph does not include the
# kernel launches for initial benchmarking (e.g., Triton autotune).
with _maybe_cupy_nccl():
with _maybe_pynccl():
self.model(
input_ids,
positions,
kv_caches,
input_metadata,
attn_metadata,
**kwargs,
)
torch.cuda.synchronize()
......@@ -798,12 +917,13 @@ class CUDAGraphRunner:
# https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
self.graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(self.graph, pool=memory_pool): # noqa: SIM117
with _maybe_cupy_nccl():
with _maybe_pynccl():
hidden_states = self.model(
input_ids,
positions,
kv_caches,
input_metadata,
attn_metadata,
**kwargs,
)
torch.cuda.synchronize()
......@@ -812,9 +932,9 @@ class CUDAGraphRunner:
"input_ids": input_ids,
"positions": positions,
"kv_caches": kv_caches,
"slot_mapping": input_metadata.slot_mapping,
"context_lens": input_metadata.context_lens,
"block_tables": input_metadata.block_tables,
"slot_mapping": attn_metadata.slot_mapping,
"context_lens": attn_metadata.context_lens,
"block_tables": attn_metadata.block_tables,
}
self.output_buffers = {"hidden_states": hidden_states}
return
......@@ -823,8 +943,9 @@ class CUDAGraphRunner:
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
input_metadata: InputMetadata,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
**kwargs,
) -> torch.Tensor:
# KV caches are fixed tensors, so we don't need to copy them.
del kv_caches
......@@ -832,13 +953,12 @@ class CUDAGraphRunner:
# Copy the input tensors to the input buffers.
self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True)
self.input_buffers["positions"].copy_(positions, non_blocking=True)
self.input_buffers["slot_mapping"].copy_(input_metadata.slot_mapping,
self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping,
non_blocking=True)
self.input_buffers["context_lens"].copy_(input_metadata.context_lens,
self.input_buffers["context_lens"].copy_(attn_metadata.context_lens,
non_blocking=True)
self.input_buffers["block_tables"].copy_(input_metadata.block_tables,
self.input_buffers["block_tables"].copy_(attn_metadata.block_tables,
non_blocking=True)
# Run the graph.
self.graph.replay()
......@@ -850,44 +970,43 @@ class CUDAGraphRunner:
@contextlib.contextmanager
def _maybe_cupy_nccl():
if cupy_utils.is_initialized() and not custom_all_reduce.is_initialized():
with with_cupy_nccl_for_all_reduce():
def _maybe_pynccl():
if pynccl_utils.is_initialized(
) and not custom_all_reduce.is_initialized():
with with_pynccl_for_all_reduce():
yield
else:
yield
def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
assert len(x) <= max_len
return x + [pad] * (max_len - len(x))
def _make_tensor_with_pad(
x: List[List[int]],
max_len: int,
pad: int,
dtype: torch.dtype,
device: Optional[Union[str, torch.device]],
) -> torch.Tensor:
padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x]
return torch.tensor(padded_x, dtype=dtype, device=device)
def _get_graph_batch_size(batch_size: int) -> int:
"""Returns the padded batch size given actual batch size.
Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT,
2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT...
"""
if batch_size <= 2:
return batch_size
elif batch_size <= 4:
return 4
else:
return (batch_size + 7) // 8 * 8
def _async_h2d(
data: list,
dtype: torch.dtype,
target_device: Union[str, torch.device],
pin_memory: bool,
) -> torch.Tensor:
t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
return t.to(device=target_device, non_blocking=True)
return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) //
_BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
def _prepare_fake_inputs(
seq_len: int, vision_language_config: Optional[VisionLanguageConfig]):
"""Prepare fake inputs for profile run."""
if vision_language_config:
prompt_tokens = [
vision_language_config.image_token_id
] * vision_language_config.image_feature_size + [0] * (
seq_len - vision_language_config.image_feature_size)
fake_image_input = MultiModalData(
type=MultiModalData.Type.IMAGE,
data=torch.zeros(vision_language_config.image_input_shape,
dtype=torch.float16))
else:
prompt_tokens = [0] * seq_len
fake_image_input = None
return SequenceData(prompt_tokens), fake_image_input
from typing import Dict, List, Optional, Tuple
import torch
from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.neuron_model_loader import get_neuron_model
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
make_tensor_with_pad, maybe_expand_dim)
logger = init_logger(__name__)
class NeuronModelRunner:
def __init__(
self,
model_config: ModelConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
):
self.model_config = model_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
if model_config is not None and model_config.get_sliding_window():
logger.warning("Sliding window is not supported on Neuron. "
"The model will run without sliding window.")
self.device_config = (device_config
if device_config is not None else DeviceConfig())
self.device = self.device_config.device
self.model = None
self.pin_memory = is_pin_memory_available()
def load_model(self) -> None:
self.model = get_neuron_model(self.model_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config)
def _prepare_prompt(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int]]:
assert len(seq_group_metadata_list) > 0
input_tokens: List[List[int]] = []
input_positions: List[List[int]] = []
input_block_ids: List[int] = []
prompt_lens: List[int] = []
for seq_group_metadata in seq_group_metadata_list:
assert seq_group_metadata.is_prompt
seq_ids = list(seq_group_metadata.seq_data.keys())
assert len(seq_ids) == 1
seq_id = seq_ids[0]
seq_data = seq_group_metadata.seq_data[seq_id]
prompt_tokens = seq_data.get_token_ids()
prompt_len = len(prompt_tokens)
prompt_lens.append(prompt_len)
input_tokens.append(prompt_tokens)
input_positions.append(list(range(prompt_len)))
assert seq_group_metadata.block_tables is not None
block_table = seq_group_metadata.block_tables[seq_id]
assert len(block_table) == 1
input_block_ids.append(block_table[0])
max_prompt_len = max(prompt_lens)
assert max_prompt_len > 0
input_tokens = make_tensor_with_pad(input_tokens,
max_prompt_len,
pad=0,
dtype=torch.long,
device=self.device)
input_positions = make_tensor_with_pad(input_positions,
max_prompt_len,
pad=0,
dtype=torch.long,
device=self.device)
input_block_ids = torch.tensor(input_block_ids,
dtype=torch.long,
device=self.device)
return input_tokens, input_positions, input_block_ids, prompt_lens
def _prepare_decode(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
assert len(seq_group_metadata_list) > 0
input_tokens: List[List[int]] = []
input_positions: List[List[int]] = []
input_block_ids: List[int] = []
context_lens: List[int] = []
for seq_group_metadata in seq_group_metadata_list:
assert not seq_group_metadata.is_prompt
seq_ids = list(seq_group_metadata.seq_data.keys())
for seq_id in seq_ids:
seq_data = seq_group_metadata.seq_data[seq_id]
generation_token = seq_data.get_last_token_id()
input_tokens.append([generation_token])
seq_len = seq_data.get_len()
position = seq_len - 1
input_positions.append([position])
context_lens.append(seq_len)
assert seq_group_metadata.block_tables is not None
block_table = seq_group_metadata.block_tables[seq_id]
assert len(block_table) == 1
input_block_ids.append(block_table[0])
input_tokens = make_tensor_with_pad(input_tokens,
max_len=1,
pad=0,
dtype=torch.long,
device=self.device)
input_positions = make_tensor_with_pad(input_positions,
max_len=1,
pad=0,
dtype=torch.long,
device=self.device)
context_lens = torch.tensor(context_lens,
dtype=torch.int,
device=self.device)
input_block_ids = torch.tensor(input_block_ids,
dtype=torch.long,
device=self.device)
return input_tokens, input_positions, input_block_ids
def _prepare_sample(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
prompt_lens: List[int],
) -> SamplingMetadata:
seq_groups: List[Tuple[List[int], SamplingParams]] = []
selected_token_indices: List[int] = []
generators: List[torch.Generator] = []
selected_token_start_idx = 0
categorized_sample_indices = {t: [] for t in SamplingType}
categorized_sample_indices_start_idx = 0
categorized_sampled_token_indices_start_idx = 0
for i, seq_group_metadata in enumerate(seq_group_metadata_list):
seq_ids = list(seq_group_metadata.seq_data.keys())
sampling_params = seq_group_metadata.sampling_params
seq_groups.append((seq_ids, sampling_params))
if seq_group_metadata.is_prompt:
assert len(seq_ids) == 1
assert prompt_lens is not None
prompt_len = prompt_lens[i]
if sampling_params.prompt_logprobs is not None:
# NOTE: prompt token positions do not need sample, skip
categorized_sample_indices_start_idx += prompt_len - 1
categorized_sample_indices[
sampling_params.sampling_type].append([
categorized_sample_indices_start_idx,
categorized_sampled_token_indices_start_idx
])
categorized_sample_indices_start_idx += 1
categorized_sampled_token_indices_start_idx += 1
if sampling_params.prompt_logprobs is not None:
selected_token_indices.extend(
range(selected_token_start_idx,
selected_token_start_idx + prompt_len - 1))
selected_token_indices.append(selected_token_start_idx +
prompt_len - 1)
selected_token_start_idx += prompt_len
if sampling_params.seed is not None:
seq_group_metadata.state.generator = torch.Generator(
device=self.device).manual_seed(sampling_params.seed)
else:
num_seqs = len(seq_ids)
selected_token_indices.extend(
range(selected_token_start_idx,
selected_token_start_idx + num_seqs))
selected_token_start_idx += num_seqs
categorized_sample_indices[
sampling_params.sampling_type].extend(
zip(
range(
categorized_sample_indices_start_idx,
categorized_sample_indices_start_idx +
num_seqs),
range(
categorized_sampled_token_indices_start_idx,
categorized_sampled_token_indices_start_idx +
num_seqs)))
categorized_sample_indices_start_idx += num_seqs
categorized_sampled_token_indices_start_idx += num_seqs
if sampling_params.seed is not None:
generators.append(seq_group_metadata.state.generator)
selected_token_indices = async_tensor_h2d(selected_token_indices,
dtype=torch.long,
target_device=self.device,
pin_memory=self.pin_memory)
categorized_sample_indices = {
t: maybe_expand_dim(
async_tensor_h2d(seq_ids,
dtype=torch.int,
target_device=self.device,
pin_memory=self.pin_memory), 2, 2)
for t, seq_ids in categorized_sample_indices.items()
}
seq_data: Dict[int, SequenceData] = {}
for seq_group_metadata in seq_group_metadata_list:
seq_data.update(seq_group_metadata.seq_data)
sampling_metadata = SamplingMetadata(
seq_groups=seq_groups,
seq_data=seq_data,
prompt_lens=prompt_lens,
selected_token_indices=selected_token_indices,
categorized_sample_indices=categorized_sample_indices,
generators=generators,
)
return sampling_metadata
def prepare_input_tensors(
self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]:
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt = seq_group_metadata_list[0].is_prompt
# Prepare input tensors.
if is_prompt:
(input_tokens, input_positions, input_block_ids,
prompt_lens) = self._prepare_prompt(seq_group_metadata_list)
else:
(input_tokens, input_positions,
input_block_ids) = self._prepare_decode(seq_group_metadata_list)
prompt_lens = []
sampling_metadata = self._prepare_sample(seq_group_metadata_list,
prompt_lens)
return (input_tokens, input_positions, input_block_ids,
sampling_metadata)
@torch.inference_mode()
def execute_model(
self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
) -> Optional[SamplerOutput]:
(input_tokens, input_positions, input_block_ids, sampling_metadata
) = self.prepare_input_tensors(seq_group_metadata_list)
hidden_states = self.model(
input_ids=input_tokens,
positions=input_positions,
input_block_ids=input_block_ids,
)
# Compute the logits.
logits = self.model.compute_logits(hidden_states, sampling_metadata)
# Sample the next token.
output = self.model.sample(
logits=logits,
sampling_metadata=sampling_metadata,
)
return output
@property
def vocab_size(self) -> int:
return self.model_config.get_vocab_size()
"""A Neuron worker class."""
from typing import Dict, List, Optional, Tuple
from typing import List, Optional
import torch
import torch.distributed
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.model_executor import set_random_seed
from vllm.model_executor.parallel_utils.communication_op import (
broadcast_tensor_dict)
from vllm.model_executor.parallel_utils.parallel_state import (
ensure_model_parallel_initialized)
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.worker.cache_engine import CacheEngine
from vllm.worker.model_runner import ModelRunner
from vllm.worker.neuron_model_runner import NeuronModelRunner
class Worker:
class NeuronWorker:
"""A worker class that executes the model on a group of neuron cores.
"""
......@@ -26,166 +21,32 @@ class Worker:
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
local_rank: int,
rank: int,
distributed_init_method: str,
lora_config: Optional[LoRAConfig] = None,
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
) -> None:
self.model_config = model_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.local_rank = local_rank
self.rank = rank
self.distributed_init_method = distributed_init_method
self.lora_config = lora_config
self.is_driver_worker = is_driver_worker
if self.is_driver_worker:
assert self.rank == 0, "The driver worker must have rank 0."
self.model_runner = ModelRunner(model_config,
parallel_config,
scheduler_config,
device_config,
lora_config=self.lora_config,
is_driver_worker=is_driver_worker)
# Uninitialized cache engine. Will be initialized by
# self.init_cache_engine().
self.cache_config = None
self.cache_engine = None
self.cache_events = None
self.gpu_cache = None
self.model_runner = NeuronModelRunner(model_config, parallel_config,
scheduler_config, device_config)
def init_model(self) -> None:
# Initialize the distributed environment.
_init_distributed_environment(self.parallel_config,
self.rank,
self.distributed_init_method,
distributed_backend="gloo")
# Initialize the model.
def init_device(self) -> None:
# Set random seed.
set_random_seed(self.model_config.seed)
def load_model(self):
self.model_runner.load_model()
@torch.inference_mode()
def profile_num_available_blocks(
self,
block_size: int = 128,
gpu_memory_utilization: float = 0.9,
cpu_swap_space: int = 0,
cache_dtype: str = "float16",
) -> Tuple[int, int]:
"""Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks."""
num_gpu_blocks = self.scheduler_config.max_num_seqs
num_cpu_blocks = 0
return num_gpu_blocks, num_cpu_blocks
def init_cache_engine(self, cache_config: CacheConfig) -> None:
self.cache_config = cache_config
self.cache_engine = CacheEngine(self.cache_config, self.model_config,
self.parallel_config)
self.model_runner.set_block_size(self.cache_engine.block_size)
def warm_up_model(self) -> None:
# Warm up is maintained in transformers-neuronx
pass
def cache_swap(
self,
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
blocks_to_copy: Dict[int, List[int]],
) -> None:
# Issue cache operations.
issued_cache_op = False
if blocks_to_swap_in:
self.cache_engine.swap_in(blocks_to_swap_in)
issued_cache_op = True
if blocks_to_swap_out:
self.cache_engine.swap_out(blocks_to_swap_out)
issued_cache_op = True
if blocks_to_copy:
self.cache_engine.copy(blocks_to_copy)
issued_cache_op = True
cache_events = self.cache_events if issued_cache_op else None
# Wait for cache operations to finish.
if cache_events is not None:
raise NotImplementedError(
"cache operations are not implemented for neuron backend.")
@torch.inference_mode()
def execute_model(
self,
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
blocks_to_swap_in: Optional[Dict[int, int]] = None,
blocks_to_swap_out: Optional[Dict[int, int]] = None,
blocks_to_copy: Optional[Dict[int, List[int]]] = None,
seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Optional[SamplerOutput]:
if self.is_driver_worker:
assert seq_group_metadata_list is not None
num_seq_groups = len(seq_group_metadata_list)
assert blocks_to_swap_in is not None
assert blocks_to_swap_out is not None
assert blocks_to_copy is not None
data = {
"num_seq_groups": num_seq_groups,
"blocks_to_swap_in": blocks_to_swap_in,
"blocks_to_swap_out": blocks_to_swap_out,
"blocks_to_copy": blocks_to_copy,
}
broadcast_tensor_dict(data, src=0)
else:
data = broadcast_tensor_dict(src=0)
num_seq_groups = data["num_seq_groups"]
blocks_to_swap_in = data["blocks_to_swap_in"]
blocks_to_swap_out = data["blocks_to_swap_out"]
blocks_to_copy = data["blocks_to_copy"]
self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
# If there is no input, we don't need to execute the model.
if num_seq_groups == 0:
return {}
output = self.model_runner.execute_model(seq_group_metadata_list,
self.gpu_cache)
output = self.model_runner.execute_model(seq_group_metadata_list)
return output
def _init_distributed_environment(
parallel_config: ParallelConfig,
rank: int,
distributed_init_method: Optional[str] = None,
distributed_backend: Optional[str] = None,
) -> None:
"""Initialize the distributed environment."""
if torch.distributed.is_initialized():
torch_world_size = torch.distributed.get_world_size()
if torch_world_size != parallel_config.world_size:
raise RuntimeError(
"torch.distributed is already initialized but the torch world "
"size does not match parallel_config.world_size "
f"({torch_world_size} vs. {parallel_config.world_size}).")
elif not distributed_init_method:
raise ValueError(
"distributed_init_method must be set if torch.distributed "
"is not already initialized")
else:
distributed_backend = distributed_backend if distributed_backend else "nccl"
torch.distributed.init_process_group(
backend=distributed_backend,
world_size=parallel_config.world_size,
rank=rank,
init_method=distributed_init_method,
)
# A small all_reduce for warmup.
torch.distributed.all_reduce(torch.zeros(1))
ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
"""A GPU worker class."""
import gc
import os
from typing import Dict, List, Tuple, Set, Optional
from typing import Dict, List, Optional, Set, Tuple
import torch
import torch.distributed
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VisionLanguageConfig)
from vllm.lora.request import LoRARequest
from vllm.model_executor import set_random_seed
from vllm.model_executor.parallel_utils import cupy_utils
from vllm.model_executor.parallel_utils import pynccl_utils
from vllm.model_executor.parallel_utils.communication_op import (
broadcast_tensor_dict)
from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar
......@@ -18,8 +19,6 @@ from vllm.model_executor.parallel_utils.parallel_state import (
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.worker.cache_engine import CacheEngine
from vllm.worker.model_runner import ModelRunner
from vllm.lora.request import LoRARequest
from vllm.utils import is_hip
class Worker:
......@@ -40,6 +39,7 @@ class Worker:
rank: int,
distributed_init_method: str,
lora_config: Optional[LoRAConfig] = None,
vision_language_config: Optional[VisionLanguageConfig] = None,
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
) -> None:
......@@ -55,21 +55,27 @@ class Worker:
if self.is_driver_worker:
assert self.rank == 0, "The driver worker must have rank 0."
self.model_runner = ModelRunner(model_config,
self.vision_language_config = vision_language_config
if self.vision_language_config:
assert not self.lora_config, (
"To be tested: vision language model with LoRA settings.")
self.model_runner = ModelRunner(
model_config,
parallel_config,
scheduler_config,
device_config,
lora_config=self.lora_config,
kv_cache_dtype=kv_cache_dtype,
is_driver_worker=is_driver_worker)
is_driver_worker=is_driver_worker,
vision_language_config=vision_language_config)
# Uninitialized cache engine. Will be initialized by
# self.init_cache_engine().
self.cache_config = None
self.cache_engine = None
self.cache_events = None
self.gpu_cache = None
def init_model(self, cupy_port: Optional[int] = None) -> None:
def init_device(self) -> None:
if self.device_config.device.type == "cuda":
# torch.distributed.all_reduce does not free the input tensor until
# the synchronization point. This causes the memory usage to grow
......@@ -92,8 +98,9 @@ class Worker:
f"Not support device type: {self.device_config.device}")
# Initialize the distributed environment.
init_distributed_environment(self.parallel_config, self.rank,
cupy_port, self.distributed_init_method)
# Initialize the model.
self.distributed_init_method,
self.local_rank)
# Set random seed.
set_random_seed(self.model_config.seed)
def load_model(self):
......@@ -130,9 +137,12 @@ class Worker:
# NOTE(woosuk): Here we assume that the other processes using the same
# GPU did not change their memory usage during the profiling.
peak_memory = self.init_gpu_memory - free_gpu_memory
assert peak_memory > 0, (
"Error in memory profiling. This happens when the GPU memory was "
"not properly cleaned up before initializing the vLLM instance.")
cache_block_size = CacheEngine.get_cache_block_size(
block_size, cache_dtype, self.model_config, self.parallel_config)
cache_block_size = self.get_cache_block_size_bytes(
block_size, cache_dtype)
num_gpu_blocks = int(
(total_gpu_memory * gpu_memory_utilization - peak_memory) //
cache_block_size)
......@@ -149,7 +159,6 @@ class Worker:
self.cache_config = cache_config
self.cache_engine = CacheEngine(self.cache_config, self.model_config,
self.parallel_config)
self.cache_events = self.cache_engine.events
self.gpu_cache = self.cache_engine.gpu_cache
self.model_runner.set_block_size(self.cache_engine.block_size)
......@@ -167,24 +176,13 @@ class Worker:
blocks_to_copy: Dict[int, List[int]],
) -> None:
# Issue cache operations.
issued_cache_op = False
# TODO(woosuk): Profile swapping overhead and optimize if needed.
if blocks_to_swap_in:
self.cache_engine.swap_in(blocks_to_swap_in)
issued_cache_op = True
if blocks_to_swap_out:
self.cache_engine.swap_out(blocks_to_swap_out)
issued_cache_op = True
if blocks_to_copy:
self.cache_engine.copy(blocks_to_copy)
issued_cache_op = True
cache_events = self.cache_events if issued_cache_op else None
# Wait for cache operations to finish.
# TODO(woosuk): Profile swapping overhead and optimize if needed.
if cache_events is not None:
for event in cache_events:
event.wait()
@torch.inference_mode()
def execute_model(
......@@ -233,12 +231,28 @@ class Worker:
def list_loras(self) -> Set[int]:
return self.model_runner.list_loras()
@property
def max_model_len(self) -> int:
return self.model_config.max_model_len
@property
def vocab_size(self) -> int:
return self.model_runner.vocab_size
def get_cache_block_size_bytes(self, block_size: int,
cache_dtype: str) -> int:
"""Get the size of the KV cache block size in bytes.
"""
return CacheEngine.get_cache_block_size(block_size, cache_dtype,
self.model_config,
self.parallel_config)
def init_distributed_environment(
parallel_config: ParallelConfig,
rank: int,
cupy_port: Optional[int],
distributed_init_method: Optional[str] = None,
local_rank: int = -1,
) -> None:
"""Initialize the distributed environment."""
if torch.distributed.is_initialized():
......@@ -260,29 +274,27 @@ def init_distributed_environment(
init_method=distributed_init_method,
)
if cupy_utils.is_initialized():
cupy_world_size = cupy_utils.get_world_size()
if cupy_world_size != parallel_config.world_size:
if pynccl_utils.is_initialized():
pynccl_world_size = pynccl_utils.get_world_size()
if pynccl_world_size != parallel_config.world_size:
raise RuntimeError(
"cupy.distributed is already initialized but the cupy world "
"pynccl is already initialized but the pynccl world "
"size does not match parallel_config.world_size "
f"({cupy_world_size} vs. {parallel_config.world_size}).")
elif (parallel_config.world_size > 1 and cupy_port is not None
and not is_hip()):
# NOTE(woosuk): We don't initialize CuPy process group when world size
f"({pynccl_world_size} vs. {parallel_config.world_size}).")
elif parallel_config.world_size > 1:
# NOTE(woosuk): We don't initialize pynccl process group when world size
# is 1.
# TODO(woosuk): Support multi-node connection.
cupy_utils.init_process_group(
pynccl_utils.init_process_group(
world_size=parallel_config.world_size,
local_rank=local_rank,
rank=rank,
host="localhost",
port=cupy_port,
init_method=distributed_init_method,
)
# A small all_reduce for warmup.
torch.distributed.all_reduce(torch.zeros(1).cuda())
if cupy_utils.is_initialized():
cupy_utils.all_reduce(torch.zeros(1).cuda())
if pynccl_utils.is_initialized():
pynccl_utils.all_reduce(torch.zeros(1).cuda())
ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment