OpenDAS / text-generation-inference · Commits

Commit e019635f, authored Nov 01, 2024 by xuxzh1
Commit message: update
Parent: 64def8e2
Changes: 171
Showing 20 changed files with 0 additions and 3904 deletions (+0, -3904)
  server/vllm/vllm/__init__.py                        +0  -23
  server/vllm/vllm/block.py                           +0  -68
  server/vllm/vllm/config.py                          +0  -404
  server/vllm/vllm/core/__init__.py                   +0  -0
  server/vllm/vllm/core/block_manager.py              +0  -273
  server/vllm/vllm/core/policy.py                     +0  -45
  server/vllm/vllm/core/scheduler.py                  +0  -400
  server/vllm/vllm/engine/__init__.py                 +0  -0
  server/vllm/vllm/engine/arg_utils.py                +0  -230
  server/vllm/vllm/engine/async_llm_engine.py         +0  -496
  server/vllm/vllm/engine/llm_engine.py               +0  -713
  server/vllm/vllm/engine/ray_utils.py                +0  -119
  server/vllm/vllm/entrypoints/__init__.py            +0  -0
  server/vllm/vllm/entrypoints/api_server.py          +0  -80
  server/vllm/vllm/entrypoints/llm.py                 +0  -189
  server/vllm/vllm/entrypoints/openai/__init__.py     +0  -0
  server/vllm/vllm/entrypoints/openai/api_server.py   +0  -626
  server/vllm/vllm/entrypoints/openai/protocol.py     +0  -178
  server/vllm/vllm/logger.py                          +0  -51
  server/vllm/vllm/model_executor/__init__.py         +0  -9
server/vllm/vllm/__init__.py  deleted (file mode 100644 → 0)

"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.ray_utils import initialize_cluster
from vllm.entrypoints.llm import LLM
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams

__version__ = "0.2.1"

__all__ = [
    "LLM",
    "SamplingParams",
    "RequestOutput",
    "CompletionOutput",
    "LLMEngine",
    "EngineArgs",
    "AsyncLLMEngine",
    "AsyncEngineArgs",
    "initialize_cluster",
]
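For reference, a minimal offline-generation sketch of the public API exported above, following the standard vLLM 0.2.x interface (the model name is only a placeholder, not something this commit specifies):

from vllm import LLM, SamplingParams

# Placeholder model; any HF causal LM supported by vLLM 0.2.x works here.
llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)

outputs = llm.generate(["Hello, my name is"], params)
for out in outputs:
    # Each RequestOutput carries one CompletionOutput per returned sequence.
    print(out.outputs[0].text)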
server/vllm/vllm/block.py  deleted (file mode 100644 → 0)

"""Token blocks."""
from typing import List

from vllm.utils import Device

_BLANK_TOKEN_ID = -1


class LogicalTokenBlock:
    """A block that stores a contiguous chunk of tokens from left to right.

    Logical blocks are used to represent the states of the corresponding
    physical blocks in the KV cache.
    """

    def __init__(
        self,
        block_number: int,
        block_size: int,
    ) -> None:
        self.block_number = block_number
        self.block_size = block_size

        self.token_ids = [_BLANK_TOKEN_ID] * block_size
        self.num_tokens = 0

    def is_empty(self) -> bool:
        return self.num_tokens == 0

    def get_num_empty_slots(self) -> int:
        return self.block_size - self.num_tokens

    def is_full(self) -> bool:
        return self.num_tokens == self.block_size

    def append_tokens(self, token_ids: List[int]) -> None:
        assert len(token_ids) <= self.get_num_empty_slots()
        curr_idx = self.num_tokens
        self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids
        self.num_tokens += len(token_ids)

    def get_token_ids(self) -> List[int]:
        return self.token_ids[:self.num_tokens]

    def get_last_token_id(self) -> int:
        assert self.num_tokens > 0
        return self.token_ids[self.num_tokens - 1]


class PhysicalTokenBlock:
    """Represents the state of a block in the KV cache."""

    def __init__(
        self,
        device: Device,
        block_number: int,
        block_size: int,
    ) -> None:
        self.device = device
        self.block_number = block_number
        self.block_size = block_size

        self.ref_count = 0

    def __repr__(self) -> str:
        return (f'PhysicalTokenBlock(device={self.device}, '
                f'block_number={self.block_number}, '
                f'ref_count={self.ref_count})')
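To make the bookkeeping above concrete, a small illustrative sketch of LogicalTokenBlock (the block size is chosen arbitrarily for the example):

block = LogicalTokenBlock(block_number=0, block_size=4)
block.append_tokens([11, 12, 13])
assert block.get_num_empty_slots() == 1 and not block.is_full()
block.append_tokens([14])
assert block.is_full() and block.get_last_token_id() == 14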
server/vllm/vllm/config.py  deleted (file mode 100644 → 0)

from typing import Optional

import torch
from transformers import PretrainedConfig

from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
from vllm.utils import get_cpu_memory

logger = init_logger(__name__)

_GB = 1 << 30


class ModelConfig:
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        download_dir: Directory to download and load the weights, default to the
            default cache directory of huggingface.
        load_format: The format of the model weights to load:
            "auto" will try to load the weights in the safetensors format and
                fall back to the pytorch bin format if safetensors format is
                not available.
            "pt" will load the weights in the pytorch bin format.
            "safetensors" will load the weights in the safetensors format.
            "npcache" will load the weights in pytorch format and store
                a numpy cache to speed up the loading.
            "dummy" will initialize the weights with random values, which is
                mainly for profiling.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, will use
            the default version.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
    """

    def __init__(
        self,
        model: str,
        tokenizer: str,
        tokenizer_mode: str,
        trust_remote_code: bool,
        download_dir: Optional[str],
        load_format: str,
        dtype: str,
        seed: int,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
    ) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.tokenizer_mode = tokenizer_mode
        self.trust_remote_code = trust_remote_code
        self.download_dir = download_dir
        self.load_format = load_format
        self.seed = seed
        self.revision = revision
        self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization

        self.hf_config = get_config(model, trust_remote_code, revision)
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.max_model_len = _get_and_verify_max_len(self.hf_config,
                                                     max_model_len)
        self._verify_load_format()
        self._verify_tokenizer_mode()
        self._verify_quantization()

    def _verify_load_format(self) -> None:
        load_format = self.load_format.lower()
        if load_format not in [
                "auto", "pt", "safetensors", "npcache", "dummy"
        ]:
            raise ValueError(
                f"Unknown load format: {self.load_format}. Must be one of "
                "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.")
        self.load_format = load_format

    def _verify_tokenizer_mode(self) -> None:
        tokenizer_mode = self.tokenizer_mode.lower()
        if tokenizer_mode not in ["auto", "slow"]:
            raise ValueError(
                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
                "either 'auto' or 'slow'.")
        self.tokenizer_mode = tokenizer_mode

    def _verify_quantization(self) -> None:
        supported_quantization = ["awq"]
        if self.quantization is None:
            return
        quantization = self.quantization.lower()
        if quantization not in supported_quantization:
            raise ValueError(
                f"Unknown quantization: {self.quantization}. Must be one of "
                f"{supported_quantization}.")
        self.quantization = quantization

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        total_num_attention_heads = self.hf_config.num_attention_heads
        tensor_parallel_size = parallel_config.tensor_parallel_size
        if total_num_attention_heads % tensor_parallel_size != 0:
            raise ValueError(
                f"Total number of attention heads ({total_num_attention_heads})"
                " must be divisible by tensor parallel size "
                f"({tensor_parallel_size}).")

        total_num_hidden_layers = self.hf_config.num_hidden_layers
        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if total_num_hidden_layers % pipeline_parallel_size != 0:
            raise ValueError(
                f"Total number of hidden layers ({total_num_hidden_layers}) "
                "must be divisible by pipeline parallel size "
                f"({pipeline_parallel_size}).")

    def get_hidden_size(self) -> int:
        return self.hf_config.hidden_size

    def get_head_size(self) -> int:
        # FIXME(woosuk): This may not be true for all models.
        return self.hf_config.hidden_size // self.hf_config.num_attention_heads

    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
        """Returns the number of KV heads per GPU worker."""
        # For GPTBigCode & Falcon:
        # NOTE: for falcon, when new_decoder_architecture is True, the
        # multi_query flag is ignored and we use n_head_kv for the number of
        # KV heads.
        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
        new_decoder_arch_falcon = (
            self.hf_config.model_type in falcon_model_types
            and getattr(self.hf_config, "new_decoder_architecture", False))
        if not new_decoder_arch_falcon and getattr(self.hf_config,
                                                   "multi_query", False):
            # Multi-query attention, only one KV head.
            # Currently, tensor parallelism is not supported in this case.
            return 1
        # For Falcon:
        if getattr(self.hf_config, "n_head_kv", None) is not None:
            return (self.hf_config.n_head_kv //
                    parallel_config.tensor_parallel_size)
        if getattr(self.hf_config, "num_kv_heads", None) is not None:
            return (self.hf_config.num_kv_heads //
                    parallel_config.tensor_parallel_size)
        # For LLaMA-2:
        if getattr(self.hf_config, "num_key_value_heads", None) is not None:
            return (self.hf_config.num_key_value_heads //
                    parallel_config.tensor_parallel_size)
        total_num_attention_heads = self.hf_config.num_attention_heads
        return total_num_attention_heads // parallel_config.tensor_parallel_size

    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
        total_num_hidden_layers = self.hf_config.num_hidden_layers
        return total_num_hidden_layers // parallel_config.pipeline_parallel_size


class CacheConfig:
    """Configuration for the KV cache.

    Args:
        block_size: Size of a cache block in number of tokens.
        gpu_memory_utilization: Fraction of GPU memory to use for the
            vLLM execution.
        swap_space: Size of the CPU swap space per GPU (in GiB).
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        swap_space: int,
        sliding_window: Optional[int] = None,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.swap_space_bytes = swap_space * _GB
        self.sliding_window = sliding_window
        self._verify_args()

        # Will be set after profiling.
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def _verify_args(self) -> None:
        if self.gpu_memory_utilization > 1.0:
            raise ValueError(
                "GPU memory utilization must be less than 1.0. Got "
                f"{self.gpu_memory_utilization}.")

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        total_cpu_memory = get_cpu_memory()
        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
               "allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warning("Possibly too large swap space. " + msg)


class ParallelConfig:
    """Configuration for the distributed execution.

    Args:
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        worker_use_ray: Whether to use Ray for model workers. Will be set to
            True if either pipeline_parallel_size or tensor_parallel_size is
            greater than 1.
    """

    def __init__(
        self,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: bool,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.worker_use_ray = worker_use_ray

        self.world_size = pipeline_parallel_size * tensor_parallel_size
        if self.world_size > 1:
            self.worker_use_ray = True
        self._verify_args()

    def _verify_args(self) -> None:
        if self.pipeline_parallel_size > 1:
            raise NotImplementedError(
                "Pipeline parallelism is not supported yet.")


class SchedulerConfig:
    """Scheduler configuration.

    Args:
        max_num_batched_tokens: Maximum number of tokens to be processed in
            a single iteration.
        max_num_seqs: Maximum number of sequences to be processed in a single
            iteration.
        max_model_len: Maximum length of a sequence (including prompt
            and generated text).
        max_paddings: Maximum number of paddings to be added to a batch.
    """

    def __init__(
        self,
        max_num_batched_tokens: Optional[int],
        max_num_seqs: int,
        max_model_len: int,
        max_paddings: int,
    ) -> None:
        if max_num_batched_tokens is not None:
            self.max_num_batched_tokens = max_num_batched_tokens
        else:
            # If max_model_len is too short, use 2048 as the default value for
            # higher throughput.
            self.max_num_batched_tokens = max(max_model_len, 2048)
        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len
        self.max_paddings = max_paddings
        self._verify_args()

    def _verify_args(self) -> None:
        if self.max_num_batched_tokens < self.max_model_len:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                f"smaller than max_model_len ({self.max_model_len}). "
                "This effectively limits the maximum sequence length to "
                "max_num_batched_tokens and makes vLLM reject longer "
                "sequences. Please increase max_num_batched_tokens or "
                "decrease max_model_len.")
        if self.max_num_batched_tokens < self.max_num_seqs:
            raise ValueError(
                f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                "be greater than or equal to max_num_seqs "
                f"({self.max_num_seqs}).")


_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "float": torch.float32,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}


def _get_and_verify_dtype(
    config: PretrainedConfig,
    dtype: str,
) -> torch.dtype:
    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
    # because config.torch_dtype can be None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32

    dtype = dtype.lower()
    if dtype == "auto":
        if config_dtype == torch.float32:
            # Following the common practice, we use float16 for float32 models.
            torch_dtype = torch.float16
        else:
            torch_dtype = config_dtype
    else:
        if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
            raise ValueError(f"Unknown dtype: {dtype}")
        torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]

    # Verify the dtype.
    if torch_dtype != config_dtype:
        if torch_dtype == torch.float32:
            # Upcasting to float32 is allowed.
            pass
        elif config_dtype == torch.float32:
            # Downcasting from float32 to float16 or bfloat16 is allowed.
            pass
        else:
            # Casting between float16 and bfloat16 is allowed with a warning.
            logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

    return torch_dtype


def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
    max_model_len: Optional[int],
) -> int:
    """Get and verify the model's maximum length."""
    derived_max_model_len = float("inf")
    possible_keys = [
        # OPT
        "max_position_embeddings",
        # GPT-2
        "n_positions",
        # MPT
        "max_seq_len",
        # Others
        "max_sequence_length",
        "max_seq_length",
        "seq_len",
    ]
    for key in possible_keys:
        max_len_key = getattr(hf_config, key, None)
        if max_len_key is not None:
            derived_max_model_len = min(derived_max_model_len, max_len_key)
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        default_max_len = 2048
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
            f"{possible_keys}. Assuming the model's maximum length is "
            f"{default_max_len}.")
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)
    if rope_scaling is not None:
        assert "factor" in rope_scaling
        scaling_factor = rope_scaling["factor"]
        derived_max_model_len *= scaling_factor

    if max_model_len is None:
        max_model_len = derived_max_model_len
    elif max_model_len > derived_max_model_len:
        raise ValueError(
            f"User-specified max_model_len ({max_model_len}) is greater than "
            f"the derived max_model_len ({max_len_key}={derived_max_model_len}"
            " in model's config.json). This may lead to incorrect model "
            "outputs or CUDA errors. Make sure the value is correct and "
            "within the model context size.")
    return int(max_model_len)
server/vllm/vllm/core/__init__.py  deleted (file mode 100644 → 0; empty file)
server/vllm/vllm/core/block_manager.py  deleted (file mode 100644 → 0)

"""A block manager that manages token blocks."""
from typing import Dict, List, Optional, Set, Tuple

from vllm.block import PhysicalTokenBlock
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device


class BlockAllocator:
    """Manages free physical token blocks for a device.

    The allocator maintains a list of free blocks and allocates a block when
    requested. When a block is freed, its reference count is decremented. If
    the reference count becomes zero, the block is added back to the free list.
    """

    def __init__(
        self,
        device: Device,
        block_size: int,
        num_blocks: int,
    ) -> None:
        self.device = device
        self.block_size = block_size
        self.num_blocks = num_blocks

        # Initialize the free blocks.
        self.free_blocks: List[PhysicalTokenBlock] = []
        for i in range(num_blocks):
            block = PhysicalTokenBlock(device=device,
                                       block_number=i,
                                       block_size=block_size)
            self.free_blocks.append(block)

    def allocate(self) -> PhysicalTokenBlock:
        if not self.free_blocks:
            raise ValueError("Out of memory! No free blocks are available.")
        block = self.free_blocks.pop()
        block.ref_count = 1
        return block

    def free(self, block: PhysicalTokenBlock) -> None:
        if block.ref_count == 0:
            raise ValueError(f"Double free! {block} is already freed.")
        block.ref_count -= 1
        if block.ref_count == 0:
            self.free_blocks.append(block)

    def get_num_free_blocks(self) -> int:
        return len(self.free_blocks)


# Mapping: logical block number -> physical block.
BlockTable = List[PhysicalTokenBlock]


class BlockSpaceManager:
    """Manages the mapping between logical and physical token blocks."""

    def __init__(
        self,
        block_size: int,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        watermark: float = 0.01,
        sliding_window: Optional[int] = None,
    ) -> None:
        self.block_size = block_size
        self.num_total_gpu_blocks = num_gpu_blocks
        self.num_total_cpu_blocks = num_cpu_blocks

        self.block_sliding_window = None
        if sliding_window is not None:
            assert sliding_window % block_size == 0, (sliding_window,
                                                      block_size)
            self.block_sliding_window = sliding_window // block_size

        self.watermark = watermark
        assert watermark >= 0.0

        self.watermark_blocks = int(watermark * num_gpu_blocks)
        self.gpu_allocator = BlockAllocator(Device.GPU, block_size,
                                            num_gpu_blocks)
        self.cpu_allocator = BlockAllocator(Device.CPU, block_size,
                                            num_cpu_blocks)
        # Mapping: seq_id -> BlockTable.
        self.block_tables: Dict[int, BlockTable] = {}

    def can_allocate(self, seq_group: SequenceGroup) -> bool:
        # FIXME(woosuk): Here we assume that all sequences in the group share
        # the same prompt. This may not be true for preempted sequences.
        seq = seq_group.get_seqs()[0]
        num_required_blocks = len(seq.logical_token_blocks)
        if self.block_sliding_window is not None:
            num_required_blocks = min(num_required_blocks,
                                      self.block_sliding_window)
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
        # Use watermark to avoid frequent cache eviction.
        return (num_free_gpu_blocks - num_required_blocks >=
                self.watermark_blocks)

    def allocate(self, seq_group: SequenceGroup) -> None:
        # NOTE: Here we assume that all sequences in the group have the same
        # prompt.
        seq = seq_group.get_seqs()[0]

        # Allocate new physical token blocks that will store the prompt tokens.
        block_table: BlockTable = []
        for logical_idx in range(len(seq.logical_token_blocks)):
            if (self.block_sliding_window is not None
                    and logical_idx >= self.block_sliding_window):
                block = block_table[logical_idx % self.block_sliding_window]
            else:
                block = self.gpu_allocator.allocate()
            # Set the reference counts of the token blocks.
            block.ref_count = seq_group.num_seqs()
            block_table.append(block)

        # Assign the block table for each sequence.
        for seq in seq_group.get_seqs():
            self.block_tables[seq.seq_id] = block_table.copy()

    def can_append_slot(self, seq_group: SequenceGroup) -> bool:
        # Simple heuristic: If there is at least one free block
        # for each sequence, we can append.
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
        num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING)
        return num_seqs <= num_free_gpu_blocks

    def append_slot(self, seq: Sequence) -> Optional[Tuple[int, int]]:
        """Allocate a physical slot for a new token."""
        logical_blocks = seq.logical_token_blocks
        block_table = self.block_tables[seq.seq_id]

        if len(block_table) < len(logical_blocks):
            if (self.block_sliding_window
                    and len(block_table) >= self.block_sliding_window):
                # re-use a block
                block_table.append(block_table[len(block_table) %
                                               self.block_sliding_window])
            else:
                # The sequence has a new logical block.
                # Allocate a new physical block.
                block = self.gpu_allocator.allocate()
                block_table.append(block)
                return None

        # We want to append the token to the last physical block.
        last_block = block_table[-1]
        assert last_block.device == Device.GPU
        if last_block.ref_count == 1:
            # Not shared with other sequences. Appendable.
            return None
        else:
            # The last block is shared with other sequences.
            # Copy on Write: Allocate a new block and copy the tokens.
            new_block = self.gpu_allocator.allocate()
            block_table[-1] = new_block
            self.gpu_allocator.free(last_block)
            return last_block.block_number, new_block.block_number

    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        # NOTE: fork does not allocate a new physical block.
        # Thus, it is always safe from OOM.
        src_block_table = self.block_tables[parent_seq.seq_id]
        self.block_tables[child_seq.seq_id] = src_block_table.copy()
        for block in src_block_table:
            block.ref_count += 1

    def _get_physical_blocks(
            self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
        # NOTE: Here, we assume that the physical blocks are only shared by
        # the sequences in the same group.
        blocks: Set[PhysicalTokenBlock] = set()
        for seq in seq_group.get_seqs():
            if seq.is_finished():
                continue
            blocks.update(self.block_tables[seq.seq_id])
        return list(blocks)

    def can_swap_in(self, seq_group: SequenceGroup) -> bool:
        blocks = self._get_physical_blocks(seq_group)
        num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
        num_free_blocks = self.gpu_allocator.get_num_free_blocks()
        # NOTE: Conservatively, we assume that every sequence will allocate
        # at least one free block right after the swap-in.
        # NOTE: This should match the logic in can_append_slot().
        num_required_blocks = len(blocks) + num_swapped_seqs
        return num_free_blocks - num_required_blocks >= self.watermark_blocks

    def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]:
        # CPU block -> GPU block.
        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
            new_block_table: BlockTable = []
            block_table = self.block_tables[seq.seq_id]

            for cpu_block in block_table:
                if cpu_block in mapping:
                    gpu_block = mapping[cpu_block]
                    gpu_block.ref_count += 1
                else:
                    gpu_block = self.gpu_allocator.allocate()
                    mapping[cpu_block] = gpu_block
                new_block_table.append(gpu_block)
                # Free the CPU block swapped in to GPU.
                self.cpu_allocator.free(cpu_block)
            self.block_tables[seq.seq_id] = new_block_table

        block_number_mapping = {
            cpu_block.block_number: gpu_block.block_number
            for cpu_block, gpu_block in mapping.items()
        }
        return block_number_mapping

    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        blocks = self._get_physical_blocks(seq_group)
        return len(blocks) <= self.cpu_allocator.get_num_free_blocks()

    def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
        # GPU block -> CPU block.
        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            new_block_table: BlockTable = []
            block_table = self.block_tables[seq.seq_id]

            for gpu_block in block_table:
                if gpu_block in mapping:
                    cpu_block = mapping[gpu_block]
                    cpu_block.ref_count += 1
                else:
                    cpu_block = self.cpu_allocator.allocate()
                    mapping[gpu_block] = cpu_block
                new_block_table.append(cpu_block)
                # Free the GPU block swapped out to CPU.
                self.gpu_allocator.free(gpu_block)
            self.block_tables[seq.seq_id] = new_block_table

        block_number_mapping = {
            gpu_block.block_number: cpu_block.block_number
            for gpu_block, cpu_block in mapping.items()
        }
        return block_number_mapping

    def _free_block_table(self, block_table: BlockTable) -> None:
        for block in set(block_table):
            if block.device == Device.GPU:
                self.gpu_allocator.free(block)
            else:
                self.cpu_allocator.free(block)

    def free(self, seq: Sequence) -> None:
        if seq.seq_id not in self.block_tables:
            # Already freed or haven't been scheduled yet.
            return
        block_table = self.block_tables[seq.seq_id]
        self._free_block_table(block_table)
        del self.block_tables[seq.seq_id]

    def reset(self) -> None:
        for block_table in self.block_tables.values():
            self._free_block_table(block_table)
        self.block_tables.clear()

    def get_block_table(self, seq: Sequence) -> List[int]:
        block_table = self.block_tables[seq.seq_id]
        return [block.block_number for block in block_table]

    def get_num_free_gpu_blocks(self) -> int:
        return self.gpu_allocator.get_num_free_blocks()

    def get_num_free_cpu_blocks(self) -> int:
        return self.cpu_allocator.get_num_free_blocks()
server/vllm/vllm/core/policy.py  deleted (file mode 100644 → 0)

from typing import List

from vllm.sequence import SequenceGroup


class Policy:

    def get_priority(
        self,
        now: float,
        seq_group: SequenceGroup,
    ) -> float:
        raise NotImplementedError

    def sort_by_priority(
        self,
        now: float,
        seq_groups: List[SequenceGroup],
    ) -> List[SequenceGroup]:
        return sorted(
            seq_groups,
            key=lambda seq_group: self.get_priority(now, seq_group),
            reverse=True,
        )


class FCFS(Policy):

    def get_priority(
        self,
        now: float,
        seq_group: SequenceGroup,
    ) -> float:
        return now - seq_group.arrival_time


class PolicyFactory:

    _POLICY_REGISTRY = {
        'fcfs': FCFS,
    }

    @classmethod
    def get_policy(cls, policy_name: str, **kwargs) -> Policy:
        return cls._POLICY_REGISTRY[policy_name](**kwargs)
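For orientation, this is how the scheduler below obtains and applies the policy: FCFS priority is now - arrival_time, so the descending sort puts the earliest arrival first. A minimal sketch, using SimpleNamespace stand-ins (illustrative only, not real SequenceGroup objects):

import time
from types import SimpleNamespace

policy = PolicyFactory.get_policy(policy_name="fcfs")
# Stand-ins that only carry the arrival_time attribute the policy reads.
groups = [SimpleNamespace(arrival_time=t) for t in (10.0, 5.0, 7.5)]
ordered = policy.sort_by_priority(time.monotonic(), groups)
# ordered[0] has arrival_time == 5.0, i.e. the earliest request.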
server/vllm/vllm/core/scheduler.py  deleted (file mode 100644 → 0)

import enum
import time
from typing import Dict, Iterable, List, Optional, Tuple, Union

from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.block_manager import BlockSpaceManager
from vllm.core.policy import PolicyFactory
from vllm.logger import init_logger
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
                           SequenceGroupMetadata, SequenceStatus)

logger = init_logger(__name__)


class PreemptionMode(enum.Enum):
    """Preemption modes.

    1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
    and swap them back in when the sequences are resumed.
    2. Recomputation: Discard the blocks of the preempted sequences and
    recompute them when the sequences are resumed, treating the sequences as
    new prompts.
    """
    SWAP = enum.auto()
    RECOMPUTE = enum.auto()


class SchedulerOutputs:

    def __init__(
        self,
        scheduled_seq_groups: List[SequenceGroup],
        prompt_run: bool,
        num_batched_tokens: int,
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
        ignored_seq_groups: List[SequenceGroup],
    ) -> None:
        self.scheduled_seq_groups = scheduled_seq_groups
        self.prompt_run = prompt_run
        self.num_batched_tokens = num_batched_tokens
        self.blocks_to_swap_in = blocks_to_swap_in
        self.blocks_to_swap_out = blocks_to_swap_out
        self.blocks_to_copy = blocks_to_copy
        # Swap in and swap out should never happen at the same time.
        assert not (blocks_to_swap_in and blocks_to_swap_out)
        self.ignored_seq_groups = ignored_seq_groups

    def is_empty(self) -> bool:
        # NOTE: We do not consider the ignored sequence groups.
        return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
                and not self.blocks_to_swap_out and not self.blocks_to_copy)


class Scheduler:

    def __init__(
        self,
        scheduler_config: SchedulerConfig,
        cache_config: CacheConfig,
    ) -> None:
        self.scheduler_config = scheduler_config
        self.cache_config = cache_config

        self.prompt_limit = min(self.scheduler_config.max_model_len,
                                self.scheduler_config.max_num_batched_tokens)

        # Instantiate the scheduling policy.
        self.policy = PolicyFactory.get_policy(policy_name="fcfs")
        # Create the block space manager.
        self.block_manager = BlockSpaceManager(
            block_size=self.cache_config.block_size,
            num_gpu_blocks=self.cache_config.num_gpu_blocks,
            num_cpu_blocks=self.cache_config.num_cpu_blocks,
            sliding_window=self.cache_config.sliding_window)

        # TODO(zhuohan): Use deque instead of list for better performance.
        # Sequence groups in the WAITING state.
        self.waiting: List[SequenceGroup] = []
        # Sequence groups in the RUNNING state.
        self.running: List[SequenceGroup] = []
        # Sequence groups in the SWAPPED state.
        self.swapped: List[SequenceGroup] = []

    def add_seq_group(self, seq_group: SequenceGroup) -> None:
        # Add sequence groups to the waiting queue.
        self.waiting.append(seq_group)

    def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
        if isinstance(request_id, str):
            request_id = (request_id, )
        request_ids = set(request_id)
        for state_queue in [self.waiting, self.running, self.swapped]:
            # We need to reverse the list as we are removing elements
            # from it as we iterate over it. If we don't do it,
            # indices will get messed up and we will skip over elements.
            for seq_group in reversed(state_queue):
                if seq_group.request_id in request_ids:
                    # Remove the sequence group from the state queue.
                    state_queue.remove(seq_group)
                    for seq in seq_group.get_seqs():
                        if seq.is_finished():
                            continue
                        seq.status = SequenceStatus.FINISHED_ABORTED
                        self.free_seq(seq)
                    request_ids.remove(seq_group.request_id)
                    if not request_ids:
                        return

    def has_unfinished_seqs(self) -> bool:
        return self.waiting or self.running or self.swapped

    def get_num_unfinished_seq_groups(self) -> int:
        return len(self.waiting) + len(self.running) + len(self.swapped)

    def _schedule(self) -> SchedulerOutputs:
        # Blocks that need to be swaped or copied before model execution.
        blocks_to_swap_in: Dict[int, int] = {}
        blocks_to_swap_out: Dict[int, int] = {}
        blocks_to_copy: Dict[int, List[int]] = {}

        # Fix the current time.
        now = time.monotonic()

        # Join waiting sequences if possible.
        if not self.swapped:
            ignored_seq_groups: List[SequenceGroup] = []
            scheduled: List[SequenceGroup] = []
            # The total number of sequences on the fly, including the
            # requests in the generation phase.
            num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
                                for seq_group in self.running)
            seq_lens: List[int] = []

            # Optimization: We do not sort the waiting queue since the preempted
            # sequence groups are added to the front and the new sequence groups
            # are added to the back.
            while self.waiting:
                seq_group = self.waiting[0]

                assert seq_group.num_seqs() == 1, (
                    "Waiting sequence group should have only one prompt "
                    "sequence.")
                num_prompt_tokens = seq_group.get_seqs()[0].get_len()
                if num_prompt_tokens > self.prompt_limit:
                    logger.warning(
                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
                        f" and exceeds limit of {self.prompt_limit}")
                    for seq in seq_group.get_seqs():
                        seq.status = SequenceStatus.FINISHED_IGNORED
                    ignored_seq_groups.append(seq_group)
                    self.waiting.pop(0)
                    continue

                # If the sequence group cannot be allocated, stop.
                if not self.block_manager.can_allocate(seq_group):
                    break

                # If the number of batched tokens exceeds the limit, stop.
                new_seq_lens = seq_lens + [num_prompt_tokens]
                num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)
                if (num_batched_tokens >
                        self.scheduler_config.max_num_batched_tokens):
                    break

                # The total number of sequences in the RUNNING state should not
                # exceed the maximum number of sequences.
                num_new_seqs = seq_group.get_max_num_running_seqs()
                if (num_curr_seqs + num_new_seqs >
                        self.scheduler_config.max_num_seqs):
                    break

                num_paddings = num_batched_tokens - sum(new_seq_lens)
                if num_paddings > self.scheduler_config.max_paddings:
                    break
                seq_lens = new_seq_lens

                seq_group = self.waiting.pop(0)
                self._allocate(seq_group)
                self.running.append(seq_group)
                num_curr_seqs += num_new_seqs
                scheduled.append(seq_group)

            if scheduled or ignored_seq_groups:
                scheduler_outputs = SchedulerOutputs(
                    scheduled_seq_groups=scheduled,
                    prompt_run=True,
                    num_batched_tokens=len(seq_lens) * max(seq_lens),
                    blocks_to_swap_in=blocks_to_swap_in,
                    blocks_to_swap_out=blocks_to_swap_out,
                    blocks_to_copy=blocks_to_copy,
                    ignored_seq_groups=ignored_seq_groups,
                )
                return scheduler_outputs

        # NOTE(woosuk): Preemption happens only when there is no available slot
        # to keep all the sequence groups in the RUNNING state.
        # In this case, the policy is responsible for deciding which sequence
        # groups to preempt.
        self.running = self.policy.sort_by_priority(now, self.running)

        # Reserve new token slots for the running sequence groups.
        running: List[SequenceGroup] = []
        preempted: List[SequenceGroup] = []
        while self.running:
            seq_group = self.running.pop(0)
            while not self.block_manager.can_append_slot(seq_group):
                if self.running:
                    # Preempt the lowest-priority sequence groups.
                    victim_seq_group = self.running.pop(-1)
                    self._preempt(victim_seq_group, blocks_to_swap_out)
                    preempted.append(victim_seq_group)
                else:
                    # No other sequence groups can be preempted.
                    # Preempt the current sequence group.
                    self._preempt(seq_group, blocks_to_swap_out)
                    preempted.append(seq_group)
                    break
            else:
                # Append new slots to the sequence group.
                self._append_slot(seq_group, blocks_to_copy)
                running.append(seq_group)
        self.running = running

        # Swap in the sequence groups in the SWAPPED state if possible.
        self.swapped = self.policy.sort_by_priority(now, self.swapped)
        if not preempted:
            num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
                                for seq_group in self.running)

            while self.swapped:
                seq_group = self.swapped[0]
                # If the sequence group cannot be swapped in, stop.
                if not self.block_manager.can_swap_in(seq_group):
                    break

                # The total number of sequences in the RUNNING state should not
                # exceed the maximum number of sequences.
                num_new_seqs = seq_group.get_max_num_running_seqs()
                if (num_curr_seqs + num_new_seqs >
                        self.scheduler_config.max_num_seqs):
                    break

                seq_group = self.swapped.pop(0)
                self._swap_in(seq_group, blocks_to_swap_in)
                self._append_slot(seq_group, blocks_to_copy)
                num_curr_seqs += num_new_seqs
                self.running.append(seq_group)

        # Each sequence in the generation phase only takes one token slot.
        # Therefore, the number of batched tokens is equal to the number of
        # sequences in the RUNNING state.
        num_batched_tokens = sum(
            seq_group.num_seqs(status=SequenceStatus.RUNNING)
            for seq_group in self.running)

        scheduler_outputs = SchedulerOutputs(
            scheduled_seq_groups=self.running,
            prompt_run=False,
            num_batched_tokens=num_batched_tokens,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
            ignored_seq_groups=[],
        )
        return scheduler_outputs

    def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
        # Schedule sequence groups.
        # This function call changes the internal states of the scheduler
        # such as self.running, self.swapped, and self.waiting.
        scheduler_outputs = self._schedule()

        # Create input data structures.
        seq_group_metadata_list: List[SequenceGroupMetadata] = []
        for seq_group in scheduler_outputs.scheduled_seq_groups:
            seq_data: Dict[int, List[SequenceData]] = {}
            block_tables: Dict[int, List[int]] = {}
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                seq_id = seq.seq_id
                seq_data[seq_id] = seq.data
                block_tables[seq_id] = self.block_manager.get_block_table(seq)

            seq_group_metadata = SequenceGroupMetadata(
                request_id=seq_group.request_id,
                is_prompt=scheduler_outputs.prompt_run,
                seq_data=seq_data,
                sampling_params=seq_group.sampling_params,
                block_tables=block_tables,
            )
            seq_group_metadata_list.append(seq_group_metadata)
        return seq_group_metadata_list, scheduler_outputs

    def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        self.block_manager.fork(parent_seq, child_seq)

    def free_seq(self, seq: Sequence) -> None:
        self.block_manager.free(seq)

    def free_finished_seq_groups(self) -> None:
        self.running = [
            seq_group for seq_group in self.running
            if not seq_group.is_finished()
        ]

    def _allocate(self, seq_group: SequenceGroup) -> None:
        self.block_manager.allocate(seq_group)
        for seq in seq_group.get_seqs():
            seq.status = SequenceStatus.RUNNING

    def _append_slot(
        self,
        seq_group: SequenceGroup,
        blocks_to_copy: Dict[int, List[int]],
    ) -> None:
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            ret = self.block_manager.append_slot(seq)
            if ret is not None:
                src_block, dst_block = ret
                if src_block in blocks_to_copy:
                    blocks_to_copy[src_block].append(dst_block)
                else:
                    blocks_to_copy[src_block] = [dst_block]

    def _preempt(
        self,
        seq_group: SequenceGroup,
        blocks_to_swap_out: Dict[int, int],
        preemption_mode: Optional[PreemptionMode] = None,
    ) -> None:
        # If preemption mode is not specified, we determine the mode as follows:
        # We use recomputation by default since it incurs lower overhead than
        # swapping. However, when the sequence group has multiple sequences
        # (e.g., beam search), recomputation is not currently supported. In
        # such a case, we use swapping instead.
        # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
        # As swapped sequences are prioritized over waiting sequences,
        # sequence groups with multiple sequences are implicitly prioritized
        # over sequence groups with a single sequence.
        # TODO(woosuk): Support recomputation for sequence groups with multiple
        # sequences. This may require a more sophisticated CUDA kernel.
        if preemption_mode is None:
            if seq_group.get_max_num_running_seqs() == 1:
                preemption_mode = PreemptionMode.RECOMPUTE
            else:
                preemption_mode = PreemptionMode.SWAP
        if preemption_mode == PreemptionMode.RECOMPUTE:
            self._preempt_by_recompute(seq_group)
        elif preemption_mode == PreemptionMode.SWAP:
            self._preempt_by_swap(seq_group, blocks_to_swap_out)
        else:
            assert False, "Invalid preemption mode."

    def _preempt_by_recompute(
        self,
        seq_group: SequenceGroup,
    ) -> None:
        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
        assert len(seqs) == 1
        for seq in seqs:
            seq.status = SequenceStatus.WAITING
            self.block_manager.free(seq)
        # NOTE: For FCFS, we insert the preempted sequence group to the front
        # of the waiting queue.
        self.waiting.insert(0, seq_group)

    def _preempt_by_swap(
        self,
        seq_group: SequenceGroup,
        blocks_to_swap_out: Dict[int, int],
    ) -> None:
        self._swap_out(seq_group, blocks_to_swap_out)
        self.swapped.append(seq_group)

    def _swap_in(
        self,
        seq_group: SequenceGroup,
        blocks_to_swap_in: Dict[int, int],
    ) -> None:
        mapping = self.block_manager.swap_in(seq_group)
        blocks_to_swap_in.update(mapping)
        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
            seq.status = SequenceStatus.RUNNING

    def _swap_out(
        self,
        seq_group: SequenceGroup,
        blocks_to_swap_out: Dict[int, int],
    ) -> None:
        if not self.block_manager.can_swap_out(seq_group):
            # FIXME(woosuk): Abort the sequence group instead of aborting the
            # entire engine.
            raise RuntimeError(
                "Aborted due to the lack of CPU swap space. Please increase "
                "the swap space to avoid this error.")
        mapping = self.block_manager.swap_out(seq_group)
        blocks_to_swap_out.update(mapping)
        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
            seq.status = SequenceStatus.SWAPPED
server/vllm/vllm/engine/__init__.py  deleted (file mode 100644 → 0; empty file)
server/vllm/vllm/engine/arg_utils.py  deleted (file mode 100644 → 0)

import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)


@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None

    def __post_init__(self):
        if self.tokenizer is None:
            self.tokenizer = self.model

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
        # Model arguments
        parser.add_argument(
            '--model',
            type=str,
            default='facebook/opt-125m',
            help='name or path of the huggingface model to use')
        parser.add_argument(
            '--tokenizer',
            type=str,
            default=EngineArgs.tokenizer,
            help='name or path of the huggingface tokenizer to use')
        parser.add_argument(
            '--revision',
            type=str,
            default=None,
            help='the specific model version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument(
            '--tokenizer-revision',
            type=str,
            default=None,
            help='the specific tokenizer version to use. It can be a branch '
            'name, a tag name, or a commit id. If unspecified, will use '
            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code',
                            action='store_true',
                            help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        parser.add_argument(
            '--load-format',
            type=str,
            default=EngineArgs.load_format,
            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
            help='The format of the model weights to load. '
            '"auto" will try to load the weights in the safetensors format '
            'and fall back to the pytorch bin format if safetensors format '
            'is not available. '
            '"pt" will load the weights in the pytorch bin format. '
            '"safetensors" will load the weights in the safetensors format. '
            '"npcache" will load the weights in pytorch format and store '
            'a numpy cache to speed up the loading. '
            '"dummy" will initialize the weights with random values, '
            'which is mainly for profiling.')
        parser.add_argument(
            '--dtype',
            type=str,
            default=EngineArgs.dtype,
            choices=[
                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
            ],
            help='data type for model weights and activations. '
            'The "auto" option will use FP16 precision '
            'for FP32 and FP16 models, and BF16 precision '
            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed',
                            type=int,
                            default=EngineArgs.seed,
                            help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization',
                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the percentage of GPU memory to be used for'
                            'the model executor')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--max-paddings',
                            type=int,
                            default=EngineArgs.max_paddings,
                            help='maximum number of paddings in a batch')
        parser.add_argument('--disable-log-stats',
                            action='store_true',
                            help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', None],
                            default=None,
                            help='Method used to quantize the weights')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
        model_config = ModelConfig(self.model, self.tokenizer,
                                   self.tokenizer_mode, self.trust_remote_code,
                                   self.download_dir, self.load_format,
                                   self.dtype, self.seed, self.revision,
                                   self.tokenizer_revision, self.max_model_len,
                                   self.quantization)
        cache_config = CacheConfig(self.block_size,
                                   self.gpu_memory_utilization,
                                   self.swap_space,
                                   getattr(model_config.hf_config,
                                           'sliding_window', None))
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
                                           self.max_paddings)
        return model_config, cache_config, parallel_config, scheduler_config


@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests',
                            action='store_true',
                            help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
server/vllm/vllm/engine/async_llm_engine.py  deleted (file mode 100644 → 0)

import asyncio
import time
from functools import partial
from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type,
                    Union)

from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.ray_utils import initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams

logger = init_logger(__name__)


class AsyncEngineDeadError(RuntimeError):
    pass


def _raise_exception_on_finish(task: asyncio.Task,
                               request_tracker: "RequestTracker") -> None:
    msg = ("Task finished unexpectedly. This should never happen! "
           "Please open an issue on Github.")
    try:
        try:
            task.result()
        except asyncio.CancelledError:
            return
        except Exception as exc:
            raise AsyncEngineDeadError(
                msg + " See stack trace above for the actual cause.") from exc
        raise AsyncEngineDeadError(msg)
    except Exception as exc:
        request_tracker.propagate_exception(exc)
        raise exc


class AsyncStream:
    """A stream of RequestOutputs for a request that can be
    iterated over asynchronously."""

    def __init__(self, request_id: str) -> None:
        self.request_id = request_id
        self._queue = asyncio.Queue()
        self._finished = False

    def put(self, item: RequestOutput) -> None:
        if self._finished:
            return
        self._queue.put_nowait(item)

    def finish(self) -> None:
        self._queue.put_nowait(StopIteration)
        self._finished = True

    @property
    def finished(self) -> bool:
        return self._finished

    def __aiter__(self):
        return self

    async def __anext__(self) -> RequestOutput:
        result = await self._queue.get()
        if result is StopIteration:
            raise StopAsyncIteration
        elif isinstance(result, Exception):
            raise result
        return result


class RequestTracker:
    """Synchronous abstraction for tracking requests."""

    def __init__(self) -> None:
        self._request_streams: Dict[str, AsyncStream] = {}
        self._finished_requests: asyncio.Queue[str] = asyncio.Queue()
        self._new_requests: asyncio.Queue[Tuple[AsyncStream,
                                                dict]] = asyncio.Queue()
        self.new_requests_event = None

    def __contains__(self, item):
        return item in self._request_streams

    def init_event(self):
        self.new_requests_event = asyncio.Event()

    def propagate_exception(self,
                            exc: Exception,
                            request_id: Optional[str] = None) -> None:
        """Propagate an exception to request streams
        (all if request_id is None)."""
        if request_id is not None:
            self._request_streams[request_id].put(exc)
        else:
            for stream in self._request_streams.values():
                stream.put(exc)

    def process_request_output(self,
                               request_output: RequestOutput,
                               *,
                               verbose: bool = False) -> None:
        """Process a request output from the engine."""
        request_id = request_output.request_id

        self._request_streams[request_id].put(request_output)
        if request_output.finished:
            if verbose:
                logger.info(f"Finished request {request_id}.")
            self.abort_request(request_id)

    def add_request(self, request_id: str,
                    **engine_add_request_kwargs) -> AsyncStream:
        """Add a request to be sent to the engine on the next background
        loop iteration."""
        if request_id in self._request_streams:
            raise KeyError(f"Request {request_id} already exists.")

        stream = AsyncStream(request_id)
        self._new_requests.put_nowait((stream, {
            "request_id": request_id,
            **engine_add_request_kwargs
        }))

        self.new_requests_event.set()

        return stream

    def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
        """Abort a request during next background loop iteration."""
        if verbose:
            logger.info(f"Aborted request {request_id}.")

        self._finished_requests.put_nowait(request_id)

        if request_id not in self._request_streams or self._request_streams[
                request_id].finished:
            # The request has already finished or been aborted.
            return

        self._request_streams[request_id].finish()

    def get_new_and_finished_requests(self) -> Tuple[List[dict], Set[str]]:
        """Get the new requests and finished requests to be
        sent to the engine."""
        new_requests: List[dict] = []
        finished_requests: Set[str] = set()

        while not self._finished_requests.empty():
            request_id = self._finished_requests.get_nowait()
            finished_requests.add(request_id)
            self._request_streams.pop(request_id, None)

        while not self._new_requests.empty():
            stream, new_request = self._new_requests.get_nowait()
            if stream.request_id in finished_requests:
                # The request has already been aborted.
                stream.finish()
                continue
            self._request_streams[stream.request_id] = stream
            new_requests.append(new_request)

        self.new_requests_event.clear()

        return new_requests, finished_requests

    async def wait_for_new_requests(self):
        await self.new_requests_event.wait()


class _AsyncLLMEngine(LLMEngine):
    """Extension of LLMEngine to add async methods."""

    async def step_async(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.
        The workers are ran asynchronously if possible.

        This function performs one decoding iteration of the engine. It first
        schedules the sequences to be executed in the next iteration and the
        token blocks to be swapped in/out/copy. Then, it executes the model
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.
        """
        seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
        if scheduler_outputs.is_empty():
            return ignored

        # Execute the model.
        output = await self._run_workers_async(
            "execute_model",
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
            blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
            blocks_to_copy=scheduler_outputs.blocks_to_copy,
        )

        return self._process_model_outputs(output, scheduler_outputs) + ignored

    async def _run_workers_async(
        self,
        method: str,
        *args,
        get_all_outputs: bool = False,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
        all_outputs = []
        for worker in self.workers:
            if self.parallel_config.worker_use_ray:
                executor = partial(worker.execute_method.remote, method)
            else:
                executor = getattr(worker, method)

            output = executor(*args, **kwargs)
            all_outputs.append(output)

        if self.parallel_config.worker_use_ray:
            all_outputs = await asyncio.gather(*all_outputs)

        if get_all_outputs:
            return all_outputs

        # Make sure all workers have the same results.
        output = all_outputs[0]
        for other_output in all_outputs[1:]:
            assert output == other_output
        return output


class AsyncLLMEngine:
    """An asynchronous wrapper for LLMEngine.

    This class is used to wrap the LLMEngine class to make it asynchronous. It
    uses asyncio to create a background loop that keeps processing incoming
    requests. The LLMEngine is kicked by the generate method when there
    are requests in the waiting queue. The generate method yields the outputs
    from the LLMEngine to the caller.

    NOTE: For the comprehensive list of arguments, see `LLMEngine`.

    Args:
        worker_use_ray: Whether to use Ray for model workers. Required for
            distributed execution. Should be the same as
            `parallel_config.worker_use_ray`.
        engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
            async frontend will be executed in a separate process as the
            model workers.
        log_requests: Whether to log the requests.
        start_engine_loop: If True, the background task to run the engine
            will be automatically started in the generate call.
        *args, *kwargs: Arguments for LLMEngine.
    """

    _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine

    def __init__(self,
                 worker_use_ray: bool,
                 engine_use_ray: bool,
                 *args,
                 log_requests: bool = True,
                 max_log_len: Optional[int] = None,
                 start_engine_loop: bool = True,
                 **kwargs) -> None:
        self.worker_use_ray = worker_use_ray
        self.engine_use_ray = engine_use_ray
        self.log_requests = log_requests
        self.max_log_len = max_log_len
        self.engine = self._init_engine(*args, **kwargs)

        self.background_loop = None
        # We need to keep a reference to unshielded
        # task as well to prevent it from being garbage
        # collected
        self._background_loop_unshielded = None
        self.start_engine_loop = start_engine_loop
        self._request_tracker = RequestTracker()

    @property
    def is_running(self) -> bool:
        return (self.background_loop is not None
                and not self.background_loop.done())

    def start_background_loop(self) -> None:
        """Start the background loop."""
        if self.is_running:
            raise RuntimeError("Background loop is already running.")
        self._request_tracker.init_event()

        self._background_loop_unshielded = asyncio.get_event_loop(
        ).create_task(self.run_engine_loop())
        self._background_loop_unshielded.add_done_callback(
            partial(_raise_exception_on_finish,
                    request_tracker=self._request_tracker))
        self.background_loop = asyncio.shield(self._background_loop_unshielded)

    def _init_engine(self, *args,
                     **kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
        if not self.engine_use_ray:
            engine_class = self._engine_class
        elif self.worker_use_ray:
            engine_class = ray.
remote
(
num_cpus
=
0
)(
self
.
_engine_class
).
remote
else
:
engine_class
=
ray
.
remote
(
num_gpus
=
1
)(
self
.
_engine_class
).
remote
return
engine_class
(
*
args
,
**
kwargs
)
async
def
engine_step
(
self
)
->
bool
:
"""Kick the engine to process the waiting requests.
Returns True if there are in-progress requests."""
new_requests
,
finished_requests
=
(
self
.
_request_tracker
.
get_new_and_finished_requests
())
for
new_request
in
new_requests
:
# Add the request into the vLLM engine's waiting queue.
# TODO: Maybe add add_request_batch to reduce Ray overhead
if
self
.
engine_use_ray
:
await
self
.
engine
.
add_request
.
remote
(
**
new_request
)
else
:
self
.
engine
.
add_request
(
**
new_request
)
if
finished_requests
:
await
self
.
_engine_abort
(
finished_requests
)
if
self
.
engine_use_ray
:
request_outputs
=
await
self
.
engine
.
step
.
remote
()
else
:
request_outputs
=
await
self
.
engine
.
step_async
()
# Put the outputs into the corresponding streams.
for
request_output
in
request_outputs
:
self
.
_request_tracker
.
process_request_output
(
request_output
,
verbose
=
self
.
log_requests
)
return
len
(
request_outputs
)
>
0
async
def
_engine_abort
(
self
,
request_ids
:
Iterable
[
str
]):
if
self
.
engine_use_ray
:
await
self
.
engine
.
abort_request
.
remote
(
request_ids
)
else
:
self
.
engine
.
abort_request
(
request_ids
)
async
def
run_engine_loop
(
self
):
# Initialize the RequestTracker here so it uses the right event loop.
has_requests_in_progress
=
False
while
True
:
if
not
has_requests_in_progress
:
await
self
.
_request_tracker
.
wait_for_new_requests
()
has_requests_in_progress
=
await
self
.
engine_step
()
await
asyncio
.
sleep
(
0
)
async
def
add_request
(
self
,
request_id
:
str
,
prompt
:
Optional
[
str
],
sampling_params
:
SamplingParams
,
prompt_token_ids
:
Optional
[
List
[
int
]]
=
None
,
arrival_time
:
Optional
[
float
]
=
None
,
)
->
AsyncStream
:
if
self
.
log_requests
:
shortened_prompt
=
prompt
shortened_token_ids
=
prompt_token_ids
if
self
.
max_log_len
is
not
None
:
if
shortened_prompt
is
not
None
:
shortened_prompt
=
shortened_prompt
[:
self
.
max_log_len
]
if
shortened_token_ids
is
not
None
:
shortened_token_ids
=
shortened_token_ids
[:
self
.
max_log_len
]
logger
.
info
(
f
"Received request
{
request_id
}
: "
f
"prompt:
{
shortened_prompt
!
r
}
, "
f
"sampling params:
{
sampling_params
}
, "
f
"prompt token ids:
{
shortened_token_ids
}
."
)
if
not
self
.
is_running
:
if
self
.
start_engine_loop
:
self
.
start_background_loop
()
else
:
raise
AsyncEngineDeadError
(
"Background loop is not running. If it was running, "
"inspect the output to find the stacktrace of the "
"error that caused the background loop to stop "
"(AsyncEngineDeadError)."
)
stream
=
self
.
_request_tracker
.
add_request
(
request_id
,
prompt
=
prompt
,
sampling_params
=
sampling_params
,
prompt_token_ids
=
prompt_token_ids
,
arrival_time
=
arrival_time
)
return
stream
async
def
generate
(
self
,
prompt
:
Optional
[
str
],
sampling_params
:
SamplingParams
,
request_id
:
str
,
prompt_token_ids
:
Optional
[
List
[
int
]]
=
None
)
->
RequestOutput
:
"""Generate outputs for a request.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
from the LLMEngine to the caller.
Args:
prompt: The prompt string. Can be None if prompt_token_ids is
provided.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
prompt_token_ids: The token IDs of the prompt. If None, we
use the tokenizer to convert the prompts to token IDs.
Yields:
The output `RequestOutput` objects from the LLMEngine for the
request.
"""
# Preprocess the request.
# This should not be used for logging, as it is monotonic time.
arrival_time
=
time
.
monotonic
()
try
:
stream
=
await
self
.
add_request
(
request_id
,
prompt
,
sampling_params
,
prompt_token_ids
=
prompt_token_ids
,
arrival_time
=
arrival_time
)
async
for
request_output
in
stream
:
yield
request_output
except
(
Exception
,
asyncio
.
CancelledError
)
as
e
:
# If there is an exception or coroutine is cancelled, abort the
# request.
self
.
_abort
(
request_id
)
raise
e
async
def
abort
(
self
,
request_id
:
str
)
->
None
:
"""Abort a request.
Abort a submitted request. If the request is finished or not found,
this method will be a no-op.
Args:
request_id: The unique id of the request.
"""
if
not
self
.
is_running
:
raise
AsyncEngineDeadError
(
"Background loop is not running. If it was running, "
"inspect the output to find the stacktrace of the "
"error that caused the background loop to stop "
"(AsyncEngineDeadError)."
)
return
self
.
_abort
(
request_id
)
def
_abort
(
self
,
request_id
:
str
)
->
None
:
"""Abort a request.
Abort a submitted request. If the request is finished or not found,
this method will be a no-op.
Args:
request_id: The unique id of the request.
"""
self
.
_request_tracker
.
abort_request
(
request_id
,
verbose
=
self
.
log_requests
)
async
def
get_model_config
(
self
)
->
ModelConfig
:
"""Get the model configuration of the vLLM engine."""
if
self
.
engine_use_ray
:
return
await
self
.
engine
.
get_model_config
.
remote
()
else
:
return
self
.
engine
.
get_model_config
()
@
classmethod
def
from_engine_args
(
cls
,
engine_args
:
AsyncEngineArgs
,
start_engine_loop
:
bool
=
True
)
->
"AsyncLLMEngine"
:
"""Creates an async LLM engine from the engine arguments."""
# Create the engine configs.
engine_configs
=
engine_args
.
create_engine_configs
()
parallel_config
=
engine_configs
[
2
]
# Initialize the cluster.
distributed_init_method
,
placement_group
=
initialize_cluster
(
parallel_config
,
engine_args
.
engine_use_ray
)
# Create the async LLM engine.
engine
=
cls
(
engine_args
.
worker_use_ray
,
engine_args
.
engine_use_ray
,
*
engine_configs
,
distributed_init_method
,
placement_group
,
log_requests
=
not
engine_args
.
disable_log_requests
,
log_stats
=
not
engine_args
.
disable_log_stats
,
max_log_len
=
engine_args
.
max_log_len
,
start_engine_loop
=
start_engine_loop
)
return
engine
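A minimal usage sketch of the async engine above, driven from a plain asyncio script; it is illustrative only (not part of the deleted file) and assumes a HuggingFace model id such as "facebook/opt-125m" is available:

import asyncio

from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams


async def main():
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(temperature=0.8, max_tokens=32)
    final = None
    # generate() is an async generator; each yielded RequestOutput carries
    # the text produced so far for this request.
    async for output in engine.generate("Hello, my name is", params,
                                        request_id="req-0"):
        final = output
    print(final.outputs[0].text)


asyncio.run(main())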
server/vllm/vllm/engine/llm_engine.py deleted 100644 → 0
import copy
import time
from functools import partial
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.ray_utils import RayWorker, initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
                           SequenceGroupMetadata, SequenceGroupOutputs,
                           SequenceOutputs, SequenceStatus)
from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                               get_tokenizer)
from vllm.utils import Counter

if ray:
    from ray.air.util.torch_dist import init_torch_dist_process_group
    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup

logger = init_logger(__name__)

_LOGGING_INTERVAL_SEC = 5


class LLMEngine:
    """An LLM engine that receives requests and generates texts.

    This is the main class for the vLLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The `LLM` class wraps this class for offline batched inference and the
    `AsyncLLMEngine` class wraps this class for online serving.

    NOTE: The config arguments are derived from the `EngineArgs` class. For the
    comprehensive list of arguments, see `EngineArgs`.

    Args:
        model_config: The configuration related to the LLM model.
        cache_config: The configuration related to the KV cache memory
            management.
        parallel_config: The configuration related to distributed execution.
        scheduler_config: The configuration related to the request scheduler.
        distributed_init_method: The initialization method for distributed
            execution. See `torch.distributed.init_process_group` for details.
        placement_group: Ray placement group for distributed execution.
            Required for distributed execution.
        log_stats: Whether to log statistics.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        distributed_init_method: str,
        placement_group: Optional["PlacementGroup"],
        log_stats: bool,
    ) -> None:
        logger.info(
            "Initializing an LLM engine with config: "
            f"model={model_config.model!r}, "
            f"tokenizer={model_config.tokenizer!r}, "
            f"tokenizer_mode={model_config.tokenizer_mode}, "
            f"revision={model_config.revision}, "
            f"tokenizer_revision={model_config.tokenizer_revision}, "
            f"trust_remote_code={model_config.trust_remote_code}, "
            f"dtype={model_config.dtype}, "
            f"max_seq_len={model_config.max_model_len}, "
            f"download_dir={model_config.download_dir!r}, "
            f"load_format={model_config.load_format}, "
            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
            f"quantization={model_config.quantization}, "
            f"seed={model_config.seed})")
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config
        self.cache_config = cache_config
        assert self.cache_config.sliding_window == getattr(
            self.model_config.hf_config, "sliding_window", None)
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.log_stats = log_stats
        self._verify_args()

        self.tokenizer = get_tokenizer(
            model_config.tokenizer,
            tokenizer_mode=model_config.tokenizer_mode,
            trust_remote_code=model_config.trust_remote_code,
            tokenizer_revision=model_config.tokenizer_revision,
            revision=model_config.revision)
        self.seq_counter = Counter()

        # Create the parallel GPU workers.
        if self.parallel_config.worker_use_ray:
            self._init_workers_ray(placement_group)
        else:
            self._init_workers(distributed_init_method)

        # Profile the memory usage and initialize the cache.
        self._init_cache()

        # Create the scheduler.
        self.scheduler = Scheduler(scheduler_config, cache_config)

        # Logging.
        self.last_logging_time = 0.0
        # List of (timestamp, num_tokens)
        self.num_prompt_tokens: List[Tuple[float, int]] = []
        # List of (timestamp, num_tokens)
        self.num_generation_tokens: List[Tuple[float, int]] = []

    def _init_workers(self, distributed_init_method: str):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel

        assert self.parallel_config.world_size == 1, (
            "Ray is required if parallel_config.world_size > 1.")

        self.workers: List[Worker] = []
        worker = Worker(
            self.model_config,
            self.parallel_config,
            self.scheduler_config,
            0,
            distributed_init_method,
        )
        self.workers.append(worker)
        self._run_workers(
            "init_model",
            get_all_outputs=True,
        )

    def _init_workers_ray(self, placement_group: "PlacementGroup",
                          **ray_remote_kwargs):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel

        self.workers: List[Worker] = []
        for bundle in placement_group.bundle_specs:
            if not bundle.get("GPU", 0):
                continue
            worker = ray.remote(
                num_cpus=0,
                num_gpus=1,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=placement_group,
                    placement_group_capture_child_tasks=True),
                **ray_remote_kwargs,
            )(RayWorker).remote(self.model_config.trust_remote_code)
            self.workers.append(worker)

        # Initialize torch distributed process group for the workers.
        init_torch_dist_process_group(self.workers, backend="nccl")
        model_config = copy.deepcopy(self.model_config)
        parallel_config = copy.deepcopy(self.parallel_config)
        scheduler_config = copy.deepcopy(self.scheduler_config)
        self._run_workers("init_worker",
                          get_all_outputs=True,
                          worker_init_fn=lambda: Worker(
                              model_config,
                              parallel_config,
                              scheduler_config,
                              None,
                              None,
                          ))
        self._run_workers(
            "init_model",
            get_all_outputs=True,
        )

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)

    def _init_cache(self) -> None:
        """Profiles the memory usage and initializes the KV cache."""
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self._run_workers(
            "profile_num_available_blocks",
            get_all_outputs=True,
            block_size=self.cache_config.block_size,
            gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
            cpu_swap_space=self.cache_config.swap_space_bytes,
        )

        # Since we use a shared centralized controller, we take the minimum
        # number of blocks across all workers to make sure all the memory
        # operators can be applied to all workers.
        num_gpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)
        # FIXME(woosuk): Change to debug log.
        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
                    f"# CPU blocks: {num_cpu_blocks}")

        if num_gpu_blocks <= 0:
            raise ValueError("No available memory for the cache blocks. "
                             "Try increasing `gpu_memory_utilization` when "
                             "initializing the engine.")

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        # Initialize the cache.
        self._run_workers("init_cache_engine", cache_config=self.cache_config)

    @classmethod
    def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_configs = engine_args.create_engine_configs()
        parallel_config = engine_configs[2]
        # Initialize the cluster.
        distributed_init_method, placement_group = initialize_cluster(
            parallel_config)
        # Create the LLM engine.
        engine = cls(*engine_configs,
                     distributed_init_method,
                     placement_group,
                     log_stats=not engine_args.disable_log_stats)
        return engine

    def add_request(
        self,
        request_id: str,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]] = None,
        arrival_time: Optional[float] = None,
    ) -> None:
        """Add a request to the engine's request pool.

        The request is added to the request pool and will be processed by the
        scheduler as `engine.step()` is called. The exact scheduling policy is
        determined by the scheduler.

        Args:
            request_id: The unique ID of the request.
            prompt: The prompt string. Can be None if prompt_token_ids is
                provided.
            sampling_params: The sampling parameters for text generation.
            prompt_token_ids: The token IDs of the prompt. If None, we
                use the tokenizer to convert the prompts to token IDs.
            arrival_time: The arrival time of the request. If None, we use
                the current monotonic time.
        """
        if arrival_time is None:
            arrival_time = time.monotonic()
        if prompt_token_ids is None:
            assert prompt is not None
            prompt_token_ids = self.tokenizer.encode(prompt)

        # Create the sequences.
        block_size = self.cache_config.block_size
        seq_id = next(self.seq_counter)
        seq = Sequence(seq_id, prompt, prompt_token_ids, block_size)

        # Create the sequence group.
        seq_group = SequenceGroup(request_id, [seq], sampling_params,
                                  arrival_time)

        # Add the sequence group to the scheduler.
        self.scheduler.add_seq_group(seq_group)

    def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a request(s) with the given ID.

        Args:
            request_id: The ID(s) of the request to abort.
        """
        self.scheduler.abort_seq_group(request_id)

    def get_model_config(self) -> ModelConfig:
        """Gets the model configuration."""
        return self.model_config

    def get_num_unfinished_requests(self) -> int:
        """Gets the number of unfinished requests."""
        return self.scheduler.get_num_unfinished_seq_groups()

    def has_unfinished_requests(self) -> bool:
        """Returns True if there are unfinished requests."""
        return self.scheduler.has_unfinished_seqs()

    def _schedule(
        self
    ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs,
               List[RequestOutput]]:
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
        return seq_group_metadata_list, scheduler_outputs, [
            RequestOutput.from_seq_group(seq_group)
            for seq_group in scheduler_outputs.ignored_seq_groups
        ]

    def _check_beam_search_early_stopping(
        self,
        early_stopping: Union[bool, str],
        sampling_params: SamplingParams,
        best_running_seq: Sequence,
        current_worst_seq: Sequence,
    ) -> bool:
        assert sampling_params.use_beam_search
        length_penalty = sampling_params.length_penalty
        if early_stopping is True:
            return True

        current_worst_score = (current_worst_seq.get_beam_search_score(
            length_penalty=length_penalty,
            eos_token_id=self.tokenizer.eos_token_id))
        if early_stopping is False:
            highest_attainable_score = (best_running_seq.get_beam_search_score(
                length_penalty=length_penalty,
                eos_token_id=self.tokenizer.eos_token_id))
        else:
            assert early_stopping == "never"
            if length_penalty > 0.0:
                # If length_penalty > 0.0, beam search will prefer longer
                # sequences. The highest attainable score calculation is
                # based on the longest possible sequence length in this case.
                max_possible_length = max(
                    best_running_seq.get_prompt_len() +
                    sampling_params.max_tokens,
                    self.scheduler_config.max_model_len)
                highest_attainable_score = (
                    best_running_seq.get_beam_search_score(
                        length_penalty=length_penalty,
                        eos_token_id=self.tokenizer.eos_token_id,
                        seq_len=max_possible_length))
            else:
                # Otherwise, beam search will prefer shorter sequences. The
                # highest attainable score calculation is based on the current
                # sequence length.
                highest_attainable_score = (
                    best_running_seq.get_beam_search_score(
                        length_penalty=length_penalty,
                        eos_token_id=self.tokenizer.eos_token_id))
        return current_worst_score >= highest_attainable_score

    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
                                        outputs: SequenceGroupOutputs) -> None:
        # Process prompt logprobs
        prompt_logprobs = outputs.prompt_logprobs
        if prompt_logprobs is not None:
            seq_group.prompt_logprobs = prompt_logprobs

        # Process samples
        samples = outputs.samples
        parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
        existing_finished_seqs = seq_group.get_finished_seqs()
        parent_child_dict = {
            parent_seq.seq_id: []
            for parent_seq in parent_seqs
        }
        for sample in samples:
            parent_child_dict[sample.parent_seq_id].append(sample)
        # List of (child, parent)
        child_seqs: List[Tuple[Sequence, Sequence]] = []

        # Process the child samples for each parent sequence
        for parent in parent_seqs:
            child_samples: List[SequenceOutputs] = parent_child_dict[
                parent.seq_id]
            if len(child_samples) == 0:
                # This parent sequence has no children samples. Remove
                # the parent sequence from the sequence group since it will
                # not be used in the future iterations.
                parent.status = SequenceStatus.FINISHED_ABORTED
                seq_group.remove(parent.seq_id)
                self.scheduler.free_seq(parent)
                continue
            # Fork the parent sequence if there are multiple child samples.
            for child_sample in child_samples[:-1]:
                new_child_seq_id = next(self.seq_counter)
                child = parent.fork(new_child_seq_id)
                child.append_token_id(child_sample.output_token,
                                      child_sample.logprobs)
                child_seqs.append((child, parent))
            # Continue the parent sequence for the last child sample.
            # We reuse the parent sequence here to reduce redundant memory
            # copies, especially when using non-beam search sampling methods.
            last_child_sample = child_samples[-1]
            parent.append_token_id(last_child_sample.output_token,
                                   last_child_sample.logprobs)
            child_seqs.append((parent, parent))

        for seq, _ in child_seqs:
            self._decode_sequence(seq, seq_group.sampling_params)
            self._check_stop(seq, seq_group.sampling_params)

        # Non-beam search case
        if not seq_group.sampling_params.use_beam_search:
            # For newly created child sequences, add them to the sequence group
            # and fork them in block manager if they are not finished.
            for seq, parent in child_seqs:
                if seq is not parent:
                    seq_group.add(seq)
                    if not seq.is_finished():
                        self.scheduler.fork_seq(parent, seq)

            # Free the finished and selected parent sequences' memory in block
            # manager. Keep them in the sequence group as candidate output.
            # NOTE: we need to fork the new sequences before freeing the
            # old sequences.
            for seq, parent in child_seqs:
                if seq is parent and seq.is_finished():
                    self.scheduler.free_seq(seq)
            return

        # Beam search case
        # Select the child sequences to keep in the sequence group.
        selected_child_seqs = []
        unselected_child_seqs = []
        beam_width = seq_group.sampling_params.best_of
        length_penalty = seq_group.sampling_params.length_penalty

        # Select the newly finished sequences with the highest scores
        # to replace existing finished sequences.
        # Tuple of (seq, parent, is_new)
        existing_finished_seqs = [(seq, None, False)
                                  for seq in existing_finished_seqs]
        new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
                             if seq.is_finished()]
        all_finished_seqs = existing_finished_seqs + new_finished_seqs
        # Sort the finished sequences by their scores.
        all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
            length_penalty=length_penalty,
            eos_token_id=self.tokenizer.eos_token_id),
                               reverse=True)
        for seq, parent, is_new in all_finished_seqs[:beam_width]:
            if is_new:
                # A newly generated child sequence finishes and has a high
                # score, so we will add it into the sequence group.
                selected_child_seqs.append((seq, parent))
        for seq, parent, is_new in all_finished_seqs[beam_width:]:
            if is_new:
                # A newly generated child sequence finishes but has a low
                # score, so we will not add it into the sequence group.
                # Additionally, if this sequence is a continuation of a
                # parent sequence, we will need to remove the parent sequence
                # from the sequence group.
                unselected_child_seqs.append((seq, parent))
            else:
                # An existing finished sequence has a low score, so we will
                # remove it from the sequence group.
                seq_group.remove(seq.seq_id)

        # select the top beam_width sequences from the running
        # sequences for the next iteration to continue the beam
        # search.
        running_child_seqs = [(seq, parent) for seq, parent in child_seqs
                              if not seq.is_finished()]
        # Sort the running sequences by their scores.
        running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
            length_penalty=length_penalty,
            eos_token_id=self.tokenizer.eos_token_id),
                                reverse=True)

        # Check if we can stop the beam search.
        if len(running_child_seqs) == 0:
            # No running sequences, stop the beam search.
            stop_beam_search = True
        elif len(all_finished_seqs) < beam_width:
            # Not enough finished sequences, continue the beam search.
            stop_beam_search = False
        else:
            # Check the early stopping criteria
            best_running_seq = running_child_seqs[0][0]
            current_worst_seq = all_finished_seqs[beam_width - 1][0]
            stop_beam_search = self._check_beam_search_early_stopping(
                seq_group.sampling_params.early_stopping,
                seq_group.sampling_params, best_running_seq, current_worst_seq)

        if stop_beam_search:
            # Stop the beam search and remove all the running sequences from
            # the sequence group.
            unselected_child_seqs.extend(running_child_seqs)
        else:
            # Continue the beam search and select the top beam_width sequences
            # to continue the beam search.
            selected_child_seqs.extend(running_child_seqs[:beam_width])
            # The remaining running sequences will not be used in the next
            # iteration. Again, if these sequences are continuations of
            # parent sequences, we will need to remove the parent sequences
            # from the sequence group.
            unselected_child_seqs.extend(running_child_seqs[beam_width:])

        # For newly created child sequences, add them to the sequence group
        # and fork them in block manager if they are not finished.
        for seq, parent in selected_child_seqs:
            if seq is not parent:
                seq_group.add(seq)
                if not seq.is_finished():
                    self.scheduler.fork_seq(parent, seq)

        # Free the finished and selected parent sequences' memory in block
        # manager. Keep them in the sequence group as candidate output.
        for seq, parent in selected_child_seqs:
            if seq is parent and seq.is_finished():
                self.scheduler.free_seq(seq)

        # Remove the unselected parent sequences from the sequence group and
        # free their memory in block manager.
        for seq, parent in unselected_child_seqs:
            if seq is parent:
                # Remove the parent sequence if it is not selected for next
                # iteration
                seq_group.remove(seq.seq_id)
                self.scheduler.free_seq(seq)

    def _process_model_outputs(
            self, output: SamplerOutput,
            scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
        # Update the scheduled sequence groups with the model outputs.
        scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
        for seq_group, outputs in zip(scheduled_seq_groups, output):
            self._process_sequence_group_outputs(seq_group, outputs)

        # Free the finished sequence groups.
        self.scheduler.free_finished_seq_groups()

        # Create the outputs.
        request_outputs: List[RequestOutput] = []
        for seq_group in (scheduled_seq_groups +
                          scheduler_outputs.ignored_seq_groups):
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)

        if self.log_stats:
            # Log the system stats.
            self._log_system_stats(scheduler_outputs.prompt_run,
                                   scheduler_outputs.num_batched_tokens)
        return request_outputs

    def step(self) -> List[RequestOutput]:
        """Performs one decoding iteration and returns newly generated results.

        This function performs one decoding iteration of the engine. It first
        schedules the sequences to be executed in the next iteration and the
        token blocks to be swapped in/out/copied. Then, it executes the model
        and updates the scheduler with the model outputs. Finally, it decodes
        the sequences and returns the newly generated results.
        """
        seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
        if scheduler_outputs.is_empty():
            return ignored

        # Execute the model.
        output = self._run_workers(
            "execute_model",
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
            blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
            blocks_to_copy=scheduler_outputs.blocks_to_copy,
        )

        return self._process_model_outputs(output, scheduler_outputs) + ignored

    def _log_system_stats(
        self,
        prompt_run: bool,
        num_batched_tokens: int,
    ) -> None:
        now = time.monotonic()
        # Log the number of batched input tokens.
        if prompt_run:
            self.num_prompt_tokens.append((now, num_batched_tokens))
        else:
            self.num_generation_tokens.append((now, num_batched_tokens))

        elapsed_time = now - self.last_logging_time
        if elapsed_time < _LOGGING_INTERVAL_SEC:
            return

        # Discard the old stats.
        self.num_prompt_tokens = [(t, n) for t, n in self.num_prompt_tokens
                                  if now - t < _LOGGING_INTERVAL_SEC]
        self.num_generation_tokens = [(t, n)
                                      for t, n in self.num_generation_tokens
                                      if now - t < _LOGGING_INTERVAL_SEC]

        if len(self.num_prompt_tokens) > 1:
            total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1])
            window = now - self.num_prompt_tokens[0][0]
            avg_prompt_throughput = total_num_tokens / window
        else:
            avg_prompt_throughput = 0.0
        if len(self.num_generation_tokens) > 1:
            total_num_tokens = sum(n
                                   for _, n in self.num_generation_tokens[:-1])
            window = now - self.num_generation_tokens[0][0]
            avg_generation_throughput = total_num_tokens / window
        else:
            avg_generation_throughput = 0.0

        total_num_gpu_blocks = self.cache_config.num_gpu_blocks
        num_free_gpu_blocks = (
            self.scheduler.block_manager.get_num_free_gpu_blocks())
        num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks
        gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks

        total_num_cpu_blocks = self.cache_config.num_cpu_blocks
        if total_num_cpu_blocks > 0:
            num_free_cpu_blocks = (
                self.scheduler.block_manager.get_num_free_cpu_blocks())
            num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
            cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
        else:
            cpu_cache_usage = 0.0

        logger.info("Avg prompt throughput: "
                    f"{avg_prompt_throughput:.1f} tokens/s, "
                    "Avg generation throughput: "
                    f"{avg_generation_throughput:.1f} tokens/s, "
                    f"Running: {len(self.scheduler.running)} reqs, "
                    f"Swapped: {len(self.scheduler.swapped)} reqs, "
                    f"Pending: {len(self.scheduler.waiting)} reqs, "
                    f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
                    f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
        self.last_logging_time = now

    def _decode_sequence(self, seq: Sequence,
                         sampling_params: SamplingParams) -> None:
        """Decodes the new token for a sequence."""
        (new_tokens, new_output_text, prefix_offset,
         read_offset) = detokenize_incrementally(
             self.tokenizer,
             all_input_ids=seq.get_token_ids(),
             prev_tokens=seq.tokens,
             prefix_offset=seq.prefix_offset,
             read_offset=seq.read_offset,
             skip_special_tokens=sampling_params.skip_special_tokens,
         )
        if seq.tokens is None:
            seq.tokens = new_tokens
        else:
            seq.tokens.extend(new_tokens)
        seq.prefix_offset = prefix_offset
        seq.read_offset = read_offset
        seq.output_text += new_output_text

    def _check_stop(self, seq: Sequence,
                    sampling_params: SamplingParams) -> None:
        """Stop the finished sequences."""
        for stop_str in sampling_params.stop:
            if seq.output_text.endswith(stop_str):
                # Truncate the output text so that the stop string is
                # not included in the output.
                seq.output_text = seq.output_text[:-len(stop_str)]
                seq.status = SequenceStatus.FINISHED_STOPPED
                return
        if seq.get_last_token_id() in sampling_params.stop_token_ids:
            seq.status = SequenceStatus.FINISHED_STOPPED
            return

        # Check if the sequence has reached max_model_len.
        if seq.get_len() > self.scheduler_config.max_model_len:
            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
            return

        # Check if the sequence has reached max_tokens.
        if seq.get_output_len() == sampling_params.max_tokens:
            seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
            return

        # Check if the sequence has generated the EOS token.
        if ((not sampling_params.ignore_eos)
                and seq.get_last_token_id() == self.tokenizer.eos_token_id):
            seq.status = SequenceStatus.FINISHED_STOPPED
            return

    def _run_workers(
        self,
        method: str,
        *args,
        get_all_outputs: bool = False,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
        all_outputs = []
        for worker in self.workers:
            if self.parallel_config.worker_use_ray:
                executor = partial(worker.execute_method.remote, method)
            else:
                executor = getattr(worker, method)

            output = executor(*args, **kwargs)
            all_outputs.append(output)

        if self.parallel_config.worker_use_ray:
            all_outputs = ray.get(all_outputs)

        if get_all_outputs:
            return all_outputs

        # Make sure all workers have the same results.
        output = all_outputs[0]
        for other_output in all_outputs[1:]:
            assert output == other_output
        return output
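A minimal sketch of the add_request()/step() loop that the docstrings above describe, using the same EngineArgs interface; illustrative only (not part of the deleted file) and assumes the model fits on a single GPU:

from vllm import EngineArgs, LLMEngine, SamplingParams

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("0", "The capital of France is", SamplingParams(max_tokens=16))

finished = []
while engine.has_unfinished_requests():
    # Each step() schedules one iteration, runs the model, and returns RequestOutputs.
    finished.extend(out for out in engine.step() if out.finished)
print(finished[0].outputs[0].text)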
server/vllm/vllm/engine/ray_utils.py deleted 100644 → 0
import socket
from typing import Optional, Tuple, TYPE_CHECKING

from vllm.config import ParallelConfig
from vllm.logger import init_logger

logger = init_logger(__name__)

try:
    import ray
    from ray.air.util.torch_dist import TorchDistributedWorker

    class RayWorker(TorchDistributedWorker):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, init_cached_hf_modules=False) -> None:
            if init_cached_hf_modules:
                # pylint: disable=import-outside-toplevel
                from transformers.dynamic_module_utils import init_hf_modules
                init_hf_modules()
            self.worker = None

        def init_worker(self, worker_init_fn):
            self.worker = worker_init_fn()

        def __getattr__(self, name):
            return getattr(self.worker, name)

        def execute_method(self, method, *args, **kwargs):
            executor = getattr(self, method)
            return executor(*args, **kwargs)

except ImportError as e:
    logger.warning(f"Failed to import Ray with {e!r}. "
                   "For distributed inference, please install Ray with "
                   "`pip install ray pandas pyarrow`.")
    ray = None
    TorchDistributedWorker = None
    RayWorker = None  # pylint: disable=invalid-name

if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup


def get_open_port():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
) -> Tuple[str, Optional["PlacementGroup"]]:
    """Initialize the distributed cluster, probably with Ray.

    Args:
        parallel_config: The configurations for parallel execution.
        engine_use_ray: Whether to use Ray for async engine.
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.

    Returns:
        A tuple of (`distributed_init_method`, `placement_group`). The
        `distributed_init_method` is the address for initializing the
        distributed backend. `placement_group` includes the specification
        of the resources for each distributed worker.
    """
    if parallel_config.worker_use_ray or engine_use_ray:
        if ray is None:
            raise ImportError(
                "Ray is not installed. Please install Ray to use distributed "
                "serving.")
        # Connect to a ray cluster.
        ray.init(address=ray_address, ignore_reinit_error=True)

    if not parallel_config.worker_use_ray:
        # Initialize cluster locally.
        port = get_open_port()
        # We need to setup the distributed init method to make sure
        # the distributed megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
        return distributed_init_method, None

    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:
        # We are in a placement group
        bundles = current_placement_group.bundle_specs
        # Verify that we can use the placement group.
        gpu_bundles = 0
        for bundle in bundles:
            bundle_gpus = bundle.get("GPU", 0)
            if bundle_gpus > 1:
                raise ValueError(
                    "Placement group bundle cannot have more than 1 GPU.")
            if bundle_gpus:
                gpu_bundles += 1
        if parallel_config.world_size > gpu_bundles:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the placement group.")
    else:
        num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
        if parallel_config.world_size > num_gpus_in_cluster:
            raise ValueError(
                "The number of required GPUs exceeds the total number of "
                "available GPUs in the cluster.")
        # Create a new placement group
        current_placement_group = ray.util.placement_group(
            [{"GPU": 1}] * parallel_config.world_size)
        # Wait until PG is ready - this will block until all
        # requested resources are available, and will timeout
        # if they cannot be provisioned.
        ray.get(current_placement_group.ready(), timeout=1800)

    return None, current_placement_group
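For a single-GPU, non-Ray configuration, the helper above simply returns a local TCP init method and no placement group. A small illustration (not part of the deleted file), reusing create_engine_configs() exactly as LLMEngine.from_engine_args does; it assumes the model config can be fetched:

from vllm.engine.arg_utils import EngineArgs
from vllm.engine.ray_utils import initialize_cluster

# Index 2 of the engine configs tuple is the ParallelConfig.
parallel_config = EngineArgs(model="facebook/opt-125m").create_engine_configs()[2]
distributed_init_method, placement_group = initialize_cluster(parallel_config)
print(distributed_init_method)  # e.g. "tcp://localhost:<port>" when Ray is not used
print(placement_group)          # None when worker_use_ray is False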
server/vllm/vllm/entrypoints/__init__.py deleted 100644 → 0
server/vllm/vllm/entrypoints/api_server.py deleted 100644 → 0
import argparse
import json
from typing import AsyncGenerator

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
import uvicorn

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid

TIMEOUT_KEEP_ALIVE = 5  # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
app = FastAPI()
engine = None


@app.post("/generate")
async def generate(request: Request) -> Response:
    """Generate completion for the request.

    The request should be a JSON object with the following fields:
    - prompt: the prompt to use for the generation.
    - stream: whether to stream the results or not.
    - other fields: the sampling parameters (See `SamplingParams` for details).
    """
    request_dict = await request.json()
    prompt = request_dict.pop("prompt")
    stream = request_dict.pop("stream", False)
    sampling_params = SamplingParams(**request_dict)
    request_id = random_uuid()

    results_generator = engine.generate(prompt, sampling_params, request_id)

    # Streaming case
    async def stream_results() -> AsyncGenerator[bytes, None]:
        async for request_output in results_generator:
            prompt = request_output.prompt
            text_outputs = [
                prompt + output.text for output in request_output.outputs
            ]
            ret = {"text": text_outputs}
            yield (json.dumps(ret) + "\0").encode("utf-8")

    if stream:
        return StreamingResponse(stream_results())

    # Non-streaming case
    final_output = None
    async for request_output in results_generator:
        if await request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            return Response(status_code=499)
        final_output = request_output

    assert final_output is not None
    prompt = final_output.prompt
    text_outputs = [prompt + output.text for output in final_output.outputs]
    ret = {"text": text_outputs}
    return JSONResponse(ret)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default=None)
    parser.add_argument("--port", type=int, default=8000)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level="debug",
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
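Extra JSON fields in the request body are forwarded to SamplingParams, so the demo endpoint above can be exercised as follows; this is an illustration (not part of the deleted file) that assumes the server was started via this module, e.g. python -m vllm.entrypoints.api_server --model <model>, on the default port:

import requests

payload = {"prompt": "San Francisco is a", "max_tokens": 16, "temperature": 0.0}
resp = requests.post("http://localhost:8000/generate", json=payload)
# The non-streaming response body is {"text": [prompt + completion, ...]}.
print(resp.json()["text"])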
server/vllm/vllm/entrypoints/llm.py deleted 100644 → 0
from typing import List, Optional, Union

from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.utils import Counter


class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    NOTE: This class is intended to be used for offline inference. For online
    serving, use the `AsyncLLMEngine` class instead.

    NOTE: For the comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: The name or path of a HuggingFace Transformers model.
        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq". If None, we assume the model weights are not
            quantized and use `dtype` to determine the data type of the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Otherwise, too small values may cause out-of-memory (OOM) errors.
    """

    def __init__(
        self,
        model: str,
        tokenizer: Optional[str] = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        **kwargs,
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model=model,
            tokenizer=tokenizer,
            tokenizer_mode=tokenizer_mode,
            trust_remote_code=trust_remote_code,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            **kwargs,
        )
        self.llm_engine = LLMEngine.from_engine_args(engine_args)
        self.request_counter = Counter()

    def get_tokenizer(
            self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        self.llm_engine.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[SamplingParams] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: A list of prompts to generate completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
            use_tqdm: Whether to use tqdm to display the progress bar.

        Returns:
            A list of `RequestOutput` objects containing the generated
            completions in the same order as the input prompts.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be "
                             "provided.")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if prompts is not None and prompt_token_ids is not None:
            if len(prompts) != len(prompt_token_ids):
                raise ValueError("The lengths of prompts and prompt_token_ids "
                                 "must be the same.")
        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        # Add requests to the engine.
        if prompts is not None:
            num_requests = len(prompts)
        else:
            num_requests = len(prompt_token_ids)
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            if prompt_token_ids is None:
                token_ids = None
            else:
                token_ids = prompt_token_ids[i]
            self._add_request(prompt, sampling_params, token_ids)
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id, prompt, sampling_params,
                                    prompt_token_ids)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(total=num_requests, desc="Processed prompts")
        # Run the engine.
        outputs: List[RequestOutput] = []
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # requests that were submitted before them.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        return outputs
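A minimal offline-batching sketch of the LLM class above; illustrative only (not part of the deleted file), assuming any HuggingFace model id that fits in GPU memory:

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)
# Passing all prompts in one call lets the engine batch them across scheduler steps.
outputs = llm.generate(["Hello, my name is", "The future of AI is"], params)
for out in outputs:
    print(out.prompt, "->", out.outputs[0].text)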
server/vllm/vllm/entrypoints/openai/__init__.py deleted 100644 → 0
server/vllm/vllm/entrypoints/openai/api_server.py deleted 100644 → 0
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py
import
argparse
import
asyncio
import
json
import
time
from
http
import
HTTPStatus
from
typing
import
AsyncGenerator
,
Dict
,
List
,
Optional
,
Tuple
,
Union
import
fastapi
import
uvicorn
from
fastapi
import
Request
from
fastapi.exceptions
import
RequestValidationError
from
fastapi.middleware.cors
import
CORSMiddleware
from
fastapi.responses
import
JSONResponse
,
StreamingResponse
from
packaging
import
version
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.async_llm_engine
import
AsyncLLMEngine
from
vllm.entrypoints.openai.protocol
import
(
CompletionRequest
,
CompletionResponse
,
CompletionResponseChoice
,
CompletionResponseStreamChoice
,
CompletionStreamResponse
,
ChatCompletionRequest
,
ChatCompletionResponse
,
ChatCompletionResponseChoice
,
ChatCompletionResponseStreamChoice
,
ChatCompletionStreamResponse
,
ChatMessage
,
DeltaMessage
,
ErrorResponse
,
LogProbs
,
ModelCard
,
ModelList
,
ModelPermission
,
UsageInfo
)
from
vllm.logger
import
init_logger
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.tokenizer
import
get_tokenizer
from
vllm.utils
import
random_uuid
try
:
import
fastchat
from
fastchat.conversation
import
Conversation
,
SeparatorStyle
from
fastchat.model.model_adapter
import
get_conversation_template
_fastchat_available
=
True
except
ImportError
:
_fastchat_available
=
False
TIMEOUT_KEEP_ALIVE
=
5
# seconds
logger
=
init_logger
(
__name__
)
served_model
=
None
app
=
fastapi
.
FastAPI
()
engine
=
None
def
create_error_response
(
status_code
:
HTTPStatus
,
message
:
str
)
->
JSONResponse
:
return
JSONResponse
(
ErrorResponse
(
message
=
message
,
type
=
"invalid_request_error"
).
dict
(),
status_code
=
status_code
.
value
)
@
app
.
exception_handler
(
RequestValidationError
)
async
def
validation_exception_handler
(
request
,
exc
):
# pylint: disable=unused-argument
return
create_error_response
(
HTTPStatus
.
BAD_REQUEST
,
str
(
exc
))
async
def
check_model
(
request
)
->
Optional
[
JSONResponse
]:
if
request
.
model
==
served_model
:
return
ret
=
create_error_response
(
HTTPStatus
.
NOT_FOUND
,
f
"The model `
{
request
.
model
}
` does not exist."
,
)
return
ret
async
def
get_gen_prompt
(
request
)
->
str
:
if
not
_fastchat_available
:
raise
ModuleNotFoundError
(
"fastchat is not installed. Please install fastchat to use "
"the chat completion and conversation APIs: `$ pip install fschat`"
)
if
version
.
parse
(
fastchat
.
__version__
)
<
version
.
parse
(
"0.2.23"
):
raise
ImportError
(
f
"fastchat version is low. Current version:
{
fastchat
.
__version__
}
"
"Please upgrade fastchat to use: `$ pip install -U fschat`"
)
conv
=
get_conversation_template
(
request
.
model
)
conv
=
Conversation
(
name
=
conv
.
name
,
system_template
=
conv
.
system_template
,
system_message
=
conv
.
system_message
,
roles
=
conv
.
roles
,
messages
=
list
(
conv
.
messages
),
# prevent in-place modification
offset
=
conv
.
offset
,
sep_style
=
SeparatorStyle
(
conv
.
sep_style
),
sep
=
conv
.
sep
,
sep2
=
conv
.
sep2
,
stop_str
=
conv
.
stop_str
,
stop_token_ids
=
conv
.
stop_token_ids
,
)
if
isinstance
(
request
.
messages
,
str
):
prompt
=
request
.
messages
else
:
for
message
in
request
.
messages
:
msg_role
=
message
[
"role"
]
if
msg_role
==
"system"
:
conv
.
system_message
=
message
[
"content"
]
elif
msg_role
==
"user"
:
conv
.
append_message
(
conv
.
roles
[
0
],
message
[
"content"
])
elif
msg_role
==
"assistant"
:
conv
.
append_message
(
conv
.
roles
[
1
],
message
[
"content"
])
else
:
raise
ValueError
(
f
"Unknown role:
{
msg_role
}
"
)
# Add a blank message for the assistant.
conv
.
append_message
(
conv
.
roles
[
1
],
None
)
prompt
=
conv
.
get_prompt
()
return
prompt
async
def
check_length
(
request
:
Union
[
ChatCompletionRequest
,
CompletionRequest
],
prompt
:
Optional
[
str
]
=
None
,
prompt_ids
:
Optional
[
List
[
int
]]
=
None
)
->
Tuple
[
List
[
int
],
Optional
[
JSONResponse
]]:
assert
(
not
(
prompt
is
None
and
prompt_ids
is
None
)
and
not
(
prompt
is
not
None
and
prompt_ids
is
not
None
)
),
"Either prompt or prompt_ids should be provided."
if
prompt_ids
is
not
None
:
input_ids
=
prompt_ids
else
:
input_ids
=
tokenizer
(
prompt
).
input_ids
token_num
=
len
(
input_ids
)
if
request
.
max_tokens
is
None
:
request
.
max_tokens
=
max_model_len
-
token_num
if
token_num
+
request
.
max_tokens
>
max_model_len
:
return
input_ids
,
create_error_response
(
HTTPStatus
.
BAD_REQUEST
,
f
"This model's maximum context length is
{
max_model_len
}
tokens. "
f
"However, you requested
{
request
.
max_tokens
+
token_num
}
tokens "
f
"(
{
token_num
}
in the messages, "
f
"
{
request
.
max_tokens
}
in the completion). "
f
"Please reduce the length of the messages or completion."
,
)
else
:
return
input_ids
,
None
@
app
.
get
(
"/v1/models"
)
async
def
show_available_models
():
"""Show available models. Right now we only have one model."""
model_cards
=
[
ModelCard
(
id
=
served_model
,
root
=
served_model
,
permission
=
[
ModelPermission
()])
]
return
ModelList
(
data
=
model_cards
)
def
create_logprobs
(
token_ids
:
List
[
int
],
id_logprobs
:
List
[
Dict
[
int
,
float
]],
initial_text_offset
:
int
=
0
)
->
LogProbs
:
"""Create OpenAI-style logprobs."""
logprobs
=
LogProbs
()
last_token_len
=
0
for
token_id
,
id_logprob
in
zip
(
token_ids
,
id_logprobs
):
token
=
tokenizer
.
convert_ids_to_tokens
(
token_id
)
logprobs
.
tokens
.
append
(
token
)
logprobs
.
token_logprobs
.
append
(
id_logprob
[
token_id
])
if
len
(
logprobs
.
text_offset
)
==
0
:
logprobs
.
text_offset
.
append
(
initial_text_offset
)
else
:
logprobs
.
text_offset
.
append
(
logprobs
.
text_offset
[
-
1
]
+
last_token_len
)
last_token_len
=
len
(
token
)
logprobs
.
top_logprobs
.
append
({
tokenizer
.
convert_ids_to_tokens
(
i
):
p
for
i
,
p
in
id_logprob
.
items
()
})
return
logprobs
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
                                 raw_request: Request):
    """Completion API similar to OpenAI's API.

    See https://platform.openai.com/docs/api-reference/chat/create
    for the API specification. This API mimics the OpenAI ChatCompletion API.

    NOTE: Currently we do not support the following features:
        - function_call (Users should implement this by themselves)
        - logit_bias (to be supported by vLLM engine)
    """
    logger.info(f"Received chat completion request: {request}")

    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    if request.logit_bias is not None and len(request.logit_bias) > 0:
        # TODO: support logit_bias in vLLM engine.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "logit_bias is not currently supported")

    prompt = await get_gen_prompt(request)
    token_ids, error_check_ret = await check_length(request, prompt=prompt)
    if error_check_ret is not None:
        return error_check_ret

    model_name = request.model
    request_id = f"cmpl-{random_uuid()}"
    created_time = int(time.monotonic())
    try:
        sampling_params = SamplingParams(
            n=request.n,
            presence_penalty=request.presence_penalty,
            frequency_penalty=request.frequency_penalty,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=request.stop,
            stop_token_ids=request.stop_token_ids,
            max_tokens=request.max_tokens,
            best_of=request.best_of,
            top_k=request.top_k,
            ignore_eos=request.ignore_eos,
            use_beam_search=request.use_beam_search,
            skip_special_tokens=request.skip_special_tokens,
        )
    except ValueError as e:
        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

    result_generator = engine.generate(prompt, sampling_params, request_id,
                                       token_ids)

    def create_stream_response_json(
        index: int,
        text: str,
        finish_reason: Optional[str] = None,
    ) -> str:
        choice_data = ChatCompletionResponseStreamChoice(
            index=index,
            delta=DeltaMessage(content=text),
            finish_reason=finish_reason,
        )
        response = ChatCompletionStreamResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=[choice_data],
        )
        response_json = response.json(ensure_ascii=False)

        return response_json

    async def completion_stream_generator() -> AsyncGenerator[str, None]:
        # First chunk with role
        for i in range(request.n):
            choice_data = ChatCompletionResponseStreamChoice(
                index=i,
                delta=DeltaMessage(role="assistant"),
                finish_reason=None,
            )
            chunk = ChatCompletionStreamResponse(id=request_id,
                                                 choices=[choice_data],
                                                 model=model_name)
            data = chunk.json(exclude_unset=True, ensure_ascii=False)
            yield f"data: {data}\n\n"

        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
                delta_text = output.text[len(previous_texts[i]):]
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
                response_json = create_stream_response_json(
                    index=i,
                    text=delta_text,
                )
                yield f"data: {response_json}\n\n"
                if output.finish_reason is not None:
                    response_json = create_stream_response_json(
                        index=i,
                        text="",
                        finish_reason=output.finish_reason,
                    )
                    yield f"data: {response_json}\n\n"
        yield "data: [DONE]\n\n"

    # Streaming response
    if request.stream:
        return StreamingResponse(completion_stream_generator(),
                                 media_type="text/event-stream")

    # Non-streaming response
    final_res: RequestOutput = None
    async for res in result_generator:
        if await raw_request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            return create_error_response(HTTPStatus.BAD_REQUEST,
                                         "Client disconnected")
        final_res = res
    assert final_res is not None
    choices = []
    for output in final_res.outputs:
        choice_data = ChatCompletionResponseChoice(
            index=output.index,
            message=ChatMessage(role="assistant", content=output.text),
            finish_reason=output.finish_reason,
        )
        choices.append(choice_data)

    num_prompt_tokens = len(final_res.prompt_token_ids)
    num_generated_tokens = sum(
        len(output.token_ids) for output in final_res.outputs)
    usage = UsageInfo(
        prompt_tokens=num_prompt_tokens,
        completion_tokens=num_generated_tokens,
        total_tokens=num_prompt_tokens + num_generated_tokens,
    )
    response = ChatCompletionResponse(
        id=request_id,
        created=created_time,
        model=model_name,
        choices=choices,
        usage=usage,
    )

    if request.stream:
        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        response_json = response.json(ensure_ascii=False)

        async def fake_stream_generator() -> AsyncGenerator[str, None]:
            yield f"data: {response_json}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(fake_stream_generator(),
                                 media_type="text/event-stream")

    return response
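

# Client-side sketch (not part of this module): a non-streaming request against
# the endpoint above, assuming the server runs on http://localhost:8000 and the
# served model is named "my-model".
def _example_chat_completion() -> str:
    import requests  # illustrative third-party dependency
    payload = {
        "model": "my-model",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 32,
    }
    resp = requests.post("http://localhost:8000/v1/chat/completions",
                         json=payload)
    return resp.json()["choices"][0]["message"]["content"]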
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
    """Completion API similar to OpenAI's API.

    See https://platform.openai.com/docs/api-reference/completions/create
    for the API specification. This API mimics the OpenAI Completion API.

    NOTE: Currently we do not support the following features:
        - echo (since the vLLM engine does not currently support
          getting the logprobs of prompt tokens)
        - suffix (the language models we currently support do not support
          suffix)
        - logit_bias (to be supported by vLLM engine)
    """
    logger.info(f"Received completion request: {request}")

    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    if request.echo:
        # We do not support echo since the vLLM engine does not
        # currently support getting the logprobs of prompt tokens.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "echo is not currently supported")

    if request.suffix is not None:
        # The language models we currently support do not support suffix.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "suffix is not currently supported")

    if request.logit_bias is not None and len(request.logit_bias) > 0:
        # TODO: support logit_bias in vLLM engine.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "logit_bias is not currently supported")

    model_name = request.model
    request_id = f"cmpl-{random_uuid()}"

    use_token_ids = False
    if isinstance(request.prompt, list):
        if len(request.prompt) == 0:
            return create_error_response(HTTPStatus.BAD_REQUEST,
                                         "please provide at least one prompt")
        first_element = request.prompt[0]
        if isinstance(first_element, int):
            use_token_ids = True
            prompt = request.prompt
        elif isinstance(first_element, (str, list)):
            # TODO: handles multiple prompt case in list[list[int]]
            if len(request.prompt) > 1:
                return create_error_response(
                    HTTPStatus.BAD_REQUEST,
                    "multiple prompts in a batch is not currently supported")
            use_token_ids = not isinstance(first_element, str)
            prompt = request.prompt[0]
    else:
        prompt = request.prompt

    if use_token_ids:
        _, error_check_ret = await check_length(request, prompt_ids=prompt)
    else:
        token_ids, error_check_ret = await check_length(request, prompt=prompt)
    if error_check_ret is not None:
        return error_check_ret

    created_time = int(time.monotonic())
    try:
        sampling_params = SamplingParams(
            n=request.n,
            best_of=request.best_of,
            presence_penalty=request.presence_penalty,
            frequency_penalty=request.frequency_penalty,
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            stop=request.stop,
            stop_token_ids=request.stop_token_ids,
            ignore_eos=request.ignore_eos,
            max_tokens=request.max_tokens,
            logprobs=request.logprobs,
            use_beam_search=request.use_beam_search,
            skip_special_tokens=request.skip_special_tokens,
        )
    except ValueError as e:
        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

    if use_token_ids:
        result_generator = engine.generate(None,
                                           sampling_params,
                                           request_id,
                                           prompt_token_ids=prompt)
    else:
        result_generator = engine.generate(prompt, sampling_params, request_id,
                                           token_ids)

    # Similar to the OpenAI API, when n != best_of, we do not stream the
    # results. In addition, we do not stream the results when use beam search.
    stream = (request.stream
              and (request.best_of is None or request.n == request.best_of)
              and not request.use_beam_search)

    def create_stream_response_json(
        index: int,
        text: str,
        logprobs: Optional[LogProbs] = None,
        finish_reason: Optional[str] = None,
    ) -> str:
        choice_data = CompletionResponseStreamChoice(
            index=index,
            text=text,
            logprobs=logprobs,
            finish_reason=finish_reason,
        )
        response = CompletionStreamResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=[choice_data],
        )
        response_json = response.json(ensure_ascii=False)

        return response_json

    async def completion_stream_generator() -> AsyncGenerator[str, None]:
        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
                delta_text = output.text[len(previous_texts[i]):]
                if request.logprobs is not None:
                    logprobs = create_logprobs(
                        output.token_ids[previous_num_tokens[i]:],
                        output.logprobs[previous_num_tokens[i]:],
                        len(previous_texts[i]))
                else:
                    logprobs = None
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
                response_json = create_stream_response_json(
                    index=i,
                    text=delta_text,
                    logprobs=logprobs,
                )
                yield f"data: {response_json}\n\n"
                if output.finish_reason is not None:
                    logprobs = (LogProbs()
                                if request.logprobs is not None else None)
                    response_json = create_stream_response_json(
                        index=i,
                        text="",
                        logprobs=logprobs,
                        finish_reason=output.finish_reason,
                    )
                    yield f"data: {response_json}\n\n"
        yield "data: [DONE]\n\n"

    # Streaming response
    if stream:
        return StreamingResponse(completion_stream_generator(),
                                 media_type="text/event-stream")

    # Non-streaming response
    final_res: RequestOutput = None
    async for res in result_generator:
        if await raw_request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            return create_error_response(HTTPStatus.BAD_REQUEST,
                                         "Client disconnected")
        final_res = res
    assert final_res is not None
    choices = []
    for output in final_res.outputs:
        if request.logprobs is not None:
            logprobs = create_logprobs(output.token_ids, output.logprobs)
        else:
            logprobs = None
        choice_data = CompletionResponseChoice(
            index=output.index,
            text=output.text,
            logprobs=logprobs,
            finish_reason=output.finish_reason,
        )
        choices.append(choice_data)

    num_prompt_tokens = len(final_res.prompt_token_ids)
    num_generated_tokens = sum(
        len(output.token_ids) for output in final_res.outputs)
    usage = UsageInfo(
        prompt_tokens=num_prompt_tokens,
        completion_tokens=num_generated_tokens,
        total_tokens=num_prompt_tokens + num_generated_tokens,
    )
    response = CompletionResponse(
        id=request_id,
        created=created_time,
        model=model_name,
        choices=choices,
        usage=usage,
    )

    if request.stream:
        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        response_json = response.json(ensure_ascii=False)

        async def fake_stream_generator() -> AsyncGenerator[str, None]:
            yield f"data: {response_json}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(fake_stream_generator(),
                                 media_type="text/event-stream")

    return response
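

# Client-side sketch (not part of this module): consuming the "data: ..."
# server-sent-event stream produced above, assuming the server runs on
# http://localhost:8000 and the served model is named "my-model".
def _example_stream_completion() -> str:
    import json as _json
    import requests  # illustrative third-party dependency
    payload = {
        "model": "my-model",
        "prompt": "Once upon a time",
        "max_tokens": 32,
        "stream": True,
    }
    text = ""
    with requests.post("http://localhost:8000/v1/completions",
                       json=payload, stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue
            chunk = line[len("data: "):]
            if chunk == "[DONE]":
                break
            text += _json.loads(chunk)["choices"][0]["text"]
    return text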
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server.")
    parser.add_argument("--host", type=str, default=None, help="host name")
    parser.add_argument("--port", type=int, default=8000, help="port number")
    parser.add_argument("--allow-credentials",
                        action="store_true",
                        help="allow credentials")
    parser.add_argument("--allowed-origins",
                        type=json.loads,
                        default=["*"],
                        help="allowed origins")
    parser.add_argument("--allowed-methods",
                        type=json.loads,
                        default=["*"],
                        help="allowed methods")
    parser.add_argument("--allowed-headers",
                        type=json.loads,
                        default=["*"],
                        help="allowed headers")
    parser.add_argument("--served-model-name",
                        type=str,
                        default=None,
                        help="The model name used in the API. If not "
                        "specified, the model name will be the same as "
                        "the huggingface name.")
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    logger.info(f"args: {args}")

    if args.served_model_name is not None:
        served_model = args.served_model_name
    else:
        served_model = args.model

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    engine_model_config = asyncio.run(engine.get_model_config())
    max_model_len = engine_model_config.max_model_len

    # A separate tokenizer to map token IDs to strings.
    tokenizer = get_tokenizer(engine_args.tokenizer,
                              tokenizer_mode=engine_args.tokenizer_mode,
                              trust_remote_code=engine_args.trust_remote_code)

    uvicorn.run(app,
                host=args.host,
                port=args.port,
                log_level="info",
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
server/vllm/vllm/entrypoints/openai/protocol.py deleted 100644 → 0
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
import time
from typing import Dict, List, Literal, Optional, Union

from pydantic import BaseModel, Field

from vllm.utils import random_uuid


class ErrorResponse(BaseModel):
    object: str = "error"
    message: str
    type: str
    param: Optional[str] = None
    code: Optional[str] = None


class ModelPermission(BaseModel):
    id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
    object: str = "model_permission"
    created: int = Field(default_factory=lambda: int(time.time()))
    allow_create_engine: bool = False
    allow_sampling: bool = True
    allow_logprobs: bool = True
    allow_search_indices: bool = False
    allow_view: bool = True
    allow_fine_tuning: bool = False
    organization: str = "*"
    group: Optional[str] = None
    is_blocking: str = False


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "vllm"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: List[ModelPermission] = Field(default_factory=list)


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = Field(default_factory=list)


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0


class ChatCompletionRequest(BaseModel):
    model: str
    messages: Union[str, List[Dict[str, str]]]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    max_tokens: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    stream: Optional[bool] = False
    presence_penalty: Optional[float] = 0.0
    frequency_penalty: Optional[float] = 0.0
    logit_bias: Optional[Dict[str, float]] = None
    user: Optional[str] = None
    # Additional parameters supported by vLLM
    best_of: Optional[int] = None
    top_k: Optional[int] = -1
    ignore_eos: Optional[bool] = False
    use_beam_search: Optional[bool] = False
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    skip_special_tokens: Optional[bool] = True


class CompletionRequest(BaseModel):
    model: str
    # a string, array of strings, array of tokens, or array of token arrays
    prompt: Union[List[int], List[List[int]], str, List[str]]
    suffix: Optional[str] = None
    max_tokens: Optional[int] = 16
    temperature: Optional[float] = 1.0
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    stream: Optional[bool] = False
    logprobs: Optional[int] = None
    echo: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    presence_penalty: Optional[float] = 0.0
    frequency_penalty: Optional[float] = 0.0
    best_of: Optional[int] = None
    logit_bias: Optional[Dict[str, float]] = None
    user: Optional[str] = None
    # Additional parameters supported by vLLM
    top_k: Optional[int] = -1
    ignore_eos: Optional[bool] = False
    use_beam_search: Optional[bool] = False
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    skip_special_tokens: Optional[bool] = True


class LogProbs(BaseModel):
    text_offset: List[int] = Field(default_factory=list)
    token_logprobs: List[Optional[float]] = Field(default_factory=list)
    tokens: List[str] = Field(default_factory=list)
    top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)


class CompletionResponseChoice(BaseModel):
    index: int
    text: str
    logprobs: Optional[LogProbs] = None
    finish_reason: Optional[Literal["stop", "length"]] = None


class CompletionResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseChoice]
    usage: UsageInfo


class CompletionResponseStreamChoice(BaseModel):
    index: int
    text: str
    logprobs: Optional[LogProbs] = None
    finish_reason: Optional[Literal["stop", "length"]] = None


class CompletionStreamResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseStreamChoice]


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Optional[Literal["stop", "length"]] = None


class ChatCompletionResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[ChatCompletionResponseChoice]
    usage: UsageInfo


class DeltaMessage(BaseModel):
    role: Optional[str] = None
    content: Optional[str] = None


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]] = None


class ChatCompletionStreamResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
    object: str = "chat.completion.chunk"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[ChatCompletionResponseStreamChoice]
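

# Usage sketch (not part of this module): the request/response classes above
# are plain pydantic models, so parsing, defaulting, and serialization come
# for free. The model name is hypothetical; the field values shown are the
# defaults defined above.
def _example_protocol_roundtrip() -> str:
    req = ChatCompletionRequest(
        model="my-model",
        messages=[{"role": "user", "content": "Hi"}],
    )
    assert req.temperature == 0.7 and req.stream is False
    return req.json()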
server/vllm/vllm/logger.py deleted 100644 → 0
# Adapted from
# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
"""Logging configuration for vLLM."""
import logging
import sys

_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
_DATE_FORMAT = "%m-%d %H:%M:%S"


class NewLineFormatter(logging.Formatter):
    """Adds logging prefix to newlines to align multi-line messages."""

    def __init__(self, fmt, datefmt=None):
        logging.Formatter.__init__(self, fmt, datefmt)

    def format(self, record):
        msg = logging.Formatter.format(self, record)
        if record.message != "":
            parts = msg.split(record.message)
            msg = msg.replace("\n", "\r\n" + parts[0])
        return msg


_root_logger = logging.getLogger("vllm")
_default_handler = None


def _setup_logger():
    _root_logger.setLevel(logging.DEBUG)
    global _default_handler
    if _default_handler is None:
        _default_handler = logging.StreamHandler(sys.stdout)
        _default_handler.flush = sys.stdout.flush  # type: ignore
        _default_handler.setLevel(logging.INFO)
        _root_logger.addHandler(_default_handler)
    fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
    _default_handler.setFormatter(fmt)
    # Setting this will avoid the message
    # being propagated to the parent logger.
    _root_logger.propagate = False


# The logger is initialized when the module is imported.
# This is thread-safe as the module is only imported once,
# guaranteed by the Python GIL.
_setup_logger()


def init_logger(name: str):
    return logging.getLogger(name)
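

# Usage sketch (not part of this module): callers create named children of the
# "vllm" logger, so multi-line messages keep the prefix that NewLineFormatter
# re-inserts after each newline. The module name below is hypothetical.
def _example_logging() -> None:
    logger = init_logger("vllm.example")
    logger.info("first line\nsecond line")  # second line is re-prefixed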
server/vllm/vllm/model_executor/__init__.py deleted 100644 → 0
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.utils import set_random_seed

__all__ = [
    "InputMetadata",
    "get_model",
    "set_random_seed",
]