OpenDAS / text-generation-inference · Commits

Commit 70056d1e · authored May 29, 2024 by huangwb

    add custom vllm source code

Parent: 12d93ad7
Changes: 158
Showing 18 changed files with 2261 additions and 0 deletions (+2261, -0)
Changed files:
  server/vllm/vllm/model_executor/utils.py                   +13   -0
  server/vllm/vllm/model_executor/weight_utils.py            +321  -0
  server/vllm/vllm/outputs.py                                +119  -0
  server/vllm/vllm/sampling_params.py                        +215  -0
  server/vllm/vllm/sequence.py                               +407  -0
  server/vllm/vllm/transformers_utils/__init__.py            +0    -0
  server/vllm/vllm/transformers_utils/config.py              +37   -0
  server/vllm/vllm/transformers_utils/configs/__init__.py    +16   -0
  server/vllm/vllm/transformers_utils/configs/aquila.py      +69   -0
  server/vllm/vllm/transformers_utils/configs/baichuan.py    +62   -0
  server/vllm/vllm/transformers_utils/configs/falcon.py      +87   -0
  server/vllm/vllm/transformers_utils/configs/mpt.py         +74   -0
  server/vllm/vllm/transformers_utils/configs/qwen.py        +60   -0
  server/vllm/vllm/transformers_utils/tokenizer.py           +156  -0
  server/vllm/vllm/utils.py                                  +55   -0
  server/vllm/vllm/worker/__init__.py                        +0    -0
  server/vllm/vllm/worker/cache_engine.py                    +160  -0
  server/vllm/vllm/worker/worker.py                          +410  -0
server/vllm/vllm/model_executor/utils.py · new file (mode 100644)

"""Utils for model executor."""
import random

import numpy as np
import torch


def set_random_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
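A quick way to see what the helper buys you: it seeds every RNG source in one call (Python's `random`, NumPy, Torch CPU, and CUDA when a GPU is present), which makes repeated sampling reproducible. A minimal sketch, assuming this tree is installed so that `vllm.model_executor.utils` is importable:

import torch

from vllm.model_executor.utils import set_random_seed

set_random_seed(42)
a = torch.randn(3)
set_random_seed(42)
b = torch.randn(3)
assert torch.equal(a, b)  # re-seeding reproduces the same draw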
server/vllm/vllm/model_executor/weight_utils.py · new file (mode 100644)

"""Utilities for downloading and initializing model weights."""
import filelock
import glob
import json
import os
from collections import defaultdict
from typing import Any, Iterator, List, Optional, Tuple

from huggingface_hub import snapshot_download
from safetensors.torch import load_file, save_file, safe_open
import numpy as np
import torch
from tqdm.auto import tqdm

from vllm.logger import init_logger
from vllm.model_executor.quantization_utils import get_quant_class
from vllm.model_executor.quantization_utils.base import QuantizationConfig

logger = init_logger(__name__)


class Disabledtqdm(tqdm):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, disable=True)


def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
    lock_dir = cache_dir if cache_dir is not None else "/tmp"
    lock_file_name = model_name_or_path.replace("/", "-") + ".lock"
    lock = filelock.FileLock(os.path.join(lock_dir, lock_file_name))
    return lock


def _shared_pointers(tensors):
    ptrs = defaultdict(list)
    for k, v in tensors.items():
        ptrs[v.data_ptr()].append(k)
    failing = []
    for _, names in ptrs.items():
        if len(names) > 1:
            failing.append(names)
    return failing


def convert_bin_to_safetensor_file(
    pt_filename: str,
    sf_filename: str,
) -> None:
    loaded = torch.load(pt_filename, map_location="cpu")
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]
    shared = _shared_pointers(loaded)
    for shared_weights in shared:
        for name in shared_weights[1:]:
            loaded.pop(name)

    # For tensors to be contiguous
    loaded = {k: v.contiguous() for k, v in loaded.items()}

    dirname = os.path.dirname(sf_filename)
    os.makedirs(dirname, exist_ok=True)
    save_file(loaded, sf_filename, metadata={"format": "pt"})

    # check file size
    sf_size = os.stat(sf_filename).st_size
    pt_size = os.stat(pt_filename).st_size
    if (sf_size - pt_size) / pt_size > 0.01:
        raise RuntimeError(f"""The file size different is more than 1%:
         - {sf_filename}: {sf_size}
         - {pt_filename}: {pt_size}
         """)

    # check if the tensors are the same
    reloaded = load_file(sf_filename)
    for k in loaded:
        pt_tensor = loaded[k]
        sf_tensor = reloaded[k]
        if not torch.equal(pt_tensor, sf_tensor):
            raise RuntimeError(f"The output tensors do not match for key {k}")


# TODO(woosuk): Move this to other place.
def get_quant_config(
    quantization: str,
    model_name_or_path: str,
    cache_dir: Optional[str] = None,
) -> QuantizationConfig:
    is_local = os.path.isdir(model_name_or_path)
    if not is_local:
        # Download the config files.
        with get_lock(model_name_or_path, cache_dir):
            hf_folder = snapshot_download(model_name_or_path,
                                          allow_patterns="*.json",
                                          cache_dir=cache_dir,
                                          tqdm_class=Disabledtqdm)
    else:
        hf_folder = model_name_or_path
    config_files = glob.glob(os.path.join(hf_folder, "*.json"))

    quant_cls = get_quant_class(quantization)
    quant_config_files = [
        f for f in config_files
        if any(f.endswith(x) for x in quant_cls.get_config_filenames())
    ]
    if len(quant_config_files) == 0:
        raise ValueError(f"Cannot find the config file for {quantization}")
    if len(quant_config_files) > 1:
        raise ValueError(f"Found multiple config files for {quantization}: "
                         f"{quant_config_files}")

    quant_config_file = quant_config_files[0]
    with open(quant_config_file, "r") as f:
        config = json.load(f)
    return quant_cls.from_config(config)


def prepare_hf_model_weights(
    model_name_or_path: str,
    cache_dir: Optional[str] = None,
    use_safetensors: bool = False,
    fall_back_to_pt: bool = True,
    revision: Optional[str] = None,
) -> Tuple[str, List[str], bool]:
    # Download model weights from huggingface.
    is_local = os.path.isdir(model_name_or_path)
    if use_safetensors:
        allow_patterns = ["*.safetensors"]
    else:
        # Some quantized models use .pt files for storing the weights.
        allow_patterns = ["*.bin", "*.pt"]
    if not is_local:
        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
        with get_lock(model_name_or_path, cache_dir):
            hf_folder = snapshot_download(model_name_or_path,
                                          allow_patterns=allow_patterns,
                                          cache_dir=cache_dir,
                                          tqdm_class=Disabledtqdm,
                                          revision=revision)
    else:
        hf_folder = model_name_or_path
    hf_weights_files: List[str] = []
    for pattern in allow_patterns:
        hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
    if not use_safetensors:
        # Exclude files that are not needed for inference.
        # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
        blacklist = [
            "training_args.bin",
            "optimizer.bin",
            "optimizer.pt",
            "scheduler.pt",
            "scaler.pt",
        ]
        hf_weights_files = [
            f for f in hf_weights_files
            if not any(f.endswith(x) for x in blacklist)
        ]

    if len(hf_weights_files) == 0 and use_safetensors and fall_back_to_pt:
        return prepare_hf_model_weights(model_name_or_path,
                                        cache_dir=cache_dir,
                                        use_safetensors=False,
                                        fall_back_to_pt=False,
                                        revision=revision)

    if len(hf_weights_files) == 0:
        raise RuntimeError(
            f"Cannot find any model weights with `{model_name_or_path}`")

    return hf_folder, hf_weights_files, use_safetensors


def hf_model_weights_iterator(
    model_name_or_path: str,
    cache_dir: Optional[str] = None,
    load_format: str = "auto",
    revision: Optional[str] = None,
) -> Iterator[Tuple[str, torch.Tensor]]:
    use_safetensors = False
    use_np_cache = False
    fall_back_to_pt = False
    if load_format == "auto":
        use_safetensors = True
        fall_back_to_pt = True
    elif load_format == "safetensors":
        use_safetensors = True
    elif load_format == "pt":
        pass
    elif load_format == "npcache":
        use_np_cache = True
    else:
        raise ValueError(f"Unknown load_format: {load_format}")

    hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(
        model_name_or_path,
        cache_dir=cache_dir,
        use_safetensors=use_safetensors,
        fall_back_to_pt=fall_back_to_pt,
        revision=revision)

    if use_np_cache:
        # Currently np_cache only support *.bin checkpoints
        assert use_safetensors is False

        # Convert the model weights from torch tensors to numpy arrays for
        # faster loading.
        np_folder = os.path.join(hf_folder, "np")
        os.makedirs(np_folder, exist_ok=True)
        weight_names_file = os.path.join(np_folder, "weight_names.json")
        # Use file lock to prevent multiple processes from
        # dumping the same model weights to numpy at the same time.
        with get_lock(model_name_or_path, cache_dir):
            if not os.path.exists(weight_names_file):
                weight_names = []
                for bin_file in hf_weights_files:
                    state = torch.load(bin_file, map_location="cpu")
                    for name, param in state.items():
                        param_path = os.path.join(np_folder, name)
                        with open(param_path, "wb") as f:
                            np.save(f, param.cpu().detach().numpy())
                        weight_names.append(name)
                with open(weight_names_file, "w") as f:
                    json.dump(weight_names, f)

        with open(weight_names_file, "r") as f:
            weight_names = json.load(f)

        for name in weight_names:
            param_path = os.path.join(np_folder, name)
            with open(param_path, "rb") as f:
                param = np.load(f)
            yield name, torch.from_numpy(param)
    elif use_safetensors:
        for st_file in hf_weights_files:
            with safe_open(st_file, framework="pt") as f:
                for name in f.keys():
                    param = f.get_slice(name)
                    yield name, param
    else:
        for bin_file in hf_weights_files:
            state = torch.load(bin_file, map_location="cpu")
            for name, param in state.items():
                yield name, param
            del state
            torch.cuda.empty_cache()


def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
    """convert PySafeSlice object from safetensors to torch.Tensor

    PySafeSlice object supports indexing, which is done before loading the
    actual tensor and can reduce the amount of memory being read into the
    memory. However, it does not support more advanced functionalities
    like `.view()` or `.t()`. Therefore, if we need to modify the loaded
    tensor with these more complicated operators, we need to convert to
    tensor first.
    """
    if not isinstance(x, torch.Tensor):
        x = x[:]
    return x


def load_padded_tensor_parallel_vocab(
    param: torch.Tensor,
    loaded_weight: Any,  # `torch.Tensor` or `PySafeSlice`
    tensor_model_parallel_rank: int,
) -> None:
    shard_size = param.shape[0]
    start_idx = tensor_model_parallel_rank * shard_size
    end_idx = (tensor_model_parallel_rank + 1) * shard_size
    loaded_weight = loaded_weight[start_idx:end_idx]
    loaded_weight = convert_pyslice_to_tensor(loaded_weight)
    param[:loaded_weight.shape[0]].copy_(loaded_weight)


def load_tensor_parallel_weights(
    param: torch.Tensor,
    loaded_weight: Any,  # `torch.Tensor` or `PySafeSlice`
    param_name: str,
    column_parallel_weight_names: List[str],
    row_parallel_weight_names: List[str],
    tensor_model_parallel_rank: int,
) -> None:
    for p in column_parallel_weight_names:
        if p in param_name:
            shard_size = param.shape[0]
            start_idx = tensor_model_parallel_rank * shard_size
            end_idx = (tensor_model_parallel_rank + 1) * shard_size
            loaded_weight = loaded_weight[start_idx:end_idx]
            break
    for p in row_parallel_weight_names:
        if p in param_name:
            shard_size = param.shape[1]
            start_idx = tensor_model_parallel_rank * shard_size
            end_idx = (tensor_model_parallel_rank + 1) * shard_size
            loaded_weight = loaded_weight[:, start_idx:end_idx]
            break

    loaded_weight = convert_pyslice_to_tensor(loaded_weight)
    assert param.shape == loaded_weight.shape, (
        f"{param_name} shape mismatch between model and checkpoint: "
        f"{param.shape} != {loaded_weight.shape}")
    param.data.copy_(loaded_weight)


def initialize_dummy_weights(
    model: torch.nn.Module,
    low: float = -1e-3,
    high: float = 1e-3,
) -> None:
    """Initialize model weights with random values.

    The model weights must be randomly initialized for accurate performance
    measurements. Additionally, the model weights should not cause NaNs in the
    forward pass. We empirically found that initializing the weights with
    values between -1e-3 and 1e-3 works well for most models.
    """
    for param in model.state_dict().values():
        param.data.uniform_(low, high)
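For orientation, the intended flow is: prepare_hf_model_weights resolves a local weights folder (downloading under a file lock when needed), hf_model_weights_iterator then yields (name, tensor-or-PySafeSlice) pairs, and convert_pyslice_to_tensor materializes the lazy safetensors slices. A rough usage sketch, assuming this tree is importable as `vllm`, network access to the Hugging Face Hub, and that "facebook/opt-125m" is only a stand-in checkpoint name:

from vllm.model_executor.weight_utils import (convert_pyslice_to_tensor,
                                              hf_model_weights_iterator)

state_dict = {}
for name, loaded_weight in hf_model_weights_iterator("facebook/opt-125m",
                                                     load_format="auto"):
    # Safetensors checkpoints yield lazy PySafeSlice objects; materialize them
    # before applying ops such as .view() or .t().
    state_dict[name] = convert_pyslice_to_tensor(loaded_weight)

print(f"loaded {len(state_dict)} tensors on CPU")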
server/vllm/vllm/outputs.py · new file (mode 100644)

from typing import List, Optional

from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup,
                           SequenceStatus)


class CompletionOutput:
    """The output data of one completion output of a request.

    Args:
        index: The index of the output in the request.
        text: The generated output text.
        token_ids: The token IDs of the generated output text.
        cumulative_logprob: The cumulative log probability of the generated
            output text.
        logprobs: The log probabilities of the top probability words at each
            position if the logprobs are requested.
        finish_reason: The reason why the sequence is finished.
    """

    def __init__(
        self,
        index: int,
        text: str,
        token_ids: List[int],
        cumulative_logprob: float,
        logprobs: Optional[SampleLogprobs],
        finish_reason: Optional[str] = None,
    ) -> None:
        self.index = index
        self.text = text
        self.token_ids = token_ids
        self.cumulative_logprob = cumulative_logprob
        self.logprobs = logprobs
        self.finish_reason = finish_reason

    def finished(self) -> bool:
        return self.finish_reason is not None

    def __repr__(self) -> str:
        return (f"CompletionOutput(index={self.index}, "
                f"text={self.text!r}, "
                f"token_ids={self.token_ids}, "
                f"cumulative_logprob={self.cumulative_logprob}, "
                f"logprobs={self.logprobs}, "
                f"finish_reason={self.finish_reason})")


class RequestOutput:
    """The output data of a request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
        prompt_token_ids: The token IDs of the prompt.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
    """

    def __init__(
        self,
        request_id: str,
        prompt: str,
        prompt_token_ids: List[int],
        prompt_logprobs: Optional[PromptLogprobs],
        outputs: List[CompletionOutput],
        finished: bool,
    ) -> None:
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.prompt_logprobs = prompt_logprobs
        self.outputs = outputs
        self.finished = finished

    @classmethod
    def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":
        # Get the top-n sequences.
        n = seq_group.sampling_params.n
        seqs = seq_group.get_seqs()
        if seq_group.sampling_params.use_beam_search:
            sorting_key = lambda seq: seq.get_beam_search_score(
                seq_group.sampling_params.length_penalty)
        else:
            sorting_key = lambda seq: seq.get_cumulative_logprob()
        sorted_seqs = sorted(seqs, key=sorting_key, reverse=True)
        top_n_seqs = sorted_seqs[:n]

        # Create the outputs.
        outputs: List[CompletionOutput] = []
        for seq in top_n_seqs:
            logprobs = seq.output_logprobs
            if seq_group.sampling_params.logprobs is None:
                # NOTE: We need to take care of this case because the sequence
                # always has the logprobs of the sampled tokens even if the
                # logprobs are not requested.
                logprobs = None
            finshed_reason = SequenceStatus.get_finished_reason(seq.status)
            output = CompletionOutput(seqs.index(seq), seq.output_text,
                                      seq.get_output_token_ids(),
                                      seq.get_cumulative_logprob(), logprobs,
                                      finshed_reason)
            outputs.append(output)

        # Every sequence in the sequence group should have the same prompt.
        prompt = seq_group.prompt
        prompt_token_ids = seq_group.prompt_token_ids
        prompt_logprobs = seq_group.prompt_logprobs
        finished = seq_group.is_finished()
        return cls(seq_group.request_id, prompt, prompt_token_ids,
                   prompt_logprobs, outputs, finished)

    def __repr__(self) -> str:
        return (f"RequestOutput(request_id={self.request_id}, "
                f"prompt={self.prompt!r}, "
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"prompt_logprobs={self.prompt_logprobs}, "
                f"outputs={self.outputs}, "
                f"finished={self.finished})")
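These objects are normally built by the engine through RequestOutput.from_seq_group; a hand-assembled instance is shown here only to illustrate the data layout. The token ids and text are made up, and the import assumes the rest of the vllm tree (e.g. vllm.block) is present:

from vllm.outputs import CompletionOutput, RequestOutput

completion = CompletionOutput(
    index=0,
    text=" Paris.",
    token_ids=[3681, 4],        # illustrative ids, not from a real tokenizer
    cumulative_logprob=-1.25,
    logprobs=None,              # only populated when logprobs are requested
    finish_reason="stop",       # None while the sequence is still generating
)
request = RequestOutput(
    request_id="cmpl-0",
    prompt="The capital of France is",
    prompt_token_ids=[464, 3139, 286, 4881, 318],
    prompt_logprobs=None,
    outputs=[completion],
    finished=True,
)
print(request)
print(completion.finished())    # True, because finish_reason is set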
server/vllm/vllm/sampling_params.py · new file (mode 100644)

"""Sampling parameters for text generation."""
from enum import IntEnum
from functools import cached_property
from typing import List, Optional, Union

_SAMPLING_EPS = 1e-5


class SamplingType(IntEnum):
    GREEDY = 0
    RANDOM = 1
    BEAM = 2


class SamplingParams:
    """Sampling parameters for text generation.

    Overall, we follow the sampling parameters from the OpenAI text completion
    API (https://platform.openai.com/docs/api-reference/completions/create).
    In addition, we support beam search, which is not supported by OpenAI.

    Args:
        n: Number of output sequences to return for the given prompt.
        best_of: Number of output sequences that are generated from the prompt.
            From these `best_of` sequences, the top `n` sequences are returned.
            `best_of` must be greater than or equal to `n`. This is treated as
            the beam width when `use_beam_search` is True. By default, `best_of`
            is set to `n`.
        presence_penalty: Float that penalizes new tokens based on whether they
            appear in the generated text so far. Values > 0 encourage the model
            to use new tokens, while values < 0 encourage the model to repeat
            tokens.
        frequency_penalty: Float that penalizes new tokens based on their
            frequency in the generated text so far. Values > 0 encourage the
            model to use new tokens, while values < 0 encourage the model to
            repeat tokens.
        temperature: Float that controls the randomness of the sampling. Lower
            values make the model more deterministic, while higher values make
            the model more random. Zero means greedy sampling.
        top_p: Float that controls the cumulative probability of the top tokens
            to consider. Must be in (0, 1]. Set to 1 to consider all tokens.
        top_k: Integer that controls the number of top tokens to consider. Set
            to -1 to consider all tokens.
        use_beam_search: Whether to use beam search instead of sampling.
        length_penalty: Float that penalizes sequences based on their length.
            Used in beam search.
        early_stopping: Controls the stopping condition for beam search. It
            accepts the following values: `True`, where the generation stops as
            soon as there are `best_of` complete candidates; `False`, where a
            heuristic is applied and the generation stops when it is very
            unlikely to find better candidates; `"never"`, where the beam search
            procedure only stops when there cannot be better candidates
            (canonical beam search algorithm).
        stop: List of strings that stop the generation when they are generated.
            The returned output will not contain the stop strings.
        stop_token_ids: List of tokens that stop the generation when they are
            generated. The returned output will contain the stop tokens unless
            the stop tokens are special tokens.
        ignore_eos: Whether to ignore the EOS token and continue generating
            tokens after the EOS token is generated.
        max_tokens: Maximum number of tokens to generate per output sequence.
        logprobs: Number of log probabilities to return per output token.
            Note that the implementation follows the OpenAI API: The return
            result includes the log probabilities on the `logprobs` most likely
            tokens, as well the chosen tokens. The API will always return the
            log probability of the sampled token, so there may be up to
            `logprobs+1` elements in the response.
        prompt_logprobs: Number of log probabilities to return per prompt token.
        skip_special_tokens: Whether to skip special tokens in the output.
    """

    def __init__(
        self,
        n: int = 1,
        best_of: Optional[int] = None,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = -1,
        use_beam_search: bool = False,
        length_penalty: float = 1.0,
        early_stopping: Union[bool, str] = False,
        stop: Optional[Union[str, List[str]]] = None,
        stop_token_ids: Optional[List[int]] = None,
        ignore_eos: bool = False,
        max_tokens: int = 16,
        logprobs: Optional[int] = None,
        prompt_logprobs: Optional[int] = None,
        skip_special_tokens: bool = True,
    ) -> None:
        self.n = n
        self.best_of = best_of if best_of is not None else n
        self.presence_penalty = presence_penalty
        self.frequency_penalty = frequency_penalty
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.use_beam_search = use_beam_search
        self.length_penalty = length_penalty
        self.early_stopping = early_stopping
        if stop is None:
            self.stop = []
        elif isinstance(stop, str):
            self.stop = [stop]
        else:
            self.stop = list(stop)
        if stop_token_ids is None:
            self.stop_token_ids = []
        else:
            self.stop_token_ids = list(stop_token_ids)
        self.ignore_eos = ignore_eos
        self.max_tokens = max_tokens
        self.logprobs = logprobs
        self.prompt_logprobs = prompt_logprobs
        self.skip_special_tokens = skip_special_tokens

        self._verify_args()
        if self.use_beam_search:
            self._verify_beam_search()
        else:
            self._verify_non_beam_search()
            if self.temperature < _SAMPLING_EPS:
                # Zero temperature means greedy sampling.
                self._verify_greedy_sampling()

    def _verify_args(self) -> None:
        if self.n < 1:
            raise ValueError(f"n must be at least 1, got {self.n}.")
        if self.best_of < self.n:
            raise ValueError(f"best_of must be greater than or equal to n, "
                             f"got n={self.n} and best_of={self.best_of}.")
        if not -2.0 <= self.presence_penalty <= 2.0:
            raise ValueError("presence_penalty must be in [-2, 2], got "
                             f"{self.presence_penalty}.")
        if not -2.0 <= self.frequency_penalty <= 2.0:
            raise ValueError("frequency_penalty must be in [-2, 2], got "
                             f"{self.frequency_penalty}.")
        if self.temperature < 0.0:
            raise ValueError(
                f"temperature must be non-negative, got {self.temperature}.")
        if not 0.0 < self.top_p <= 1.0:
            raise ValueError(f"top_p must be in (0, 1], got {self.top_p}.")
        if self.top_k < -1 or self.top_k == 0:
            raise ValueError(f"top_k must be -1 (disable), or at least 1, "
                             f"got {self.top_k}.")
        if self.max_tokens < 1:
            raise ValueError(
                f"max_tokens must be at least 1, got {self.max_tokens}.")
        if self.logprobs is not None and self.logprobs < 0:
            raise ValueError(
                f"logprobs must be non-negative, got {self.logprobs}.")
        if self.prompt_logprobs is not None and self.prompt_logprobs < 0:
            raise ValueError(f"prompt_logprobs must be non-negative, got "
                             f"{self.prompt_logprobs}.")

    def _verify_beam_search(self) -> None:
        if self.best_of == 1:
            raise ValueError("best_of must be greater than 1 when using beam "
                             f"search. Got {self.best_of}.")
        if self.temperature > _SAMPLING_EPS:
            raise ValueError("temperature must be 0 when using beam search.")
        if self.top_p < 1.0 - _SAMPLING_EPS:
            raise ValueError("top_p must be 1 when using beam search.")
        if self.top_k != -1:
            raise ValueError("top_k must be -1 when using beam search.")
        if self.early_stopping not in [True, False, "never"]:
            raise ValueError(
                f"early_stopping must be True, False, or 'never', "
                f"got {self.early_stopping}.")

    def _verify_non_beam_search(self) -> None:
        if self.early_stopping is not False:
            raise ValueError("early_stopping is not effective and must be "
                             "False when not using beam search.")
        if (self.length_penalty < 1.0 - _SAMPLING_EPS
                or self.length_penalty > 1.0 + _SAMPLING_EPS):
            raise ValueError(
                "length_penalty is not effective and must be the "
                "default value of 1.0 when not using beam search.")

    def _verify_greedy_sampling(self) -> None:
        if self.best_of > 1:
            raise ValueError("best_of must be 1 when using greedy sampling."
                             f"Got {self.best_of}.")
        if self.top_p < 1.0 - _SAMPLING_EPS:
            raise ValueError("top_p must be 1 when using greedy sampling.")
        if self.top_k != -1:
            raise ValueError("top_k must be -1 when using greedy sampling.")

    @cached_property
    def sampling_type(self) -> SamplingType:
        if self.use_beam_search:
            return SamplingType.BEAM
        if self.temperature < _SAMPLING_EPS:
            return SamplingType.GREEDY
        return SamplingType.RANDOM

    def __repr__(self) -> str:
        return (f"SamplingParams(n={self.n}, "
                f"best_of={self.best_of}, "
                f"presence_penalty={self.presence_penalty}, "
                f"frequency_penalty={self.frequency_penalty}, "
                f"temperature={self.temperature}, "
                f"top_p={self.top_p}, "
                f"top_k={self.top_k}, "
                f"use_beam_search={self.use_beam_search}, "
                f"length_penalty={self.length_penalty}, "
                f"early_stopping={self.early_stopping}, "
                f"stop={self.stop}, "
                f"ignore_eos={self.ignore_eos}, "
                f"max_tokens={self.max_tokens}, "
                f"logprobs={self.logprobs}, "
                f"prompt_logprobs={self.prompt_logprobs}, "
                f"skip_special_tokens={self.skip_special_tokens})")
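Since this module depends only on the standard library, the eager validation can be exercised directly; a short sketch of typical parameter combinations:

from vllm.sampling_params import SamplingParams, SamplingType

# Nucleus sampling: return the top 2 of 4 generated candidates.
params = SamplingParams(n=2, best_of=4, temperature=0.8, top_p=0.95,
                        max_tokens=64, stop=["\n\n"])
print(params.sampling_type is SamplingType.RANDOM)   # True

# temperature=0 switches to greedy decoding; the validators then require
# best_of=1, top_p=1, and top_k=-1 (the defaults).
greedy = SamplingParams(temperature=0.0, max_tokens=32)
print(greedy.sampling_type is SamplingType.GREEDY)   # True

# Invalid combinations are rejected in __init__.
try:
    SamplingParams(n=4, best_of=2)
except ValueError as e:
    print(e)   # best_of must be greater than or equal to n, ...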
server/vllm/vllm/sequence.py · new file (mode 100644)

"""Sequence and its related classes."""
import copy
import enum
from typing import Dict, List, Optional, Union

from vllm.block import LogicalTokenBlock
from vllm.sampling_params import SamplingParams

PromptLogprobs = List[Optional[Dict[int, float]]]
SampleLogprobs = List[Dict[int, float]]


class SequenceStatus(enum.Enum):
    """Status of a sequence."""
    WAITING = enum.auto()
    RUNNING = enum.auto()
    SWAPPED = enum.auto()
    FINISHED_STOPPED = enum.auto()
    FINISHED_LENGTH_CAPPED = enum.auto()
    FINISHED_ABORTED = enum.auto()
    FINISHED_IGNORED = enum.auto()

    @staticmethod
    def is_finished(status: "SequenceStatus") -> bool:
        return status in [
            SequenceStatus.FINISHED_STOPPED,
            SequenceStatus.FINISHED_LENGTH_CAPPED,
            SequenceStatus.FINISHED_ABORTED,
            SequenceStatus.FINISHED_IGNORED,
        ]

    @staticmethod
    def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
        if status == SequenceStatus.FINISHED_STOPPED:
            finish_reason = "stop"
        elif status == SequenceStatus.FINISHED_LENGTH_CAPPED:
            finish_reason = "length"
        elif status == SequenceStatus.FINISHED_ABORTED:
            finish_reason = "abort"
        elif status == SequenceStatus.FINISHED_IGNORED:
            # The ignored sequences are the sequences whose prompt lengths
            # are longer than the model's length cap. Therefore, the stop
            # reason should also be "length" as in OpenAI API.
            finish_reason = "length"
        else:
            finish_reason = None
        return finish_reason


class SequenceData:
    """Data associated with a sequence.

    Args:
        prompt_token_ids: The token IDs of the prompt.

    Attributes:
        prompt_token_ids: The token IDs of the prompt.
        output_token_ids: The token IDs of the output.
        cumulative_logprob: The cumulative log probability of the output.
    """

    def __init__(
        self,
        prompt_token_ids: List[int],
    ) -> None:
        self.prompt_token_ids = prompt_token_ids
        self.output_token_ids: List[int] = []
        self.cumulative_logprob = 0.0

    def append_token_id(self, token_id: int, logprob: float) -> None:
        self.output_token_ids.append(token_id)
        self.cumulative_logprob += logprob

    def get_len(self) -> int:
        return len(self.output_token_ids) + len(self.prompt_token_ids)

    def get_prompt_len(self) -> int:
        return len(self.prompt_token_ids)

    def get_output_len(self) -> int:
        return len(self.output_token_ids)

    def get_token_ids(self) -> List[int]:
        return self.prompt_token_ids + self.output_token_ids

    def get_last_token_id(self) -> int:
        if not self.output_token_ids:
            return self.prompt_token_ids[-1]
        return self.output_token_ids[-1]

    def __repr__(self) -> str:
        return (f"SequenceData("
                f"prompt_token_ids={self.prompt_token_ids}, "
                f"output_token_ids={self.output_token_ids}, "
                f"cumulative_logprob={self.cumulative_logprob})")


class Sequence:
    """Stores the data, status, and block information of a sequence.

    Args:
        seq_id: The ID of the sequence.
        prompt: The prompt of the sequence.
        prompt_token_ids: The token IDs of the prompt.
        block_size: The block size of the sequence. Should be the same as the
            block size used by the block manager and cache engine.
    """

    def __init__(
        self,
        seq_id: int,
        prompt: str,
        prompt_token_ids: List[int],
        block_size: int,
    ) -> None:
        self.seq_id = seq_id
        self.prompt = prompt
        self.block_size = block_size

        self.data = SequenceData(prompt_token_ids)
        self.output_logprobs: SampleLogprobs = []
        self.output_text = ""

        self.logical_token_blocks: List[LogicalTokenBlock] = []
        # Initialize the logical token blocks with the prompt token ids.
        self._append_tokens_to_blocks(prompt_token_ids)
        self.status = SequenceStatus.WAITING

        # Used for incremental detokenization
        self.prefix_offset = 0
        self.read_offset = 0
        # Input + output tokens
        self.tokens: Optional[List[str]] = None

    def _append_logical_block(self) -> None:
        block = LogicalTokenBlock(
            block_number=len(self.logical_token_blocks),
            block_size=self.block_size,
        )
        self.logical_token_blocks.append(block)

    def _append_tokens_to_blocks(self, token_ids: List[int]) -> None:
        cursor = 0
        while cursor < len(token_ids):
            if not self.logical_token_blocks:
                self._append_logical_block()

            last_block = self.logical_token_blocks[-1]
            if last_block.is_full():
                self._append_logical_block()
                last_block = self.logical_token_blocks[-1]

            num_empty_slots = last_block.get_num_empty_slots()
            last_block.append_tokens(token_ids[cursor:cursor +
                                               num_empty_slots])
            cursor += num_empty_slots

    def append_token_id(
        self,
        token_id: int,
        logprobs: Dict[int, float],
    ) -> None:
        assert token_id in logprobs
        self._append_tokens_to_blocks([token_id])
        self.output_logprobs.append(logprobs)
        self.data.append_token_id(token_id, logprobs[token_id])

    def get_len(self) -> int:
        return self.data.get_len()

    def get_prompt_len(self) -> int:
        return self.data.get_prompt_len()

    def get_output_len(self) -> int:
        return self.data.get_output_len()

    def get_token_ids(self) -> List[int]:
        return self.data.get_token_ids()

    def get_last_token_id(self) -> int:
        return self.data.get_last_token_id()

    def get_output_token_ids(self) -> List[int]:
        return self.data.output_token_ids

    def get_cumulative_logprob(self) -> float:
        return self.data.cumulative_logprob

    def get_beam_search_score(self,
                              length_penalty: float = 0.0,
                              seq_len: Optional[int] = None,
                              eos_token_id: Optional[int] = None) -> float:
        """Calculate the beam search score with length penalty.

        Adapted from
        https://github.com/huggingface/transformers/blob/ccb92be23def445f2afdea94c31286f84b89eb5b/src/transformers/generation/beam_search.py#L938
        """
        if seq_len is None:
            seq_len = self.get_len()
            # NOTE: HF implementation does not count the EOS token
            # towards the length, we align with that here for testing.
            if (eos_token_id is not None
                    and self.get_last_token_id() == eos_token_id):
                seq_len -= 1
        return self.get_cumulative_logprob() / (seq_len**length_penalty)

    def is_finished(self) -> bool:
        return SequenceStatus.is_finished(self.status)

    def fork(self, new_seq_id: int) -> "Sequence":
        new_seq = copy.deepcopy(self)
        new_seq.seq_id = new_seq_id
        return new_seq

    def __repr__(self) -> str:
        return (f"Sequence(seq_id={self.seq_id}, "
                f"status={self.status.name}, "
                f"num_blocks={len(self.logical_token_blocks)})")


class SequenceGroup:
    """A group of sequences that are generated from the same prompt.

    Args:
        request_id: The ID of the request.
        seqs: The list of sequences.
        sampling_params: The sampling parameters used to generate the outputs.
        arrival_time: The arrival time of the request.
    """

    def __init__(
        self,
        request_id: str,
        seqs: List[Sequence],
        sampling_params: SamplingParams,
        arrival_time: float,
    ) -> None:
        self.request_id = request_id
        self.seqs_dict = {seq.seq_id: seq for seq in seqs}
        self.sampling_params = sampling_params
        self.arrival_time = arrival_time
        self.prompt_logprobs: Optional[PromptLogprobs] = None

    @property
    def prompt(self) -> str:
        # All sequences in the group should have the same prompt.
        # We use the prompt of an arbitrary sequence.
        return next(iter(self.seqs_dict.values())).prompt

    @property
    def prompt_token_ids(self) -> List[int]:
        # All sequences in the group should have the same prompt.
        # We use the prompt of an arbitrary sequence.
        return next(iter(self.seqs_dict.values())).data.prompt_token_ids

    def get_max_num_running_seqs(self) -> int:
        """The maximum number of sequences running in parallel in the remaining
        lifetime of the request."""
        if self.sampling_params.use_beam_search:
            # For beam search, maximally there will always be `best_of` beam
            # candidates running in the future.
            return self.sampling_params.best_of
        else:
            if self.sampling_params.best_of > self.num_seqs():
                # At prompt stage, the sequence group is not yet filled up
                # and only have one sequence running. However, in the
                # generation stage, we will have `best_of` sequences running.
                return self.sampling_params.best_of
            # At sampling stages, return the number of actual sequences
            # that are not finished yet.
            return self.num_unfinished_seqs()

    def get_seqs(
        self,
        status: Optional[SequenceStatus] = None,
    ) -> List[Sequence]:
        if status is None:
            return list(self.seqs_dict.values())
        else:
            return [
                seq for seq in self.seqs_dict.values() if seq.status == status
            ]

    def get_unfinished_seqs(self) -> List[Sequence]:
        return [
            seq for seq in self.seqs_dict.values() if not seq.is_finished()
        ]

    def get_finished_seqs(self) -> List[Sequence]:
        return [seq for seq in self.seqs_dict.values() if seq.is_finished()]

    def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
        return len(self.get_seqs(status))

    def num_unfinished_seqs(self) -> int:
        return len(self.get_unfinished_seqs())

    def num_finished_seqs(self) -> int:
        return len(self.get_finished_seqs())

    def find(self, seq_id: int) -> Sequence:
        if seq_id not in self.seqs_dict:
            raise ValueError(f"Sequence {seq_id} not found.")
        return self.seqs_dict[seq_id]

    def add(self, seq: Sequence) -> None:
        if seq.seq_id in self.seqs_dict:
            raise ValueError(f"Sequence {seq.seq_id} already exists.")
        self.seqs_dict[seq.seq_id] = seq

    def remove(self, seq_id: int) -> None:
        if seq_id not in self.seqs_dict:
            raise ValueError(f"Sequence {seq_id} not found.")
        del self.seqs_dict[seq_id]

    def is_finished(self) -> bool:
        return all(seq.is_finished() for seq in self.get_seqs())

    def __repr__(self) -> str:
        return (f"SequenceGroup(request_id={self.request_id}, "
                f"sampling_params={self.sampling_params}, "
                f"num_seqs={len(self.seqs_dict)})")


class SequenceGroupMetadata:
    """Metadata for a sequence group. Used to create `InputMetadata`.

    Args:
        request_id: The ID of the request.
        is_prompt: Whether the request is at prompt stage.
        seq_data: The sequence data. (Seq id -> sequence data)
        sampling_params: The sampling parameters used to generate the outputs.
        block_tables: The block tables. (Seq id -> list of physical block
            numbers)
    """

    def __init__(
        self,
        request_id: str,
        is_prompt: bool,
        seq_data: Dict[int, SequenceData],
        sampling_params: SamplingParams,
        block_tables: Dict[int, List[int]],
    ) -> None:
        self.request_id = request_id
        self.is_prompt = is_prompt
        self.seq_data = seq_data
        self.sampling_params = sampling_params
        self.block_tables = block_tables


class SequenceOutputs:
    """The model output associated with a sequence.

    Args:
        parent_seq_id: The ID of the parent sequence (for forking in beam
            search).
        output_token: The output token ID.
        logprobs: The logprobs of the output token.
            (Token id -> logP(x_i+1 | x_0, ..., x_i))
    """

    def __init__(
        self,
        parent_seq_id: int,
        output_token: int,
        logprobs: Dict[int, float],
    ) -> None:
        self.parent_seq_id = parent_seq_id
        self.output_token = output_token
        self.logprobs = logprobs

    def __repr__(self) -> str:
        return (f"SequenceOutputs(parent_seq_id={self.parent_seq_id}, "
                f"output_token={self.output_token}, "
                f"logprobs={self.logprobs})")

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, SequenceOutputs):
            raise NotImplementedError()
        return (self.parent_seq_id == other.parent_seq_id
                and self.output_token == other.output_token
                and self.logprobs == other.logprobs)


class SequenceGroupOutputs:
    """The model outputs associated with a sequence group."""

    def __init__(
        self,
        samples: List[SequenceOutputs],
        prompt_logprobs: Optional[PromptLogprobs],
    ) -> None:
        self.samples = samples
        self.prompt_logprobs = prompt_logprobs

    def __repr__(self) -> str:
        return (f"SequenceGroupOutputs(samples={self.samples}, "
                f"prompt_logprobs={self.prompt_logprobs})")


# For each sequence group, we generate a list of SequenceOutputs object,
# each of which contains one possible candidate for the next token.
SamplerOutput = List[SequenceGroupOutputs]
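SequenceData is the one class here that does not touch the block manager, so its bookkeeping can be exercised on its own (importing the module still requires vllm.block from the rest of the tree); the token ids and logprobs below are invented:

from vllm.sequence import SequenceData, SequenceStatus

data = SequenceData(prompt_token_ids=[1, 15043, 3186])
data.append_token_id(token_id=29991, logprob=-0.5)
data.append_token_id(token_id=2, logprob=-0.25)

print(data.get_prompt_len())     # 3
print(data.get_output_len())     # 2
print(data.get_token_ids())      # [1, 15043, 3186, 29991, 2]
print(data.cumulative_logprob)   # -0.75 (sum of the sampled-token logprobs)

# The finish-reason strings mirror the OpenAI API.
print(SequenceStatus.get_finished_reason(SequenceStatus.FINISHED_STOPPED))  # "stop"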
server/vllm/vllm/transformers_utils/__init__.py · new file (mode 100644), empty
server/vllm/vllm/transformers_utils/config.py · new file (mode 100644)

from typing import Optional

from transformers import AutoConfig, PretrainedConfig

from vllm.transformers_utils.configs import *  # pylint: disable=wildcard-import

_CONFIG_REGISTRY = {
    "mpt": MPTConfig,
    "baichuan": BaiChuanConfig,
    "aquila": AquilaConfig,
    "qwen": QWenConfig,
    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
}


def get_config(model: str,
               trust_remote_code: bool,
               revision: Optional[str] = None) -> PretrainedConfig:
    try:
        config = AutoConfig.from_pretrained(
            model, trust_remote_code=trust_remote_code, revision=revision)
    except ValueError as e:
        if (not trust_remote_code and
                "requires you to execute the configuration file" in str(e)):
            err_msg = (
                "Failed to load the model config. If the model is a custom "
                "model not yet available in the HuggingFace transformers "
                "library, consider setting `trust_remote_code=True` in LLM "
                "or using the `--trust-remote-code` flag in the CLI.")
            raise RuntimeError(err_msg) from e
        else:
            raise e
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model, revision=revision)
    return config
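Roughly how the helper is meant to be used: stock architectures resolve through AutoConfig as usual, and only the model types in _CONFIG_REGISTRY are re-loaded through the bundled classes. A sketch that assumes `transformers` is installed, the Hugging Face Hub is reachable, and that "facebook/opt-125m" is merely an example model name:

from vllm.transformers_utils.config import get_config

config = get_config("facebook/opt-125m", trust_remote_code=False)
print(config.model_type, config.hidden_size)

# Model types listed in _CONFIG_REGISTRY ("mpt", "baichuan", "aquila", "qwen",
# "RefinedWeb", "RefinedWebModel") are instead re-read through the bundled
# config classes defined in vllm.transformers_utils.configs.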
server/vllm/vllm/transformers_utils/configs/__init__.py · new file (mode 100644)

from vllm.transformers_utils.configs.mpt import MPTConfig
from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
from vllm.transformers_utils.configs.aquila import AquilaConfig
from vllm.transformers_utils.configs.qwen import QWenConfig
# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
# `FalconConfig` class from the official HuggingFace transformers library.
from vllm.transformers_utils.configs.falcon import RWConfig

__all__ = [
    "MPTConfig",
    "BaiChuanConfig",
    "AquilaConfig",
    "QWenConfig",
    "RWConfig",
]
server/vllm/vllm/transformers_utils/configs/aquila.py · new file (mode 100644)

# coding=utf-8
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Aquila model configuration"""

from transformers import PretrainedConfig


class AquilaConfig(PretrainedConfig):
    model_type = "aquila"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=100008,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.006,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
server/vllm/vllm/transformers_utils/configs/baichuan.py · new file (mode 100644)

# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from transformers.configuration_utils import PretrainedConfig


class BaiChuanConfig(PretrainedConfig):
    model_type = "baichuan"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=64000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
server/vllm/vllm/transformers_utils/configs/falcon.py · new file (mode 100644)

# Adapted from
# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
# Copyright 2023 The vLLM team.
# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Falcon configuration"""
from transformers.configuration_utils import PretrainedConfig


class RWConfig(PretrainedConfig):
    model_type = "falcon"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_hidden_layers": "n_layer",
        "num_attention_heads": "n_head",
        "num_kv_heads": "n_head_kv",
    }

    def __init__(
        self,
        vocab_size=250880,
        hidden_size=64,
        n_layer=2,
        n_head=8,
        layer_norm_epsilon=1e-5,
        initializer_range=0.02,
        use_cache=True,
        bos_token_id=1,
        eos_token_id=2,
        hidden_dropout=0.0,
        attention_dropout=0.0,
        multi_query=True,
        n_head_kv=None,
        alibi=False,
        bias=False,
        parallel_attn=False,
        new_decoder_architecture=False,
        **kwargs,
    ) -> None:
        self.vocab_size = vocab_size
        # Backward compatibility with n_embed kwarg
        n_embed = kwargs.pop("n_embed", None)
        self.hidden_size = hidden_size if n_embed is None else n_embed
        self.n_layer = n_layer
        self.n_head = n_head
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.use_cache = use_cache
        self.hidden_dropout = hidden_dropout
        self.attention_dropout = attention_dropout

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.multi_query = multi_query
        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
        self.alibi = alibi
        self.bias = bias
        self.parallel_attn = parallel_attn
        self.new_decoder_architecture = new_decoder_architecture

        if self.hidden_size == 8192:
            # Hack for falcon-40b
            self.new_decoder_architecture = True

        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    @property
    def head_dim(self):
        return self.hidden_size // self.n_head

    @property
    def rotary(self):
        return not self.alibi
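Because the class only needs the transformers base class, its derived attributes can be checked locally without downloading a checkpoint. The numbers below roughly mirror tiiuae/falcon-7b (71 heads of width 64, multi-query attention) and are used purely for illustration:

from vllm.transformers_utils.configs.falcon import RWConfig

cfg = RWConfig(hidden_size=4544, n_layer=32, n_head=71, alibi=False)
print(cfg.head_dim)           # 64   (hidden_size // n_head)
print(cfg.rotary)             # True (rotary embeddings whenever alibi is off)
print(cfg.num_hidden_layers)  # 32   resolved through attribute_map -> n_layer
print(cfg.n_head_kv)          # 1    (multi-query default when n_head_kv is None)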
server/vllm/vllm/transformers_utils/configs/mpt.py · new file (mode 100644)

# Adapted from
# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
from typing import Any, Dict, Optional, Union

from transformers import PretrainedConfig

_ATTN_CONFIG_DEFAULTS = {
    "attn_type": "multihead_attention",
    "attn_pdrop": 0.0,
    "attn_impl": "triton",
    "qk_ln": False,
    "clip_qkv": None,
    "softmax_scale": None,
    "prefix_lm": False,
    "attn_uses_sequence_id": False,
    "alibi": False,
    "alibi_bias_max": 8,
}


class MPTConfig(PretrainedConfig):
    model_type = "mpt"
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "n_heads",
        "num_hidden_layers": "n_layers",
    }

    def __init__(
        self,
        d_model: int = 2048,
        n_heads: int = 16,
        n_layers: int = 24,
        expansion_ratio: int = 4,
        max_seq_len: int = 2048,
        vocab_size: int = 50368,
        resid_pdrop: float = 0.0,
        emb_pdrop: float = 0.0,
        learned_pos_emb: bool = True,
        attn_config: Optional[Dict[str, Any]] = None,
        init_device: str = "cpu",
        logit_scale: Optional[Union[float, str]] = None,
        no_bias: bool = False,
        verbose: int = 0,
        embedding_fraction: float = 1.0,
        norm_type: str = "low_precision_layernorm",
        use_cache: bool = False,
        **kwargs,
    ) -> None:
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        if attn_config is None:
            self.attn_config = _ATTN_CONFIG_DEFAULTS
        else:
            self.attn_config = attn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.verbose = verbose
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        if "name" in kwargs:
            del kwargs["name"]
        if "loss_fn" in kwargs:
            del kwargs["loss_fn"]
        super().__init__(**kwargs)
server/vllm/vllm/transformers_utils/configs/qwen.py · new file (mode 100644)

# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE

from transformers import PretrainedConfig


class QWenConfig(PretrainedConfig):
    model_type = "qwen"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        num_hidden_layers=32,
        num_attention_heads=32,
        emb_dropout_prob=0.0,
        attn_dropout_prob=0.0,
        layer_norm_epsilon=1e-6,
        initializer_range=0.02,
        max_position_embeddings=8192,
        scale_attn_weights=True,
        use_cache=True,
        bf16=False,
        fp16=False,
        fp32=False,
        kv_channels=128,
        rotary_pct=1.0,
        rotary_emb_base=10000,
        use_dynamic_ntk=True,
        use_logn_attn=True,
        use_flash_attn="auto",
        intermediate_size=22016,
        no_bias=True,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.emb_dropout_prob = emb_dropout_prob
        self.attn_dropout_prob = attn_dropout_prob
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.scale_attn_weights = scale_attn_weights
        self.use_cache = use_cache
        self.max_position_embeddings = max_position_embeddings
        self.bf16 = bf16
        self.fp16 = fp16
        self.fp32 = fp32
        self.kv_channels = kv_channels
        self.rotary_pct = rotary_pct
        self.rotary_emb_base = rotary_emb_base
        self.use_dynamic_ntk = use_dynamic_ntk
        self.use_logn_attn = use_logn_attn
        self.use_flash_attn = use_flash_attn
        self.no_bias = no_bias
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
server/vllm/vllm/transformers_utils/tokenizer.py
0 → 100644
View file @
70056d1e
from
typing
import
List
,
Optional
,
Tuple
,
Union
from
transformers
import
(
AutoTokenizer
,
PreTrainedTokenizer
,
PreTrainedTokenizerFast
)
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
_FAST_LLAMA_TOKENIZER
=
"hf-internal-testing/llama-tokenizer"
def
get_tokenizer
(
tokenizer_name
:
str
,
*
args
,
tokenizer_mode
:
str
=
"auto"
,
trust_remote_code
:
bool
=
False
,
tokenizer_revision
:
Optional
[
str
]
=
None
,
**
kwargs
,
)
->
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
]:
"""Gets a tokenizer for the given model name via Huggingface."""
if
tokenizer_mode
==
"slow"
:
if
kwargs
.
get
(
"use_fast"
,
False
):
raise
ValueError
(
"Cannot use the fast tokenizer in slow tokenizer mode."
)
kwargs
[
"use_fast"
]
=
False
if
(
"llama"
in
tokenizer_name
.
lower
()
and
kwargs
.
get
(
"use_fast"
,
True
)
and
tokenizer_name
!=
_FAST_LLAMA_TOKENIZER
):
logger
.
info
(
"For some LLaMA V1 models, initializing the fast tokenizer may "
"take a long time. To reduce the initialization time, consider "
f
"using '
{
_FAST_LLAMA_TOKENIZER
}
' instead of the original "
"tokenizer."
)
try
:
tokenizer
=
AutoTokenizer
.
from_pretrained
(
tokenizer_name
,
*
args
,
trust_remote_code
=
trust_remote_code
,
tokenizer_revision
=
tokenizer_revision
,
**
kwargs
)
except
TypeError
as
e
:
# The LLaMA tokenizer causes a protobuf error in some environments.
err_msg
=
(
"Failed to load the tokenizer. If you are using a LLaMA V1 model "
f
"consider using '
{
_FAST_LLAMA_TOKENIZER
}
' instead of the "
"original tokenizer."
)
raise
RuntimeError
(
err_msg
)
from
e
except
ValueError
as
e
:
# If the error pertains to the tokenizer class not existing or not
# currently being imported, suggest using the --trust-remote-code flag.
if
(
not
trust_remote_code
and
(
"does not exist or is not currently imported."
in
str
(
e
)
or
"requires you to execute the tokenizer file"
in
str
(
e
))):
err_msg
=
(
"Failed to load the tokenizer. If the tokenizer is a custom "
"tokenizer not yet available in the HuggingFace transformers "
"library, consider setting `trust_remote_code=True` in LLM "
"or using the `--trust-remote-code` flag in the CLI."
)
raise
RuntimeError
(
err_msg
)
from
e
else
:
raise
e
if
not
isinstance
(
tokenizer
,
PreTrainedTokenizerFast
):
logger
.
warning
(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead."
)
return
tokenizer
def
_convert_tokens_to_string_with_added_encoders
(
tokenizer
:
Union
[
PreTrainedTokenizer
,
PreTrainedTokenizerFast
],
output_tokens
:
List
[
str
],
skip_special_tokens
:
bool
,
)
->
str
:
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
# NOTE(woosuk): The following code is slow because it runs a for loop over
# the output_tokens. In Python, running a for loop over a list can be slow
# even when the loop body is very simple.
sub_texts
=
[]
current_sub_text
=
[]
all_special_tokens
=
set
(
tokenizer
.
all_special_tokens
)
for
token
in
output_tokens
:
if
skip_special_tokens
and
token
in
all_special_tokens
:
continue
if
token
in
tokenizer
.
get_added_vocab
():
if
current_sub_text
:
sub_text
=
tokenizer
.
convert_tokens_to_string
(
current_sub_text
)
sub_texts
.
append
(
sub_text
)
current_sub_text
=
[]
sub_texts
.
append
(
token
)
else
:
current_sub_text
.
append
(
token
)
if
current_sub_text
:
sub_text
=
tokenizer
.
convert_tokens_to_string
(
current_sub_text
)
sub_texts
.
append
(
sub_text
)
return
" "
.
join
(
sub_texts
)
# Based on
# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
# under Apache 2.0 license
def detokenize_incrementally(
    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    all_input_ids: List[int],
    prev_tokens: Optional[List[str]],
    prefix_offset: int = 0,
    read_offset: int = 0,
    skip_special_tokens: bool = False,
) -> Tuple[List[str], str, int, int]:
    new_token_id = all_input_ids[-1]
    # This is the first iteration for this sequence
    if prev_tokens is None:
        new_tokens = tokenizer.convert_ids_to_tokens(
            all_input_ids, skip_special_tokens=skip_special_tokens)
        output_tokens = new_tokens
        # 5 is an arbitrary value that should work for all
        # tokenizers (bigger = more conservative).
        # Subtract 1 extra to account for the generated token.
        prefix_offset = max(len(output_tokens) - 6, 0)
        read_offset = max(len(output_tokens) - 1, 0)
    else:
        # Put new_token_id in a list so skip_special_tokens is respected
        new_tokens = tokenizer.convert_ids_to_tokens(
            [new_token_id], skip_special_tokens=skip_special_tokens)
        output_tokens = prev_tokens + new_tokens

    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])
        new_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:])
    else:
        prefix_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:read_offset],
            skip_special_tokens=skip_special_tokens)
        new_text = _convert_tokens_to_string_with_added_encoders(
            tokenizer,
            output_tokens[prefix_offset:],
            skip_special_tokens=skip_special_tokens)

    if len(new_text) > len(prefix_text) and not new_text.endswith("�"):
        # utf-8 char at the end means it's a potential unfinished byte sequence
        # from byte fallback tokenization.
        # If it's in the middle, it's probably a real invalid id generated
        # by the model
        new_text = new_text[len(prefix_text):]
        return new_tokens, new_text, read_offset, len(output_tokens)
    else:
        return new_tokens, "", prefix_offset, read_offset
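For orientation, here is a minimal sketch (not from this commit) of how detokenize_incrementally is typically driven: the caller keeps the running token list and the two offsets, and appends only the newly returned text on each step. The model name and the fake sampled ids are placeholders.

# Illustrative only: incremental detokenization loop with placeholder ids.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
all_ids = tokenizer("Incremental decoding").input_ids
tokens, prefix_offset, read_offset = None, 0, 0
streamed_text = ""
for sampled_id in [11, 257, 1332]:  # stand-ins for ids produced by the model
    all_ids.append(sampled_id)
    new_tokens, delta, prefix_offset, read_offset = detokenize_incrementally(
        tokenizer, all_ids, tokens, prefix_offset, read_offset)
    tokens = new_tokens if tokens is None else tokens + new_tokens
    streamed_text += delta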
server/vllm/vllm/utils.py
0 → 100644
View file @
70056d1e
import enum
import uuid
from platform import uname

import psutil
import torch

from vllm import cuda_utils


class Device(enum.Enum):
    GPU = enum.auto()
    CPU = enum.auto()


class Counter:

    def __init__(self, start: int = 0) -> None:
        self.counter = start

    def __next__(self) -> int:
        i = self.counter
        self.counter += 1
        return i

    def reset(self) -> None:
        self.counter = 0


def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
    # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97  # pylint: disable=invalid-name
    max_shared_mem = cuda_utils.get_device_attribute(
        cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
    return int(max_shared_mem)


def get_gpu_memory(gpu: int = 0) -> int:
    """Returns the total memory of the GPU in bytes."""
    return torch.cuda.get_device_properties(gpu).total_memory


def get_cpu_memory() -> int:
    """Returns the total CPU memory of the node in bytes."""
    return psutil.virtual_memory().total


def random_uuid() -> str:
    return str(uuid.uuid4().hex)


def in_wsl() -> bool:
    # Reference: https://github.com/microsoft/WSL/issues/4071
    return "microsoft" in " ".join(uname()).lower()
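A short usage note (illustrative, not part of this commit): Counter is a plain monotonically increasing id source, e.g. for request or sequence ids, and the memory helpers above return raw byte counts.

# Illustrative only: Counter yields an increasing integer id on each next() call.
ids = Counter()          # starts at 0 by default
first = next(ids)        # 0
second = next(ids)       # 1
ids.reset()              # counter back to 0

# The memory helpers return bytes, e.g. on a hypothetical 64 GiB host:
# get_cpu_memory() -> 68_719_476_736
# get_gpu_memory(0) needs a CUDA device; it wraps
# torch.cuda.get_device_properties(0).total_memory.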
server/vllm/vllm/worker/__init__.py
0 → 100644
View file @
70056d1e
server/vllm/vllm/worker/cache_engine.py
0 → 100644
View file @
70056d1e
"""CacheEngine class for managing the KV cache."""
from typing import Dict, List, Tuple

import torch

from vllm import cache_ops
from vllm.config import CacheConfig, ModelConfig, ParallelConfig
from vllm.logger import init_logger
from vllm.utils import in_wsl

logger = init_logger(__name__)

KVCache = Tuple[torch.Tensor, torch.Tensor]


class CacheEngine:
    """Manages the KV cache.

    This class is responsible for initializing and managing the GPU and CPU KV
    caches. It also provides methods for performing KV cache operations, such
    as swapping and copying.
    """

    def __init__(
        self,
        cache_config: CacheConfig,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
    ) -> None:
        self.cache_config = cache_config
        self.model_config = model_config
        self.parallel_config = parallel_config

        self.head_size = model_config.get_head_size()
        self.num_layers = model_config.get_num_layers(parallel_config)
        self.num_heads = model_config.get_num_kv_heads(parallel_config)
        self.dtype = model_config.dtype

        self.block_size = cache_config.block_size
        self.num_gpu_blocks = cache_config.num_gpu_blocks
        self.num_cpu_blocks = cache_config.num_cpu_blocks

        # Initialize the cache.
        self.gpu_cache = self.allocate_gpu_cache()
        self.cpu_cache = self.allocate_cpu_cache()

        # Initialize the stream for caching operations.
        self.cache_stream = torch.cuda.Stream()
        assert self.cache_stream != torch.cuda.current_stream()
        # Initialize the events for stream synchronization.
        self.events = [torch.cuda.Event() for _ in range(self.num_layers)]

    def get_key_block_shape(self) -> Tuple[int, int, int, int]:
        element_size = torch.tensor([], dtype=self.dtype).element_size()
        x = 16 // element_size
        return (
            self.num_heads,
            self.head_size // x,
            self.block_size,
            x,
        )

    def get_value_block_shape(self) -> Tuple[int, int, int]:
        return (
            self.num_heads,
            self.head_size,
            self.block_size,
        )

    def allocate_gpu_cache(self) -> List[KVCache]:
        gpu_cache: List[KVCache] = []
        key_block_shape = self.get_key_block_shape()
        value_block_shape = self.get_value_block_shape()
        for _ in range(self.num_layers):
            key_blocks = torch.empty(
                size=(self.num_gpu_blocks, *key_block_shape),
                dtype=self.dtype,
                device="cuda",
            )
            value_blocks = torch.empty(
                size=(self.num_gpu_blocks, *value_block_shape),
                dtype=self.dtype,
                device="cuda",
            )
            gpu_cache.append((key_blocks, value_blocks))
        return gpu_cache

    def allocate_cpu_cache(self) -> List[KVCache]:
        cpu_cache: List[KVCache] = []
        key_block_shape = self.get_key_block_shape()
        value_block_shape = self.get_value_block_shape()
        pin_memory = not in_wsl()
        if not pin_memory:
            # Pinning memory in WSL is not supported.
            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
            logger.warning("Using 'pin_memory=False' as WSL is detected. "
                           "This may slow down the performance.")
        for _ in range(self.num_layers):
            key_blocks = torch.empty(
                size=(self.num_cpu_blocks, *key_block_shape),
                dtype=self.dtype,
                pin_memory=pin_memory,
            )
            value_blocks = torch.empty(
                size=(self.num_cpu_blocks, *value_block_shape),
                dtype=self.dtype,
                pin_memory=pin_memory,
            )
            cpu_cache.append((key_blocks, value_blocks))
        return cpu_cache

    def _swap(
        self,
        src: List[KVCache],
        dst: List[KVCache],
        src_to_dst: Dict[int, int],
    ) -> None:
        with torch.cuda.stream(self.cache_stream):
            for i in range(self.num_layers):
                src_key_cache, src_value_cache = src[i]
                dst_key_cache, dst_value_cache = dst[i]
                # Copy the key blocks.
                cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
                # Copy the value blocks.
                cache_ops.swap_blocks(src_value_cache, dst_value_cache,
                                      src_to_dst)
                event = self.events[i]
                event.record(stream=self.cache_stream)

    def swap_in(self, src_to_dst: Dict[int, int]) -> None:
        self._swap(self.cpu_cache, self.gpu_cache, src_to_dst)

    def swap_out(self, src_to_dst: Dict[int, int]) -> None:
        self._swap(self.gpu_cache, self.cpu_cache, src_to_dst)

    def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
        key_caches = [key_cache for key_cache, _ in self.gpu_cache]
        value_caches = [value_cache for _, value_cache in self.gpu_cache]
        # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU.
        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)

    @staticmethod
    def get_cache_block_size(
        block_size: int,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
    ) -> int:
        head_size = model_config.get_head_size()
        num_heads = model_config.get_num_kv_heads(parallel_config)
        num_layers = model_config.get_num_layers(parallel_config)

        key_cache_block = block_size * num_heads * head_size
        value_cache_block = key_cache_block
        total = num_layers * (key_cache_block + value_cache_block)
        dtype_size = _get_dtype_size(model_config.dtype)
        return dtype_size * total


def _get_dtype_size(dtype: torch.dtype) -> int:
    return torch.tensor([], dtype=dtype).element_size()
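As a worked example (all dimensions below are hypothetical, roughly a 7B-class model in fp16), this is the arithmetic behind CacheEngine.get_cache_block_size and the GPU block budget the worker later derives from it:

# Illustrative arithmetic only; the numbers are assumed, not measured.
block_size = 16        # tokens per KV-cache block
num_kv_heads = 32
head_size = 128
num_layers = 32
dtype_size = 2         # bytes per element for float16/bfloat16

key_cache_block = block_size * num_kv_heads * head_size        # 65_536 elements
value_cache_block = key_cache_block
total_elements = num_layers * (key_cache_block + value_cache_block)
cache_block_size = dtype_size * total_elements                 # 8_388_608 bytes (8 MiB)

# With, say, 80 GiB of GPU memory, 90% utilization and a 60 GiB profiled peak:
total_gpu_memory = 80 * 1024**3
peak_memory = 60 * 1024**3
num_gpu_blocks = int((total_gpu_memory * 0.9 - peak_memory) // cache_block_size)
# -> 1536 blocks, i.e. 24_576 cacheable token slots at block_size=16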
server/vllm/vllm/worker/worker.py
0 → 100644
View file @
70056d1e
"""A GPU worker class."""
import os
from typing import Dict, List, Tuple, Optional

import torch
import torch.distributed

from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
from vllm.model_executor import get_model, InputMetadata, set_random_seed
from vllm.model_executor.parallel_utils.parallel_state import (
    initialize_model_parallel)
from vllm.sampling_params import SamplingParams
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.worker.cache_engine import CacheEngine
from vllm.utils import get_gpu_memory, get_max_shared_memory_bytes


class Worker:
    """A worker class that executes (a partition of) the model on a GPU.

    Each worker is associated with a single GPU. The worker is responsible for
    maintaining the KV cache and executing the model on the GPU. In case of
    distributed inference, each worker is assigned a partition of the model.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        rank: Optional[int] = None,
        distributed_init_method: Optional[str] = None,
    ) -> None:
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.rank = rank
        self.distributed_init_method = distributed_init_method

        # Uninitialized cache engine. Will be initialized by
        # self.init_cache_engine().
        self.cache_config = None
        self.block_size = None
        self.sliding_window = None
        self.cache_engine = None
        self.cache_events = None
        self.gpu_cache = None

    def init_model(self):
        # This env var set by Ray causes exceptions with graph building.
        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
        # Env vars will be set by Ray.
        self.rank = self.rank if self.rank is not None else int(
            os.getenv("RANK", "-1"))
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        self.device = torch.device(f"cuda:{local_rank}")
        if self.rank < 0:
            raise ValueError("Invalid or unspecified rank.")
        torch.cuda.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)

        # Initialize the distributed environment.
        _init_distributed_environment(self.parallel_config, self.rank,
                                      self.distributed_init_method)

        # Initialize the model.
        set_random_seed(self.model_config.seed)
        self.model = get_model(self.model_config)

    @torch.inference_mode()
    def profile_num_available_blocks(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        cpu_swap_space: int,
    ) -> Tuple[int, int]:
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()

        # Profile memory usage with max_num_sequences sequences and the total
        # number of tokens equal to max_num_batched_tokens.

        # Enable top-k sampling to reflect the accurate memory usage.
        vocab_size = self.model.config.vocab_size
        sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
        max_num_seqs = self.scheduler_config.max_num_seqs
        seqs = []
        for group_id in range(max_num_seqs):
            seq_len = (max_num_batched_tokens // max_num_seqs +
                       (group_id < max_num_batched_tokens % max_num_seqs))
            seq_data = SequenceData([0] * seq_len)
            seq = SequenceGroupMetadata(
                request_id=str(group_id),
                is_prompt=True,
                seq_data={group_id: seq_data},
                sampling_params=sampling_params,
                block_tables=None,
            )
            seqs.append(seq)

        input_tokens, input_positions, input_metadata = self._prepare_inputs(
            seqs)

        # Execute the model.
        num_layers = self.model_config.get_num_layers(self.parallel_config)
        self.model(
            input_ids=input_tokens,
            positions=input_positions,
            kv_caches=[(None, None)] * num_layers,
            input_metadata=input_metadata,
            cache_events=None,
        )

        # Calculate the number of blocks that can be allocated with the
        # profiled peak memory.
        torch.cuda.synchronize()
        peak_memory = torch.cuda.max_memory_allocated()
        total_gpu_memory = get_gpu_memory()
        cache_block_size = CacheEngine.get_cache_block_size(
            block_size, self.model_config, self.parallel_config)
        num_gpu_blocks = int(
            (total_gpu_memory * gpu_memory_utilization - peak_memory) //
            cache_block_size)
        num_cpu_blocks = int(cpu_swap_space // cache_block_size)
        num_gpu_blocks = max(num_gpu_blocks, 0)
        num_cpu_blocks = max(num_cpu_blocks, 0)
        torch.cuda.empty_cache()

        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)
        return num_gpu_blocks, num_cpu_blocks

    def init_cache_engine(self, cache_config: CacheConfig) -> None:
        self.cache_config = cache_config
        self.block_size = cache_config.block_size
        self.sliding_window = cache_config.sliding_window

        if self.sliding_window is None:
            max_seq_len = self.scheduler_config.max_model_len
        else:
            max_seq_len = min(self.scheduler_config.max_model_len,
                              self.sliding_window)
        _check_if_can_support_max_seq_len(max_seq_len, self.block_size)

        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
                                        self.parallel_config)
        self.cache_events = self.cache_engine.events
        self.gpu_cache = self.cache_engine.gpu_cache

    def _prepare_inputs(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata]:
        seq_groups: List[Tuple[List[int], SamplingParams]] = []
        input_tokens: List[List[int]] = []
        input_positions: List[List[int]] = []
        slot_mapping: List[List[int]] = []

        # Add prompt tokens.
        prompt_lens: List[int] = []
        for seq_group_metadata in seq_group_metadata_list:
            if not seq_group_metadata.is_prompt:
                continue

            seq_ids = list(seq_group_metadata.seq_data.keys())
            sampling_params = seq_group_metadata.sampling_params
            seq_groups.append((seq_ids, sampling_params))

            # Use any sequence in the group.
            seq_id = seq_ids[0]

            seq_data = seq_group_metadata.seq_data[seq_id]
            prompt_tokens = seq_data.get_token_ids()
            prompt_len = len(prompt_tokens)
            prompt_lens.append(prompt_len)

            input_tokens.append(prompt_tokens)
            # NOTE(woosuk): Here we assume that the first token in the prompt
            # is always the first token in the sequence.
            input_positions.append(list(range(prompt_len)))

            if seq_group_metadata.block_tables is None:
                # During memory profiling, the block tables are not initialized
                # yet. In this case, we just use a dummy slot mapping.
                slot_mapping.append([0] * prompt_len)
                continue

            # Compute the slot mapping.
            slot_mapping.append([])
            block_table = seq_group_metadata.block_tables[seq_id]
            for i in range(prompt_len):
                block_number = block_table[i // self.block_size]
                block_offset = i % self.block_size
                slot = block_number * self.block_size + block_offset
                slot_mapping[-1].append(slot)

        # Add generation tokens.
        max_context_len = 0
        max_num_blocks_per_seq = 0
        context_lens: List[int] = []
        generation_block_tables: List[List[int]] = []
        for seq_group_metadata in seq_group_metadata_list:
            if seq_group_metadata.is_prompt:
                continue

            seq_ids = list(seq_group_metadata.seq_data.keys())
            sampling_params = seq_group_metadata.sampling_params
            seq_groups.append((seq_ids, sampling_params))

            for seq_id in seq_ids:
                seq_data = seq_group_metadata.seq_data[seq_id]
                generation_token = seq_data.get_last_token_id()
                input_tokens.append([generation_token])

                context_len = seq_data.get_len()
                position = context_len - 1
                if self.sliding_window is not None:
                    context_len = min(context_len, self.sliding_window)
                input_positions.append([position])

                block_table = seq_group_metadata.block_tables[seq_id]

                max_context_len = max(max_context_len, context_len)
                max_num_blocks_per_seq = max(max_num_blocks_per_seq,
                                             len(block_table))
                context_lens.append(context_len)

                block_number = block_table[position // self.block_size]
                block_offset = position % self.block_size
                slot = block_number * self.block_size + block_offset
                slot_mapping.append([slot])

                if self.sliding_window is not None:
                    sliding_window_blocks = (self.sliding_window //
                                             self.block_size)
                    block_table = block_table[-sliding_window_blocks:]
                generation_block_tables.append(block_table)

        max_seq_len = max(prompt_lens) if prompt_lens else 1
        padded_input_tokens = [
            _pad_to_max(tokens, max_seq_len, pad=0) for tokens in input_tokens
        ]
        padded_input_positions = [
            _pad_to_max(positions, max_seq_len, pad=0)
            for positions in input_positions
        ]
        padded_slot_mapping = [
            _pad_to_max(mapping, max_seq_len, pad=-1)
            for mapping in slot_mapping
        ]
        padded_block_tables = [
            _pad_to_max(block_table, max_num_blocks_per_seq, pad=0)
            for block_table in generation_block_tables
        ]

        # Convert to tensors.
        tokens_tensor = torch.tensor(padded_input_tokens,
                                     dtype=torch.long,
                                     device="cuda")
        positions_tensor = torch.tensor(padded_input_positions,
                                        dtype=torch.long,
                                        device="cuda")
        slot_mapping_tensor = torch.tensor(padded_slot_mapping,
                                           dtype=torch.int,
                                           device="cuda")
        context_lens_tensor = torch.tensor(context_lens,
                                           dtype=torch.int,
                                           device="cuda")
        block_tables_tensor = torch.tensor(padded_block_tables,
                                           dtype=torch.int,
                                           device="cuda")

        seq_data: Dict[int, SequenceData] = {}
        for seq_group_metadata in seq_group_metadata_list:
            seq_data.update(seq_group_metadata.seq_data)

        input_metadata = InputMetadata(
            seq_groups=seq_groups,
            seq_data=seq_data,
            prompt_lens=prompt_lens,
            slot_mapping=slot_mapping_tensor,
            context_lens=context_lens_tensor,
            max_context_len=max_context_len,
            block_tables=block_tables_tensor,
            sliding_window=self.sliding_window,
        )
        return tokens_tensor, positions_tensor, input_metadata

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> SamplerOutput:
        # Issue cache operations.
        issued_cache_op = False
        if blocks_to_swap_in:
            self.cache_engine.swap_in(blocks_to_swap_in)
            issued_cache_op = True
        if blocks_to_swap_out:
            self.cache_engine.swap_out(blocks_to_swap_out)
            issued_cache_op = True
        if blocks_to_copy:
            self.cache_engine.copy(blocks_to_copy)
            issued_cache_op = True

        if issued_cache_op:
            cache_events = self.cache_events
        else:
            cache_events = None

        # If there is no input, we don't need to execute the model.
        if not seq_group_metadata_list:
            if cache_events is not None:
                for event in cache_events:
                    event.wait()
            return {}

        # Prepare input tensors.
        input_tokens, input_positions, input_metadata = self._prepare_inputs(
            seq_group_metadata_list)

        # Execute the model.
        output = self.model(
            input_ids=input_tokens,
            positions=input_positions,
            kv_caches=self.gpu_cache,
            input_metadata=input_metadata,
            cache_events=cache_events,
        )
        return output


def _init_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
) -> None:
    """Initialize the distributed environment."""
    if torch.distributed.is_initialized():
        torch_world_size = torch.distributed.get_world_size()
        if torch_world_size != parallel_config.world_size:
            raise RuntimeError(
                "torch.distributed is already initialized but the torch world "
                "size does not match parallel_config.world_size "
                f"({torch_world_size} vs. {parallel_config.world_size}).")
    elif not distributed_init_method:
        raise ValueError(
            "distributed_init_method must be set if torch.distributed "
            "is not already initialized")
    else:
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=parallel_config.world_size,
            rank=rank,
            init_method=distributed_init_method,
        )

    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cuda())
    initialize_model_parallel(parallel_config.tensor_parallel_size,
                              parallel_config.pipeline_parallel_size)


def _pad_to_alignment(x: List[int], multiple_of: int, pad: int) -> List[int]:
    return x + [pad] * ((-len(x)) % multiple_of)


def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
    return x + [pad] * (max_len - len(x))


def _check_if_can_support_max_seq_len(max_seq_len: int,
                                      block_size: int) -> None:
    # Follows the logic in
    # attention_kernels.cu::single_query_cached_kv_attention_launcher
    max_shared_mem = get_max_shared_memory_bytes()
    float32_bytes = torch.finfo(torch.float).bits // 8
    padded_max_seq_len = (
        (max_seq_len + block_size - 1) / block_size) * block_size
    # padded_max_seq_len + extra buffer
    required_shared_mem = (padded_max_seq_len + 512) * float32_bytes
    if padded_max_seq_len * float32_bytes > max_shared_mem:
        raise RuntimeError(
            f"vLLM cannot currently support max_model_len={max_seq_len} "
            f"with block_size={block_size} on GPU with compute "
            f"capability {torch.cuda.get_device_capability()} "
            f"(required shared memory {required_shared_mem} > "
            f"available shared memory {max_shared_mem}). "
            "This will be fixed in a future release.")


def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
    # Check if the GPU supports the dtype.
    if torch_dtype == torch.bfloat16:
        compute_capability = torch.cuda.get_device_capability()
        if compute_capability[0] < 8:
            gpu_name = torch.cuda.get_device_name()
            raise ValueError(
                "Bfloat16 is only supported on GPUs with compute capability "
                f"of at least 8.0. Your {gpu_name} GPU has compute capability "
                f"{compute_capability[0]}.{compute_capability[1]}.")
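For reference, a small worked example (with made-up numbers) of the slot-mapping arithmetic used in _prepare_inputs: each token position is split into a block index and an offset within that block, then mapped through the sequence's block table to a physical cache slot. The surrounding engine code (not shown in this commit) is expected to call init_model, profile_num_available_blocks, and init_cache_engine before execute_model.

# Illustrative only: slot mapping for one sequence (all numbers are hypothetical).
block_size = 16
block_table = [7, 3, 12]      # physical block numbers assigned to this sequence
position = 37                 # token position within the sequence

block_number = block_table[position // block_size]   # block_table[2] == 12
block_offset = position % block_size                 # 37 % 16 == 5
slot = block_number * block_size + block_offset      # 12 * 16 + 5 == 197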