jerrrrry / verl_mixtral_8x7B · Commits · f87b35b2

Commit f87b35b2 authored Apr 17, 2025 by jerrrrry

Initial commit

Pipeline #2648 failed with stages in 0 seconds
Changes: 363 · Pipelines: 1
Showing 20 changed files with 4996 additions and 0 deletions (+4996, -0)
verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py    +269  -0
verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py          +91   -0
verl/third_party/vllm/vllm_v_0_4_2/llm.py                       +306  -0
verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py             +283  -0
verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py   +348  -0
verl/third_party/vllm/vllm_v_0_4_2/model_loader.py              +265  -0
verl/third_party/vllm/vllm_v_0_4_2/model_runner.py              +281  -0
verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py            +294  -0
verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py         +218  -0
verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py                 +77   -0
verl/third_party/vllm/vllm_v_0_4_2/worker.py                    +292  -0
verl/third_party/vllm/vllm_v_0_5_4/__init__.py                  +13   -0
verl/third_party/vllm/vllm_v_0_5_4/arg_utils.py                 +453  -0
verl/third_party/vllm/vllm_v_0_5_4/config.py                    +246  -0
verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loaders.py    +340  -0
verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py          +44   -0
verl/third_party/vllm/vllm_v_0_5_4/llm.py                       +239  -0
verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py             +328  -0
verl/third_party/vllm/vllm_v_0_5_4/megatron_weight_loaders.py   +307  -0
verl/third_party/vllm/vllm_v_0_5_4/model_loader.py              +302  -0
verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict, Iterable, Tuple

import torch
import torch.nn as nn
from torch.distributed._tensor import DTensor, Shard, Replicate
from vllm.model_executor.layers.linear import *
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.model_loader.weight_utils import default_weight_loader


def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        for (param_name, shard_name, shard_id) in stacked_params_mapping:
            if shard_name not in name:
                continue
            stacked_name = name.replace(shard_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if stacked_name.endswith(".bias") and stacked_name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[stacked_name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # lm_head is not used in vllm as it is tied with embed_token.
            # To prevent errors, skip loading lm_head.weight.
            if "lm_head.weight" in name:
                continue
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            # GemmaRMSNorm is different from Llama's in that it multiplies
            # (1 + weight) to the output, instead of just weight.
            if "norm.weight" in name:
                local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
                norm_weight = local_loaded_weight + 1.0
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, norm_weight.to(dtype=param.dtype))
            else:
                local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def gptbigcode_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "lm_head.weight" in name:
            continue
        if ".attn.bias" in name:
            # Skip attention mask.
            # NOTE: "c_attn.bias" should not be skipped.
            continue
        local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
        param = params_dict[name]
        weight_loader = getattr(param, "weight_loader", default_weight_loader)
        weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def starcoder2_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
                continue
            param = params_dict[name]
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def llama_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        (".qkv_proj", ".q_proj", "q"),
        (".qkv_proj", ".k_proj", "k"),
        (".qkv_proj", ".v_proj", "v"),
        (".gate_up_proj", ".gate_proj", 0),
        (".gate_up_proj", ".up_proj", 1),
    ]
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        if ("rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name):
            # Models trained using ColossalAI may include these tensors in
            # the checkpoint. Skip them.
            continue
        # With tie_word_embeddings, we can skip lm_head.weight
        # The weight might appear unnecessarily in the files if the model is
        # processed with quantization, LoRA, fine-tuning, etc.
        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight)


def qwen2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            param = params_dict[name]
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def gpt2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    pass


def redistribute_dtensor(param_name: str, loaded_weights: DTensor, parallelize_plan: Dict = None):
    param_name = _process_parameter_names(name=param_name)
    if parallelize_plan is not None:
        assert param_name in parallelize_plan.keys(), \
            f"param name: {param_name} not in parallelize_plan :{parallelize_plan.keys()}"
        placement = parallelize_plan[param_name]
        local_loaded_weights = loaded_weights.redistribute(device_mesh=loaded_weights.device_mesh,
                                                           placements=placement).to_local()
    else:
        local_loaded_weights = loaded_weights.full_tensor()
    return local_loaded_weights


def _process_parameter_names(name):
    # Remove '.weight' if it exists at the end of the string
    if name.endswith(".weight"):
        name = name[:-7]

    # Remove 'model.layers.x.' or 'model.' prefix
    if "model.layers" in name:
        parts = name.split('.')
        # Reconstruct the string without 'model.layers.x.'
        name = '.'.join(parts[3:])  # parts[0] is 'model', parts[1] is 'layers', parts[2] is 'x'
    elif name.startswith("model."):
        name = name[6:]  # Remove 'model.'

    return name


__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__ = {
    'GPT2LMHeadModel': gpt2_dtensor_weight_loader,
    'LlamaForCausalLM': llama_dtensor_weight_loader,
    'LLaMAForCausalLM': llama_dtensor_weight_loader,
    'MistralForCausalLM': llama_dtensor_weight_loader,  # mistral is the same as llama in vLLM
    'InternLMForCausalLM': llama_dtensor_weight_loader,
    'AquilaModel': llama_dtensor_weight_loader,
    'AquilaForCausalLM': llama_dtensor_weight_loader,
    'Phi3ForCausalLM': llama_dtensor_weight_loader,
    'GemmaForCausalLM': gemma_dtensor_weight_loader,
    'GPTBigCodeForCausalLM': gptbigcode_dtensor_load_weights,
    'Starcoder2ForCausalLM': starcoder2_dtensor_load_weights,
    'Qwen2ForCausalLM': qwen2_dtensor_weight_loader
}


# the actor model is .state_dict()
# Load dtensor weights
def load_dtensor_weights(actor_weights: Dict, vllm_model: nn.Module):
    weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
    weight_loader(actor_weights, vllm_model)
    # NOTE(sgm) to reduce peak memory usage, we offload vllm model to cpu
    # after init, and we need this after sync model weights for in first iter.
    vllm_model = vllm_model.cuda()


def _get_model_weight_loader(arch: str):
    if arch in __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__:
        return __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__[arch]
    raise ValueError(f"Model architectures {arch} are not supported for now. "
                     f"Supported architectures: {__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__.keys()}")


# NOTE(sgm): we use per-parameter weight loader in each vllm sub
def update_dtensor_weight_loader():
    pass
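As a quick sanity check of the name handling above (a sketch, assuming vLLM 0.4.2 and torch are installed so the module imports; the parameter names are hypothetical examples): _process_parameter_names strips the trailing '.weight' and the 'model.layers.x.' or 'model.' prefix before the parallelize_plan lookup, and load_dtensor_weights dispatches on the vLLM model's class name through the registry.

# Illustrative check of the prefix/suffix stripping used before parallelize_plan lookups.
from verl.third_party.vllm.vllm_v_0_4_2.dtensor_weight_loaders import _process_parameter_names

assert _process_parameter_names("model.layers.0.self_attn.qkv_proj.weight") == "self_attn.qkv_proj"
assert _process_parameter_names("model.embed_tokens.weight") == "embed_tokens"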
verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict, Union, Optional, Iterable, Tuple

import torch
import torch.nn as nn
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.model_loader.weight_utils import default_weight_loader


def update_hf_weight_loader():
    from vllm.model_executor.models.gemma import GemmaForCausalLM
    GemmaForCausalLM.load_weights = gemma_load_weights


def gemma_load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    params_dict = dict(self.named_parameters())
    loaded_params = set()
    for name, loaded_weight in weights:
        for (param_name, shard_name, shard_id) in stacked_params_mapping:
            if shard_name not in name:
                continue
            name = name.replace(shard_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, loaded_weight, shard_id)
            break
        else:
            # lm_head is not used in vllm as it is tied with embed_token.
            # To prevent errors, skip loading lm_head.weight.
            if "lm_head.weight" in name:
                continue
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            # GemmaRMSNorm is different from Llama's in that it multiplies
            # (1 + weight) to the output, instead of just weight.
            if "norm.weight" in name:
                norm_weight = loaded_weight + 1.0  # prevent inplace modify actor weights
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, norm_weight)
            else:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, loaded_weight)
        loaded_params.add(name)
    unloaded_params = params_dict.keys() - loaded_params
    if unloaded_params:
        raise RuntimeError("Some weights are not initialized from checkpoints: "
                           f"{unloaded_params}")


def load_hf_weights(actor_weights: Dict, vllm_model: nn.Module):
    assert isinstance(actor_weights, Dict)
    with set_default_torch_dtype(next(vllm_model.parameters()).dtype):  # TODO
        vllm_model.load_weights(actor_weights.items())
        for _, module in vllm_model.named_modules():
            quant_method = getattr(module, "quant_method", None)
            if quant_method is not None:
                quant_method.process_weights_after_loading(module)
            # FIXME: Remove this after Mixtral is updated
            # to use quant_method.
            if hasattr(module, "process_weights_after_loading"):
                module.process_weights_after_loading()
    vllm_model = vllm_model.cuda()
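A minimal, runnable illustration (pure torch, CPU only; the tensor values are made-up examples) of the norm-weight handling in gemma_load_weights above: the loader adds 1.0 to the checkpoint's norm weight, and the addition is out of place, so the actor's original tensor is left untouched.

import torch

actor_norm_weight = torch.zeros(4)                       # stand-in for a "norm.weight" entry from the actor
norm_weight = actor_norm_weight + 1.0                    # out-of-place add, as in gemma_load_weights
assert torch.equal(norm_weight, torch.ones(4))           # value handed to the vLLM weight loader
assert torch.equal(actor_norm_weight, torch.zeros(4))    # actor weights are not modified in place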
verl/third_party/vllm/vllm_v_0_4_2/llm.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
from typing import Dict, List, Optional, Tuple, Union

from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import PretrainedConfig
import torch.nn as nn

from .arg_utils import EngineArgs
from .llm_engine_sp import LLMEngine
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import MultiModalData
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer


class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    NOTE: This class is intended to be used for offline inference. For online
    serving, use the `AsyncLLMEngine` class instead.

    NOTE: For the comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: A HuggingFace Transformers model instance.
        tokenizer: A HuggingFace Transformers tokenizer instance.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq". If None, we assume the model weights are not
            quantized and use `dtype` to determine the data type of the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Otherwise, too small values may cause out-of-memory (OOM) errors.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode.
        disable_custom_all_reduce: See ParallelConfig
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
        model_hf_config: PretrainedConfig,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        enforce_eager: bool = False,
        max_context_len_to_capture: int = None,
        disable_custom_all_reduce: bool = False,
        load_format='auto',
        **kwargs,
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model_hf_config=model_hf_config,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            load_format=load_format,
            **kwargs,
        )
        tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
        if not isinstance(tokenizer, tokenizer_cls):
            raise ValueError(
                f"Unexpected tokenizer type: {type(tokenizer)}. Must be"
                "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.workers.rollout.HybridEngineBaseTokenizer"
            )
        self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
        self.request_counter = Counter()

    def init_cache_engine(self):
        self.llm_engine.init_cache_engine()

    def free_cache_engine(self):
        self.llm_engine.free_cache_engine()

    def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        self.llm_engine.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: A list of prompts to generate completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
                When it is a single value, it is applied to every prompt.
                When it is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.
            multi_modal_data: Multi modal data.

        Returns:
            A list of `RequestOutput` objects containing the generated
            completions in the same order as the input prompts.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be "
                             "provided.")
        if self.llm_engine.model_config.skip_tokenizer_init \
                and prompts is not None:
            raise ValueError("prompts must be None if skip_tokenizer_init "
                             "is True")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if (prompts is not None and prompt_token_ids is not None
                and len(prompts) != len(prompt_token_ids)):
            raise ValueError("The lengths of prompts and prompt_token_ids "
                             "must be the same.")

        if prompts is not None:
            num_requests = len(prompts)
        else:
            assert prompt_token_ids is not None
            num_requests = len(prompt_token_ids)

        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()
        elif isinstance(sampling_params, list) and len(sampling_params) != num_requests:
            raise ValueError("The lengths of prompts and sampling_params "
                             "must be the same.")
        if multi_modal_data:
            multi_modal_data.data = multi_modal_data.data.to(torch.float16)

        # Add requests to the engine.
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
            if not isinstance(token_ids, list):
                # NOTE(shengguangming): convert the rollout input into List[str]
                token_ids = self._pre_process_inputs(token_ids)
            self._add_request(
                prompt,
                sampling_params[i] if isinstance(sampling_params, list) else sampling_params,
                token_ids,
                lora_request=lora_request,
                # Get ith image while maintaining the batch dim.
                multi_modal_data=MultiModalData(type=multi_modal_data.type,
                                                data=multi_modal_data.data[i].unsqueeze(0))
                if multi_modal_data else None,
            )
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    multi_modal_data=multi_modal_data)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(total=num_requests, desc="Processed prompts", dynamic_ncols=True)
        # Run the engine.
        outputs: List[RequestOutput] = []
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # its previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        # TODO(shengguangming): maybe we can hack the autoregressive logics without only apply post process for better performance
        return self._post_process_outputs(outputs)

    # NOTE(shengguangming): add for verl
    # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
    def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
        # remove the left padding in the prompt token_id
        pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id \
            is not None else self.llm_engine.tokenizer.eos_token_id
        non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
        token_ids = prompt_token_ids[non_pad_index:].tolist()
        return token_ids

    # NOTE(shengguangming): add for verl
    def _post_process_outputs(self, request_outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
        output_token_ids = []
        logprobs = []
        for request_output in request_outputs:  # List[RequestOutput]
            outputs = request_output.outputs
            for output in outputs:  # List[CompletionOutput], usually len == 1
                output_token_ids.append(torch.tensor(output.token_ids))
                # TODO(shengguangming): can be optimzied by rewrite the Sampler._get_logprobs() logits
                logprobs_dicts = output.logprobs
                if logprobs_dicts is not None:
                    logprob = []
                    for logprobs_dict, id in zip(logprobs_dicts, output.token_ids):
                        logprob.append(logprobs_dict[id].logprob)
                    logprobs.append(torch.tensor(logprob))

        pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id \
            is not None else self.llm_engine.tokenizer.eos_token_id
        output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
        if len(logprobs) > 0:
            logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
        return output_token_ids, logprobs

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
        self.llm_engine.sync_model_weights(actor_weights=actor_weights, load_format=load_format)

    def offload_model_weights(self) -> None:
        self.llm_engine.offload_model_weights()
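A small, runnable illustration (pure torch; the token IDs are made-up examples) of the left-padding removal performed by _pre_process_inputs above: everything before the first non-pad token is dropped before the prompt is handed to the engine.

import torch

pad_token_id = 0
prompt_token_ids = torch.tensor([0, 0, 0, 15, 27, 42])   # a left-padded prompt row
non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
assert prompt_token_ids[non_pad_index:].tolist() == [15, 27, 42]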
verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
import torch
from typing import Dict, Optional, Union, Type

import vllm
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig,
                         SpeculativeConfig, VisionLanguageConfig)
from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.interfaces import (SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.engine.metrics import StatLogger
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message)
from vllm.utils import Counter
from vllm.engine.llm_engine import _load_generation_config_dict
from vllm.engine.llm_engine import LLMEngine
import torch.nn as nn

from .arg_utils import EngineArgs
from .tokenizer import TokenizerGroup
from .config import ModelConfig, LoadConfig

logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5


class LLMEngine(LLMEngine):
    """An LLM engine that receives requests and generates texts.

    This is the main class for the vLLM engine. It receives requests
    from clients and generates texts from the LLM. It includes a tokenizer, a
    language model (possibly distributed across multiple GPUs), and GPU memory
    space allocated for intermediate states (aka KV cache). This class utilizes
    iteration-level scheduling and efficient memory management to maximize the
    serving throughput.

    The `LLM` class wraps this class for offline batched inference and the
    `AsyncLLMEngine` class wraps this class for online serving.

    NOTE: The config arguments are derived from the `EngineArgs` class. For the
    comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: the actor model initialize outside vllm (add for verl)
        tokenizer: the initialized tokenizer (add for verl)
        model_config: The configuration related to the LLM model.
        cache_config: The configuration related to the KV cache memory
            management.
        parallel_config: The configuration related to distributed execution.
        scheduler_config: The configuration related to the request scheduler.
        distributed_init_method: The initialization method for distributed
            execution. See `torch.distributed.init_process_group` for details.
        placement_group: Ray placement group for distributed execution.
            Required for distributed execution.
        log_stats: Whether to log statistics.
    """

    def __init__(
        self,
        # NOTE(sgm): first two arguments are added for verl
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: nn.Module,
        # NOTE(sgm): vllm original arguments
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        vision_language_config: Optional[VisionLanguageConfig],
        speculative_config: Optional[SpeculativeConfig],
        decoding_config: Optional[DecodingConfig],
        executor_class: Type[ExecutorBase],
        log_stats: bool,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> None:
        logger.info(
            "Initializing an LLM engine (v%s) with config: "
            "model=%r, speculative_config=%r, tokenizer=%r, "
            "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
            "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
            "max_seq_len=%d, download_dir=%r, load_format=%s, "
            "tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
            "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
            "quantization_param_path=%s, device_config=%s, "
            "decoding_config=%r, seed=%d, served_model_name=%s)",
            vllm.__version__,
            model_config.model,
            speculative_config,
            model_config.tokenizer,
            model_config.skip_tokenizer_init,
            # model_config.tokenizer_mode,
            model_config.revision,
            model_config.tokenizer_revision,
            # model_config.trust_remote_code,
            model_config.dtype,
            model_config.max_model_len,
            load_config.download_dir,
            load_config.load_format,
            parallel_config.tensor_parallel_size,
            parallel_config.disable_custom_all_reduce,
            model_config.quantization,
            model_config.enforce_eager,
            cache_config.cache_dtype,
            model_config.quantization_param_path,
            device_config.device,
            decoding_config,
            model_config.seed,
            # model_config.served_model_name,
        )
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config  # TODO: currently is hfconfig
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.vision_language_config = vision_language_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.speculative_config = speculative_config
        self.load_config = load_config
        self.decoding_config = decoding_config or DecodingConfig()
        self.log_stats = log_stats
        # self.model = model # should not store the model, it should be deleted
        # TODO(shengguangming): maybe we can choose init here or from arguments
        if not self.model_config.skip_tokenizer_init:
            # TODO: check tokenizer class
            self._init_tokenizer(tokenizer)
            self.detokenizer = Detokenizer(self.tokenizer)
        else:
            self.detokenizer = None
            self.tokenizer = None

        self.seq_counter = Counter()
        # TODO: don't know what's the usage
        self.generation_config_fields = _load_generation_config_dict(model_config)

        self.model_executor = executor_class(
            model=model,  # add for spmd_gpu_executor
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            vision_language_config=vision_language_config,
            speculative_config=speculative_config,
            load_config=load_config,
        )

        # Profile the memory usage and initialize the cache.
        self._initialize_kv_caches()

        # If usage stat is enabled, collect relevant info.
        if is_usage_stats_enabled():
            from vllm.model_executor.model_loader import (get_architecture_class_name)
            usage_message.report_usage(
                get_architecture_class_name(model_config),
                usage_context,
                extra_kvs={
                    # Common configuration
                    "dtype": str(model_config.dtype),
                    "tensor_parallel_size": parallel_config.tensor_parallel_size,
                    "block_size": cache_config.block_size,
                    "gpu_memory_utilization": cache_config.gpu_memory_utilization,
                    # Quantization
                    "quantization": model_config.quantization,
                    "kv_cache_dtype": cache_config.cache_dtype,
                    # Feature flags
                    "enable_lora": bool(lora_config),
                    "enable_prefix_caching": cache_config.enable_prefix_caching,
                    "enforce_eager": model_config.enforce_eager,
                    "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
                })

        if self.tokenizer:
            # Ping the tokenizer to ensure liveness if it runs in a
            # different process.
            self.tokenizer.ping()

        # Create the scheduler.
        # NOTE: the cache_config here have been updated with the numbers of
        # GPU and CPU blocks, which are profiled in the distributed executor.
        # NOTE(shengguangming): each process will have independent scheduler
        self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)

        # Metric Logging.
        if self.log_stats:
            self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                                          labels=dict(model_name=model_config.served_model_name),
                                          max_model_len=self.model_config.max_model_len)
            self.stat_logger.info("cache_config", self.cache_config)

        # Create sequence output processor, e.g. for beam search or
        # speculative decoding.
        self.output_processor = (SequenceGroupOutputProcessor.create_output_processor(
            self.scheduler_config,
            self.detokenizer,
            self.scheduler,
            self.seq_counter,
            self.get_tokenizer_for_seq,
            stop_checker=StopChecker(
                self.scheduler_config.max_model_len,
                self.get_tokenizer_for_seq,
            ),
        ))

    # TODO(sgm): add for verl but we may not tokenizer in Rollout
    def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
        init_kwargs = dict(enable_lora=bool(self.lora_config),
                           max_num_seqs=self.scheduler_config.max_num_seqs,
                           max_input_length=None)
        init_kwargs.update(tokenizer_init_kwargs)
        self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **init_kwargs)

    def init_cache_engine(self):
        # TODO: check whether we should rebuild the CUDAGraph every iter when offload/load KVCache
        # Re-capture CUDAGraph would be time-consuming
        self.model_executor.init_cache_engine()

    def free_cache_engine(self):
        self.model_executor.free_cache_engine()

    # NOTE(sgm): currently, we only support GPU executor
    # The GPUExecutor remove the Ray dependency
    @classmethod
    def from_engine_args(
        cls,
        model,
        tokenizer,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_config = engine_args.create_engine_config()

        # Initialize the cluster and specify the executor class.
        assert engine_config.device_config.device_type == "cuda", \
            "Currently, the vllm in verl only support running on GPU"

        if engine_config.parallel_config.world_size == 1:
            engine_config.load_config.load_format = "dummy_hf"

        from .spmd_gpu_executor import SPMDGPUExecutor
        executor_class = SPMDGPUExecutor

        # Create the LLM engine.
        engine = cls(
            model,
            tokenizer,
            **engine_config.to_dict(),
            executor_class=executor_class,
            log_stats=not engine_args.disable_log_stats,
            usage_context=usage_context,
        )
        return engine

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
        self.model_executor.sync_model_weights(actor_weights=actor_weights, load_format=load_format)

    def offload_model_weights(self) -> None:
        self.model_executor.offload_model_weights()
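A hypothetical end-to-end sketch of how this engine is reached (the checkpoint name and every object below are placeholders, and running it needs a GPU plus vLLM 0.4.2, so treat it as illustrative rather than a recipe): the verl LLM wrapper builds EngineArgs and calls LLMEngine.from_engine_args(model, tokenizer, engine_args), which always selects the SPMDGPUExecutor from this package.

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from verl.third_party.vllm.vllm_v_0_4_2.llm import LLM

model_name = "meta-llama/Llama-2-7b-hf"                    # placeholder checkpoint
actor = AutoModelForCausalLM.from_pretrained(model_name)   # actor model initialized outside vLLM
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = LLM(actor, tokenizer, AutoConfig.from_pretrained(model_name),
          tensor_parallel_size=1, dtype="bfloat16",
          gpu_memory_utilization=0.5, load_format="hf")    # load_format is handled by model_loader.py
outputs = llm.generate(prompts=["The capital of France is"])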
verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict

import torch
import torch.nn as nn
from vllm.model_executor.layers.linear import *
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.layers.activation import ScaledActivation
from vllm.model_executor.models import ModelRegistry


# NOTE(shengguangming): replace the origin weight loader function in the class
def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Parallel Linear weight loader."""
    assert param.size() == loaded_weight.size(
    ), 'the parameter size is not align with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
        param.size(), loaded_weight.size())
    assert param.data.dtype == loaded_weight.data.dtype, "if we want to shared weights, the data type should also be the same"

    param.data = loaded_weight.data


def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Default weight loader."""
    assert param.size() == loaded_weight.size()
    assert param.data.dtype == loaded_weight.data.dtype, "if we want to shared weights, the data type should also be the same"

    param.data = loaded_weight.data


def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "lm_head.weight" in name:
            # GPT-2 ties the weights of the embedding layer and the final
            # linear layer.
            continue
        if ".attn.bias" in name or ".attn.masked_bias" in name:
            # Skip attention mask.
            # NOTE: "c_attn.bias" should not be skipped.
            continue
        if not name.startswith("transformer."):
            name = "transformer." + name
        param = params_dict[name]
        # The HF's GPT-2 implementation uses Conv1D instead of Linear.
        # Because of this, we need to transpose the weights.
        # Note(zhuohan): the logic below might break quantized models.
        for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
            if conv1d_weight_name not in name:
                continue
            if not name.endswith(".weight"):
                continue
            # TODO: check megatron
            loaded_weight = loaded_weight.t()
        weight_loader = getattr(param, "weight_loader", default_weight_loader)
        weight_loader(param, loaded_weight)


def llama_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
        ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
        ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('input_layernorm', 'input_layernorm'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def _replace_name(megatron_name, name_mapping):
    for m_name, v_name in name_mapping:
        if m_name not in megatron_name:
            continue
        if 'layers' in megatron_name:  # deal with decoder layers
            megatron_name = megatron_name.replace('decoder', 'model')
            megatron_name_list = megatron_name.split('.')
            if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
                param_name_list = megatron_name_list[:3]
                param_name_list.append(v_name)
                param_name = '.'.join(param_name_list)
            else:
                param_name_list = megatron_name_list[:3]
                weight_or_bias = megatron_name_list[-1]
                param_name_list.append(v_name)
                param_name_list.append(weight_or_bias)
                param_name = '.'.join(param_name_list)
            return param_name
        else:
            param_name = megatron_name.replace(m_name, v_name)
            return param_name


def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
        ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
        ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('input_layernorm', 'input_layernorm'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def _replace_name(megatron_name, name_mapping):
    for m_name, v_name in name_mapping:
        if m_name not in megatron_name:
            continue
        if 'layers' in megatron_name:  # deal with decoder layers
            megatron_name = megatron_name.replace('decoder', 'model')
            megatron_name_list = megatron_name.split('.')
            if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
                param_name_list = megatron_name_list[:3]
                param_name_list.append(v_name)
                param_name = '.'.join(param_name_list)
            else:
                param_name_list = megatron_name_list[:3]
                weight_or_bias = megatron_name_list[-1]
                param_name_list.append(v_name)
                param_name_list.append(weight_or_bias)
                param_name = '.'.join(param_name_list)
            return param_name
        else:
            param_name = megatron_name.replace(m_name, v_name)
            return param_name


def mistral_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    # TODO: need to implement a general way to deal with prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__ = {
    ColumnParallelLinear: parallel_weight_loader,
    MergedColumnParallelLinear: parallel_weight_loader,
    QKVParallelLinear: parallel_weight_loader,
    RowParallelLinear: parallel_weight_loader,
    VocabParallelEmbedding: parallel_weight_loader,
    ParallelLMHead: parallel_weight_loader
    # "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
    # "default_weight_loader": default_weight_loader
}

# for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
#     # setattr(layer_class, 'megatron_weight_loader', weight_loader)
#     layer_class.weight_loader = weight_loader

__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__ = {
    'GPT2LMHeadModel': gpt2_weight_loader,
    'LlamaForCausalLM': llama_megatron_core_te_weight_loader,  # use te backend for open-source megatron
    'LLaMAForCausalLM': llama_megatron_core_te_weight_loader,
    'MistralForCausalLM': mistral_megatron_weight_loader,
}


# the actor model is .state_dict()
# Load megatron weights
def load_megatron_weights(actor_weights: Dict, vllm_model: nn.Module):
    weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
    weight_loader(actor_weights, vllm_model)
    # NOTE(sgm) to reduce peak memory usage, we offload vllm model to cpu
    # after init, and we need this after sync model weights for in first iter.
    vllm_model = vllm_model.cuda()


def _get_model_weight_loader(arch: str):
    if arch in __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__:
        return __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__[arch]
    raise ValueError(f"Model architectures {arch} are not supported for now. "
                     f"Supported architectures: {ModelRegistry.get_supported_archs()}")


def update_megatron_weight_loader():
    for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
        layer_class.weight_loader = weight_loader
    VocabParallelEmbedding.__init__ = vocab_init


# FIXME(shengguangming): the vLLM vocab will pad to 64, which may incur out of bounds
# so we need to rewrite the init function of vocab
DEFAULT_VOCAB_PADDING_SIZE = 64


def vocab_init(self,
               num_embeddings: int,
               embedding_dim: int,
               params_dtype: Optional[torch.dtype] = None,
               org_num_embeddings: Optional[int] = None,
               padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
    super(VocabParallelEmbedding, self).__init__()

    # Keep the input dimensions.
    # TODO (pad to be divided by 4)
    self.num_embeddings = num_embeddings
    self.org_vocab_size = org_num_embeddings or num_embeddings
    # self.num_embeddings_padded = pad_vocab_size(num_embeddings,
    #                                             padding_size)
    self.embedding_dim = embedding_dim
    if params_dtype is None:
        params_dtype = torch.get_default_dtype()
    self.tp_size = get_tensor_model_parallel_world_size()
    # Divide the weight matrix along the vocaburaly dimension.
    # TODO: remove dependencies from megatron
    from megatron.core.tensor_parallel.utils import VocabUtility
    self.vocab_start_index, self.vocab_end_index = (VocabUtility.vocab_range_from_global_vocab_size(
        self.num_embeddings, get_tensor_model_parallel_rank(), self.tp_size))
    self.num_embeddings_per_partition = (self.vocab_end_index - self.vocab_start_index)
    self.weight = Parameter(
        torch.empty(
            self.num_embeddings_per_partition,
            self.embedding_dim,
            # device=torch.cuda.current_device(),
            dtype=params_dtype))
    set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})
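An illustrative check of the Megatron-to-vLLM name translation above (a sketch, assuming vLLM 0.4.2 is installed so the module imports; the parameter name and the one-entry mapping below are hypothetical examples): for decoder layers, _replace_name rewrites 'decoder' to 'model', keeps the layer index, and swaps in the vLLM parameter name while preserving the trailing 'weight' or 'bias'.

from verl.third_party.vllm.vllm_v_0_4_2.megatron_weight_loaders import _replace_name

mapping = [("self_attention.linear_qkv", "self_attn.qkv_proj")]  # one hypothetical mapping entry
assert _replace_name("decoder.layers.0.self_attention.linear_qkv.weight", mapping) == \
    "model.layers.0.self_attn.qkv_proj.weight"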
verl/third_party/vllm/vllm_v_0_4_2/model_loader.py (new file, mode 100644)
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
"""Utilities for selecting and loading models."""
from typing import Dict, Union, Optional, Iterable, Tuple

import torch
import torch.nn as nn
from transformers import PreTrainedModel

from vllm.config import (DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
from vllm.model_executor.model_loader import BaseModelLoader
from vllm.model_executor.model_loader.loader import _initialize_model
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.distributed.communication_op import tensor_model_parallel_all_gather

from .config import ModelConfig, LoadFormat, LoadConfig
from .megatron_weight_loaders import load_megatron_weights, update_megatron_weight_loader
from .dtensor_weight_loaders import load_dtensor_weights, update_dtensor_weight_loader
from .hf_weight_loader import update_hf_weight_loader


def get_model(actor_model: Union[PreTrainedModel, Dict],
              model_config: ModelConfig,
              load_config: LoadConfig,
              device_config: DeviceConfig,
              parallel_config: ParallelConfig,
              scheduler_config: SchedulerConfig,
              lora_config: Optional[LoRAConfig],
              vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module:
    loader = get_model_loader(load_config)
    if load_config.load_format.startswith('dummy'):
        return loader.load_model(model_config=model_config,
                                 device_config=device_config,
                                 lora_config=lora_config,
                                 vision_language_config=vision_language_config,
                                 parallel_config=parallel_config,
                                 scheduler_config=scheduler_config)
    else:
        return loader.load_model(actor_model=actor_model,
                                 model_config=model_config,
                                 device_config=device_config,
                                 lora_config=lora_config,
                                 vision_language_config=vision_language_config,
                                 parallel_config=parallel_config,
                                 scheduler_config=scheduler_config)


def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
    """Get a model loader based on the load format."""

    if isinstance(load_config.load_format, type):
        return load_config.load_format(load_config)

    if load_config.load_format == LoadFormat.AUTO:
        update_megatron_weight_loader()
        return MegatronLoader(load_config)

    # NOTE(sgm): change the weight_loader function at runtime
    if load_config.load_format == LoadFormat.MEGATRON:
        update_megatron_weight_loader()
        return MegatronLoader(load_config)

    if load_config.load_format == LoadFormat.HF:
        update_hf_weight_loader()
        return HFLoader(load_config)

    if load_config.load_format == LoadFormat.DTENSOR:
        update_dtensor_weight_loader()
        return DTensorLoader(load_config)

    if load_config.load_format == LoadFormat.DUMMY_HF:
        update_hf_weight_loader()
        return DummyModelLoader(load_config)

    if load_config.load_format == LoadFormat.DUMMY_MEGATRON:
        update_megatron_weight_loader()
        return DummyModelLoader(load_config)

    if load_config.load_format == LoadFormat.DUMMY_DTENSOR:
        update_dtensor_weight_loader()
        return DummyModelLoader(load_config)

    raise ValueError('load format not supported in verl: {}, only support {} and {}'.format(
        load_config.load_format, LoadFormat.MEGATRON, LoadFormat.HF))


class DummyModelLoader(BaseModelLoader):
    """Model loader that will set model weights to random values."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   vision_language_config: Optional[VisionLanguageConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
            # NOTE(woosuk): For accurate performance evaluation, we assign
            # random values to the weights.
            # initialize_dummy_weights(model)
        return model.eval()


class MegatronLoader(BaseModelLoader):
    """Model loader that can load the model weights from a partitioned megatron model."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def _get_weights_iterator(actor_model: Union[PreTrainedModel, Dict]):
        # NOTE(shengguangming) Load the weights from the actor model
        pass
        # if isinstance(actor_model, nn.Module):
        #     load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
        # else:
        #     load_weights(actor_weights=actor_model, vllm_model=model)
        # return actor_model

    def load_model(self, actor_model: Union[PreTrainedModel, Dict],
                   model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   vision_language_config: Optional[VisionLanguageConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)

                # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
                if isinstance(actor_model, nn.Module):
                    load_megatron_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
                                          vllm_model=model)
                else:
                    load_megatron_weights(actor_weights=actor_model, vllm_model=model)

            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()

            # NOTE(sgm): Some weights already point to the GPU, but we still need this.
            model = model.cuda()  # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage

        return model.eval()


class HFLoader(BaseModelLoader):
    """Model loader that can load the model weights from the model's full params."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
        if isinstance(actor_model, Dict):
            return actor_model.items()
        elif isinstance(actor_model, nn.Module):
            return dict(actor_model.named_parameters()).items()
        else:
            raise ValueError(f'actor model should be Dict or nn.Module, but got {type(actor_model)}')

    def load_model(self, actor_model: Union[PreTrainedModel, Dict],
                   model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   vision_language_config: Optional[VisionLanguageConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            # with torch.device(device_config.device):
            # NOTE(sgm): init the model on cpu
            model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
            model.load_weights(self._get_weights_iterator(actor_model))
            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()

            # NOTE(sgm): Some weights already point to the GPU, but we still need this.
            model = model.cuda()  # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage

        return model.eval()


class DTensorLoader(BaseModelLoader):
    """Model loader that can load the model weights from a DTensor-partitioned model."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def _get_weights_iterator(actor_model: Union[PreTrainedModel, Dict]):
        # NOTE(shengguangming) Load the weights from the actor model
        pass
        # if isinstance(actor_model, nn.Module):
        #     load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
        # else:
        #     load_weights(actor_weights=actor_model, vllm_model=model)
        # return actor_model

    def load_model(self, actor_model: Union[PreTrainedModel, Dict],
                   model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   vision_language_config: Optional[VisionLanguageConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)

                # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
                if isinstance(actor_model, nn.Module):
                    load_dtensor_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
                                         vllm_model=model)
                else:
                    load_dtensor_weights(actor_weights=actor_model, vllm_model=model)

            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()

            # NOTE(sgm): Some weights already point to the GPU, but we still need this.
            model = model.cuda()  # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage

        return model.eval()


# FIXME(sgm): hack the _get_logits function in vllm v0.4.2
# as they use ray, the _get_logits result only needs to return to the driver node,
# therefore gather is enough. However, we use SPMD instead of a central scheduler,
# so all_gather is required (aligned with v0.2.6)
def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
                embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
    # Get the logits for the next tokens.
    logits = torch.matmul(hidden_states, embedding.t())
    if embedding_bias is not None:
        logits += embedding_bias
    logits = tensor_model_parallel_all_gather(logits)
    # Remove paddings in vocab (if any).
    if logits is not None:
        logits = logits[:, :self.org_vocab_size]
    return logits


from vllm.model_executor.layers.logits_processor import LogitsProcessor

LogitsProcessor._get_logits = _get_logits
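A quick sketch of how the dispatch in get_model_loader behaves, assuming a LoadConfig built from this repo's own .config module (the construction here is simplified and hypothetical):

# Hypothetical dispatch sketch; LoadConfig construction is simplified.
load_config = LoadConfig(load_format=LoadFormat.MEGATRON)
loader = get_model_loader(load_config)      # also patches the vLLM layer weight loaders
assert isinstance(loader, MegatronLoader)
# LoadFormat.DUMMY_* formats return a DummyModelLoader, which skips weight sync entirely.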
verl/third_party/vllm/vllm_v_0_4_2/model_runner.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
import torch
import torch.nn as nn
from enum import IntEnum
from typing import Dict, List, Optional, Set, Tuple, Union

from vllm.attention import (AttentionMetadata, get_attn_backend)
from vllm.config import (DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
from vllm.logger import init_logger
from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
from vllm.model_executor import SamplingMetadata
from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, SequenceGroupMetadata)
from vllm.utils import (CudaMemoryProfiler, is_hip, is_pin_memory_available)
from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner

from .model_loader import get_model
from .config import ModelConfig, LoadConfig

logger = init_logger(__name__)


# How batches are constructed.
class BatchType(IntEnum):
    # Every batch is prefill.
    PREFILL = 0
    # Every batch is decode.
    DECODE = 1
    # Batch is a mixture of prefill and decode.
    MIXED = 2


class ModelRunner(ModelRunner):

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        kv_cache_dtype: Optional[str] = "auto",
        vision_language_config: Optional[VisionLanguageConfig] = None,
    ):
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.lora_config = lora_config
        self.load_config = load_config

        # model_config can be None in tests/samplers/test_sampler.py.
        # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
        self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None)
        self.device_config = (device_config if device_config is not None else DeviceConfig())
        self.device = self.device_config.device

        # NOTE(sgm): add for verl
        self.model = model  # this will be replaced by get_model()

        # Set after load_model.
        self.lora_manager: LRUCacheWorkerLoRAManager = None

        self.graph_runners: Dict[int, CUDAGraphRunner] = {}
        self.graph_memory_pool: Optional[Tuple[int, int]] = None  # Set during graph capture.

        self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture
                                       if self.model_config is not None else 0)

        self.pin_memory = is_pin_memory_available()
        self.kv_cache_dtype = kv_cache_dtype
        self.vision_language_config = vision_language_config

        self.attn_backend = get_attn_backend(self.model_config.dtype if model_config is not None else None)

        # Lazy initialization
        self.block_size: int  # Set after initial profiling.
        # When using CUDA graph, the input block tables must be padded to
        # max_seq_len_to_capture. However, creating the block table in
        # Python can be expensive. To optimize this, we cache the block table
        # in numpy and only copy the actual input content at every iteration.
        # The shape of the cached block table will be
        # (max batch size to capture, max context len to capture / block size).
        self.graph_block_tables: torch.Tensor  # Set after initial profiling.

        # Set if the backend is flashinfer.
        self.flashinfer_workspace_buffer: torch.Tensor

    # NOTE(sgm): initialize model using the actor model
    def load_model(self) -> None:
        with CudaMemoryProfiler() as m:
            self.model = get_model(actor_model=self.model,
                                   model_config=self.model_config,
                                   device_config=self.device_config,
                                   lora_config=self.lora_config,
                                   load_config=self.load_config,
                                   parallel_config=self.parallel_config,
                                   scheduler_config=self.scheduler_config,
                                   vision_language_config=self.vision_language_config)

        self.model_memory_usage = m.consumed_memory
        logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30))

        if self.lora_config:
            assert hasattr(self.model, "supported_lora_modules") and self.model.supported_lora_modules, (
                "Model does not support LoRA")
            assert hasattr(self.model, "embedding_modules"), "Model does not have embedding_modules"
            assert hasattr(self.model, "embedding_padding_modules"), "Model does not have embedding_padding_modules"
            self.lora_manager = LRUCacheWorkerLoRAManager(self.scheduler_config.max_num_seqs,
                                                          self.scheduler_config.max_num_batched_tokens,
                                                          self.vocab_size, self.lora_config, self.device,
                                                          self.model.embedding_modules,
                                                          self.model.embedding_padding_modules)
            self.model = self.lora_manager.create_lora_manager(self.model)

        if self.kv_cache_dtype == "fp8" and is_hip():
            # Currently scaled KV cache is only enabled on ROCm
            if self.model_config.quantization_param_path is not None:
                if callable(getattr(self.model, "load_kv_cache_scales", None)):
                    self.model.load_kv_cache_scales(self.model_config.quantization_param_path)
                else:
                    raise RuntimeError("Using FP8 KV cache and scaling factors provided but "
                                       "model %s does not support loading scaling factors.", self.model.__class__)
            else:
                logger.warning("Using FP8 KV cache but no scaling factors "
                               "provided. Defaulting to scaling factors of 1.0. "
                               "This may lead to less accurate results!")
        elif self.model_config.quantization_param_path is not None:
            logger.warning("KV cache scaling factors provided, "
                           "but the KV cache data type is not FP8. "
                           "KV cache scaling factors will not be used.")

    def prepare_input_tensors(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, Set[LoRARequest], LoRAMapping,
               torch.Tensor]:
        # NOTE(sgm): all workers prepare the input in the same way
        prefill_reqs = []
        decode_reqs = []
        for seq_group_meta in seq_group_metadata_list:
            if seq_group_meta.is_prompt:
                prefill_reqs.append(seq_group_meta)
            else:
                decode_reqs.append(seq_group_meta)

        # Prepare input tensors.
        (
            input_tokens,
            input_positions,
            prefill_attn_metadata,
            seq_lens,
            query_lens,
            lora_index_mapping,
            lora_prompt_mapping,
            lora_requests,
            multi_modal_input,
            slot_mapping,
        ) = self._prepare_prompt(prefill_reqs)
        (
            decode_input_tokens,
            decode_input_positions,
            decode_attn_metadata,
            decode_lora_index_mapping,
            decode_lora_prompt_mapping,
            decode_lora_requests,
            decode_slot_mapping,
        ) = self._prepare_decode(decode_reqs)
        sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, self.device,
                                                     self.pin_memory)

        if not self.scheduler_config.chunked_prefill_enabled:
            assert (len(prefill_reqs) and len(decode_reqs)) == 0

        num_prefills = len(seq_lens)
        num_prefill_tokens = len(input_tokens)
        num_decode_tokens = len(decode_input_tokens)

        # Coalesce tensors. Note that attn_metadata is currently not
        # coalesced for simplicity.
        input_tokens.extend(decode_input_tokens)
        input_positions.extend(decode_input_positions)
        slot_mapping.extend(decode_slot_mapping)
        lora_index_mapping.extend(decode_lora_index_mapping)
        lora_prompt_mapping.extend(decode_lora_prompt_mapping)
        lora_requests.update(decode_lora_requests)

        input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device)
        input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device)
        slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device)

        if self.lora_config:
            lora_mapping = LoRAMapping(
                lora_index_mapping,
                lora_prompt_mapping,
            )
        else:
            lora_mapping = None

        # Broadcast the metadata.
        # If batch contains both prefill and decode, it sends 2 broadcasts.
        # If it only contains 1 type, it triggers a single broadcast.
        if (prefill_attn_metadata is not None and decode_attn_metadata is not None):
            batch_type = BatchType.MIXED
        elif prefill_attn_metadata is not None:
            batch_type = BatchType.PREFILL
        else:
            batch_type = BatchType.DECODE

        attn_metadata = AttentionMetadata(
            num_prefills=num_prefills,
            slot_mapping=slot_mapping,
            num_prefill_tokens=num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
            prefill_metadata=prefill_attn_metadata,
            decode_metadata=decode_attn_metadata,
            kv_cache_dtype=self.kv_cache_dtype,
        )

        return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping,
                multi_modal_input)

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        kv_caches: List[torch.Tensor],
    ) -> Optional[SamplerOutput]:
        (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping,
         multi_modal_input) = self.prepare_input_tensors(seq_group_metadata_list)

        if self.lora_config:
            self.set_active_loras(lora_requests, lora_mapping)

        # Currently cuda graph is only supported by the decode phase.
        prefill_meta = attn_metadata.prefill_metadata
        decode_meta = attn_metadata.decode_metadata
        if prefill_meta is None and decode_meta.use_cuda_graph:
            graph_batch_size = input_tokens.shape[0]
            model_executable = self.graph_runners[graph_batch_size]
        else:
            model_executable = self.model
        execute_model_kwargs = {
            "input_ids": input_tokens,
            "positions": input_positions,
            "kv_caches": kv_caches,
            "attn_metadata": attn_metadata,
        }
        if self.vision_language_config:
            execute_model_kwargs.update({"image_input": multi_modal_input})
        hidden_states = model_executable(**execute_model_kwargs)

        # Compute the logits.
        logits = self.model.compute_logits(hidden_states, sampling_metadata)

        # Only perform sampling in the driver worker.
        # if not self.is_driver_worker:
        #     return None

        # TODO(sgm): perform sampling on rank 0
        # Sample the next token.
        output = self.model.sample(
            logits=logits,
            sampling_metadata=sampling_metadata,
        )
        return output
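For orientation, a per-rank generation step with this runner boils down to two calls; the sketch below is hypothetical glue code, with `runner`, `seq_group_metadata_list` and `kv_caches` supplied by the surrounding worker:

# Hypothetical single-step sketch; inputs are placeholders provided by the worker.
runner.load_model()                                               # builds the vLLM module from the actor weights
sampler_output = runner.execute_model(seq_group_metadata_list, kv_caches)
# Unlike upstream vLLM, every SPMD rank computes logits and samples, not just the driver.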
verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Model and data parallel groups."""
import os
import torch
import torch.distributed
from typing import Optional

import vllm.distributed.parallel_state as ps
import vllm.envs as envs
from vllm.logger import init_logger

from torch.distributed.device_mesh import init_device_mesh

logger = init_logger(__name__)
"""
This version is strongly tied to Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
- We assume the Megatron tp+dp+pp world is already established before calling this function.
"""

# Device mesh for using DTensor
_DEVICE_MESH = None

# Tensor model parallel group that the current rank belongs to.
_TP_DEVICE_GROUP = None
_TP_CPU_GROUP = None


# This method is for initializing the ParallelGroup when using HybridEngine
def initialize_parallel_state(
    distributed_init_method: str = "env://",
    backend: str = "nccl",
    tensor_model_parallel_size: int = 1,
    num_tp_per_train_tp: int = 1,
    pipeline_model_parallel_size: int = 1,
):
    # torch.distributed.all_reduce does not free the input tensor until
    # the synchronization point. This causes the memory usage to grow
    # as the number of all_reduce calls increases. This env var disables
    # this behavior.
    # Related issue:
    # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

    # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
    rank = int(os.getenv("RANK", "-1"))
    local_rank = int(os.getenv("LOCAL_RANK", "0"))

    # Use the world_size set by TORCHRUN
    world_size = int(os.getenv("WORLD_SIZE", "-1"))
    assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
    ps.init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)
    if torch.distributed.get_world_size() > 1:
        # NOTE: build a separate inference group with infer tp & micro dp
        initialize_model_parallel_for_vllm(tensor_model_parallel_size=tensor_model_parallel_size,
                                           num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp)
    else:
        initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)


def ensure_model_parallel_initialized(
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """Helper to initialize model parallel groups if they are not initialized,
    or ensure tensor-parallel and pipeline-parallel sizes are equal to expected
    values if the model parallel groups are initialized.
    """
    # get the backend of _DEVICE_WORLD_GROUP
    backend = backend or torch.distributed.get_backend()
    if not model_parallel_is_initialized():
        initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
        return

    assert (get_tensor_model_parallel_world_size() == tensor_model_parallel_size), (
        "tensor parallel group already initialized, but of unexpected size: "
        f"{get_tensor_model_parallel_world_size()=} vs. "
        f"{tensor_model_parallel_size=}")
    # assert (get_pipeline_model_parallel_world_size(
    # ) == pipeline_model_parallel_size), (
    #     "pipeline parallel group already initialized, but of unexpected size: "
    #     f"{get_pipeline_model_parallel_world_size()=} vs. "
    #     f"{pipeline_model_parallel_size=}")


def model_parallel_is_initialized():
    """Check if tensor and pipeline parallel groups are initialized."""
    return (ps._TP_DEVICE_GROUP is not None)
    # and _PIPELINE_MODEL_PARALLEL_GROUP is not None)


def initialize_model_parallel_for_vllm(tensor_model_parallel_size: int,
                                       num_tensor_model_parallel_groups_per_train_tp: int = 1) -> None:
    from torch.distributed import new_group

    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()

    assert isinstance(tensor_model_parallel_size, int)

    # assert num_tensor_model_parallel_groups_per_train_tp == 1 and not different_tp_group
    # assert num_tensor_model_parallel_groups_per_train_tp > 1 and different_tp_group

    # Build the tensor model-parallel groups.
    assert ps._TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")

    global _TP_DEVICE_GROUP
    global _TP_CPU_GROUP
    global _DEVICE_MESH

    world_size: int = torch.distributed.get_world_size()

    rank = torch.distributed.get_rank()

    backend = torch.distributed.get_backend()

    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size

    if num_tensor_model_parallel_groups_per_train_tp == 1:
        # if tensor_model_parallel_size == train_tensor_parallel_size:
        # use the same tp group as Megatron/vllm
        for i in range(num_tensor_model_parallel_groups):
            ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
            group = torch.distributed.new_group(ranks, backend=backend)
            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
            if rank in ranks:
                _TP_DEVICE_GROUP = group
                _TP_CPU_GROUP = cpu_group
                ps._TP_DEVICE_GROUP = group
                ps._TP_CPU_GROUP = cpu_group
        # no _MICRO_DATA_PARALLEL_GROUP
    else:
        # initialize a micro_dp group and a tp group
        # assume training tp=4, infer tp=2, then the weight is partitioned as
        # [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference

        # Build the inference tp groups
        # train_tp = train_tensor_parallel_size
        train_tp = num_tensor_model_parallel_groups_per_train_tp * tensor_model_parallel_size
        # num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
        assert _TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")
        for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
            start = train_tp * i
            end = train_tp * (i + 1)
            for j in range(num_tensor_model_parallel_groups_per_train_tp):
                ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp))
                for i in range(len(ranks)):
                    ranks[i] += j
                group = torch.distributed.new_group(ranks)
                cpu_group = torch.distributed.new_group(ranks, backend='gloo')
                if rank in ranks:
                    _TP_DEVICE_GROUP = group
                    _TP_CPU_GROUP = cpu_group
                    ps._TP_DEVICE_GROUP = _TP_DEVICE_GROUP
                    ps._TP_CPU_GROUP = cpu_group

    # Build the pipeline model-parallel groups.
    # global _PIPELINE_MODEL_PARALLEL_GROUP
    # global _PIPELINE_GLOBAL_RANKS
    # assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
    # ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
    # ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()


def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """
    NOTE: This method is a hack on the open-sourced version without the
    assertion of world_size = tp * pp.

    Initialize model parallel groups.

    Arguments:
        tensor_model_parallel_size: number of GPUs used for tensor model
            parallelism.
        pipeline_model_parallel_size: number of GPUs used for pipeline model
            parallelism.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, ranks 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    # get the backend of _DEVICE_WORLD_GROUP
    backend = backend or torch.distributed.get_backend()

    # NOTE(sgm) we don't assert world_size == tp * pp
    # DP is not managed by vllm but by the verl WorkerGroup

    num_tensor_model_parallel_groups: int = (world_size // tensor_model_parallel_size)
    num_pipeline_model_parallel_groups: int = (world_size // pipeline_model_parallel_size)
    rank = torch.distributed.get_rank()

    # Build device mesh for TP
    if num_tensor_model_parallel_groups > 1:
        device_mesh = init_device_mesh("cuda", (num_tensor_model_parallel_groups, tensor_model_parallel_size),
                                       mesh_dim_names=("replicate", "tp_shard"))
    else:
        device_mesh = init_device_mesh("cuda", (tensor_model_parallel_size,), mesh_dim_names=["tp_shard"])
    shard_group = device_mesh.get_group(mesh_dim="tp_shard")

    # Build the tensor model-parallel groups.
    global _TP_DEVICE_GROUP, _TP_CPU_GROUP
    global _DEVICE_MESH
    assert _TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")
    assert _DEVICE_MESH is None, ("device mesh in vllm is already initialized")

    _DEVICE_MESH = device_mesh
    # for i in range(num_tensor_model_parallel_groups):
    #     ranks = range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)
    #     group = torch.distributed.new_group(ranks, backend=backend)
    #     cpu_group = torch.distributed.new_group(ranks, backend="gloo")
    # assert torch.distributed.get_process_group_ranks(shard_group) == torch.distributed.get_process_group_ranks(cpu_group)
    # ranks = torch.distributed.get_process_group_ranks(shard_group)
    # cpu_group = torch.distributed.new_group(ranks, backend="gloo")  # TODO: this will hang
    # cpu_group = torch.distributed.new_group(, backend="gloo")
    # if rank == 0:
    #     print(f'rank: {rank}')
    #     print(f'ranks: {ranks}')
    #     print(f'torch.distributed.get_process_group_ranks(shard_group): {torch.distributed.get_process_group_ranks(shard_group)}')
    # if rank in ranks:
    _TP_DEVICE_GROUP = shard_group
    ps._TP_DEVICE_GROUP = _TP_DEVICE_GROUP
    # ps._TP_CPU_GROUP = cpu_group  # TODO: will hang when used with device mesh

    # TODO: init using device mesh
    # Build the pipeline model-parallel groups.
    assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
    for i in range(num_pipeline_model_parallel_groups):
        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
        group = torch.distributed.new_group(ranks, backend=backend)
        if rank in ranks:
            ps._PIPELINE_MODEL_PARALLEL_GROUP = group
            ps._PIPELINE_GLOBAL_RANKS = ranks


"""
Device mesh utilities
"""


def get_device_mesh():
    assert _DEVICE_MESH is not None, ("device mesh is not initialized")
    return _DEVICE_MESH


"""
Tensor model parallel utilities
"""


def get_tensor_model_parallel_group():
    """Get the tensor model parallel group the caller rank belongs to."""
    assert _TP_DEVICE_GROUP is not None, ("tensor model parallel group is not initialized")
    return _TP_DEVICE_GROUP


def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group."""
    return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())


def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group."""
    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())


def get_tensor_model_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the tensor model parallel group."""
    global_rank = torch.distributed.get_rank()
    local_world_size = get_tensor_model_parallel_world_size()
    return (global_rank // local_world_size) * local_world_size
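The source-rank arithmetic at the end is easy to sanity-check without any distributed setup; the snippet below is a standalone worked example of the same integer formula, not part of this module:

# Worked example of the src-rank formula with a tensor-parallel size of 4:
# ranks 4..7 all map to src rank 4, since (global_rank // 4) * 4 == 4 for each of them.
for global_rank in (4, 5, 6, 7):
    assert (global_rank // 4) * 4 == 4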
verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/executor/gpu_executor.py
import os
import socket
from typing import Any, Dict, List, Optional, Set, Tuple

import torch

import vllm.envs as envs
from vllm.executor.executor_base import ExecutorBase, ExecutorAsyncBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, ExecuteModelRequest
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         VisionLanguageConfig)

from .config import ModelConfig, LoadConfig

logger = init_logger(__name__)


class SPMDGPUExecutor(ExecutorBase):
    """SPMD-based multi-GPU executor implementation."""

    def __init__(
        self,
        model,  # pytorch model itself or its parameter dict
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        vision_language_config: Optional[VisionLanguageConfig],
        speculative_config: Optional[SpeculativeConfig],
    ) -> None:
        self.model_config = model_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.load_config = load_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.vision_language_config = vision_language_config
        self.speculative_config = speculative_config

        distributed_init_method = initialize_cluster(parallel_config)
        self._init_executor(model, distributed_init_method)

    # TODO(sgm): verl does not support speculative decoding for now
    def _init_executor(self, model, distributed_init_method) -> None:
        assert (not self.speculative_config), "Speculative decoding not yet supported for multi-GPU backend."

        # Create the parallel worker for each GPU.
        self._init_workers_sp(model, distributed_init_method)

    def _init_workers_sp(self, model, distributed_init_method: str):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
        from .worker import Worker  # pylint: disable=import-outside-toplevel

        rank = int(os.getenv("RANK"))
        local_rank = int(os.getenv("LOCAL_RANK"))
        print(f'local rank {local_rank}')

        self.worker = Worker(
            model,
            self.model_config,
            self.parallel_config,
            self.scheduler_config,
            self.device_config,
            self.cache_config,
            self.load_config,
            local_rank,
            rank,
            distributed_init_method,
            lora_config=self.lora_config,
            vision_language_config=self.vision_language_config,
        )

        # NOTE(shengguangming): torch.distributed.init_process_group will be called inside init_model()
        self.worker.init_device()
        self.worker.load_model()

    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Determine the number of available KV blocks.

        This invokes `determine_num_available_blocks` on each worker and takes
        the min of the results, guaranteeing that the selected cache sizes are
        compatible with all workers.

        Returns:
            - tuple[num_gpu_blocks, num_cpu_blocks]
        """
        # Get the maximum number of blocks that can be allocated on GPU and CPU.
        num_blocks = self.worker.determine_num_available_blocks()

        # NOTE(shengguangming): We no longer use a shared centralized controller; each process
        # has its own scheduler
        num_gpu_blocks = num_blocks[0]
        num_cpu_blocks = num_blocks[1]

        return num_gpu_blocks, num_cpu_blocks

    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
        """Initialize the KV cache in all workers."""

        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

        if torch.distributed.get_rank() == 0:
            print(
                f'before init cache memory allocated: {torch.cuda.memory_allocated() / 1e9} GB, '
                f'reserved: {torch.cuda.memory_reserved() / 1e9} GB')
        self.worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks)
        if torch.distributed.get_rank() == 0:
            print(
                f'after init cache memory allocated: {torch.cuda.memory_allocated() / 1e9} GB, '
                f'reserved: {torch.cuda.memory_reserved() / 1e9} GB')

    # NOTE(sgm): This will not profile & capture the model (CUDAGraph) when rebuilding the KVCache
    def init_cache_engine(self) -> None:
        self.worker._init_cache_engine()

    def free_cache_engine(self) -> None:
        self.worker.free_cache_engine()

    def execute_model(self, execute_model_req) -> List[SamplerOutput]:
        all_outputs = self.worker.execute_model(execute_model_req=execute_model_req)

        # NOTE(sgm):
        # Each GPU in vllm under verl has its own spmd_gpu_executor, therefore all GPUs return the outputs.
        # In vllm with ray, only the driver worker returns the sampling results.
        return all_outputs

    def add_lora(self, lora_request: LoRARequest) -> bool:
        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
        return self.worker.add_lora(lora_request=lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.worker.remove_lora(lora_id=lora_id)

    def list_loras(self) -> Set[int]:
        return self.worker.list_loras()

    def check_health(self) -> None:
        # SPMDExecutor will always be healthy as long as
        # it's running.
        return

    # NOTE(sgm): add for verl
    def offload_model_weights(self) -> None:
        self.worker.offload_model_weights()

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
        self.worker.sync_model_weights(actor_weights=actor_weights, load_format=load_format)


def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
) -> Tuple[str, Optional[None]]:
    """Initialize the distributed cluster, possibly with Ray.

    Args:
        parallel_config: The configurations for parallel execution.

    Returns:
        The `distributed_init_method` is the address for initializing the
        distributed backend.
    """

    # Initialize cluster locally.
    port = get_open_port()
    # We need to set up the distributed init method to make sure
    # the distributed megatron code (e.g., get world size) works correctly.
    # distributed_init_method = f"tcp://localhost:{port}"
    distributed_init_method = 'env://'
    return distributed_init_method


def get_open_port():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


# TODO(sgm): async executor is not implemented yet
class SPMDGPUExecutorAsync(SPMDGPUExecutor, ExecutorAsyncBase):

    async def execute_model_async(self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        """Executes one model step on the given sequences."""
        raise NotImplementedError

    async def check_health_async(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
        exception."""
        self.check_health()
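The per-rank lifecycle this executor supports can be sketched as below; this is hypothetical driver code (`executor`, `actor_state_dict` and `execute_model_req` are placeholders), assuming the LoadFormat constants from this repo's .config module:

# Hypothetical per-rank rollout flow; all names below are placeholders.
executor.sync_model_weights(actor_weights=actor_state_dict, load_format=LoadFormat.MEGATRON)
executor.init_cache_engine()                        # rebuild the KV cache before generation
outputs = executor.execute_model(execute_model_req)  # every SPMD rank returns the same outputs
executor.free_cache_engine()
executor.offload_model_weights()                     # park weights on CPU between rollouts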
verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
from typing import List, Optional, Tuple, Union

from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)

from vllm.lora.request import LoRARequest
from vllm.utils import make_async, LRUCache
from vllm.transformers_utils.tokenizers import *


class TokenizerGroup:
    """A group of tokenizers that can be used for LoRA adapters."""

    def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int]):
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = tokenizer
        self.lora_tokenizers = LRUCache[PreTrainedTokenizer](capacity=max_num_seqs) if enable_lora else None

    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        return True

    def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]:
        """Get the maximum input length for the LoRA request."""
        return self.max_input_length

    def encode(self, prompt: str, request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = self.get_lora_tokenizer(lora_request)
        return tokenizer.encode(prompt)

    async def encode_async(self,
                           prompt: str,
                           request_id: Optional[str] = None,
                           lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = await self.get_lora_tokenizer_async(lora_request)
        return tokenizer.encode(prompt)

    def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            # TODO(sgm): the lora tokenizer is also passed, but may be different
            tokenizer = self.tokenizer
            # tokenizer = (get_lora_tokenizer(
            #     lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers.get(lora_request.lora_int_id)

    # FIXME(sgm): for simplicity, we assign the special tokens here
    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id
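Since this TokenizerGroup simply wraps a pre-built HF tokenizer (instead of loading one from a model name like upstream vLLM does), a minimal sketch of its use looks like the following; the model name is a placeholder and assumes the tokenizer is available locally:

# Minimal sketch, assuming a HF tokenizer can be loaded locally; the model name is a placeholder.
from transformers import AutoTokenizer
hf_tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
group = TokenizerGroup(tokenizer=hf_tok, enable_lora=False, max_num_seqs=256, max_input_length=None)
token_ids = group.encode("hello world")
assert token_ids == hf_tok.encode("hello world")   # without LoRA, encode() defers to the wrapped tokenizer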
verl/third_party/vllm/vllm_v_0_4_2/worker.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
"""A GPU worker class."""
import
os
import
gc
from
typing
import
Dict
,
List
,
Tuple
,
Optional
,
Union
import
torch
import
torch.distributed
import
torch.nn
as
nn
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoRAConfig
,
ParallelConfig
,
SchedulerConfig
,
VisionLanguageConfig
)
from
vllm.model_executor
import
set_random_seed
from
vllm.sequence
import
SamplerOutput
,
ExecuteModelRequest
from
vllm.worker.cache_engine
import
CacheEngine
from
vllm.distributed.device_communicators
import
pynccl_utils
from
vllm.distributed.device_communicators.custom_all_reduce
import
(
init_custom_ar
)
# TODO(sgm): check why vllm has similar file in vllm.model_executor.parallel_utils.parallel_state
from
vllm.distributed
import
get_tensor_model_parallel_cpu_group
,
init_distributed_environment
,
get_tensor_model_parallel_group
from
vllm.worker.worker
import
Worker
,
_check_if_gpu_supports_dtype
from
.model_runner
import
ModelRunner
from
.megatron_weight_loaders
import
load_megatron_weights
from
.hf_weight_loader
import
load_hf_weights
from
.dtensor_weight_loaders
import
load_dtensor_weights
from
.parallel_state
import
(
ensure_model_parallel_initialized
)
from
.config
import
ModelConfig
,
LoadConfig
,
LoadFormat
class
Worker
(
Worker
):
"""A worker class that executes (a partition of) the model on a GPU.
Each worker is associated with a single GPU. The worker is responsible for
maintaining the KV cache and executing the model on the GPU. In case of
distributed inference, each worker is assigned a partition of the model.
"""
def
__init__
(
self
,
model
:
Union
[
nn
.
Module
,
Dict
],
# model itself or its parameter dict
model_config
:
ModelConfig
,
parallel_config
:
ParallelConfig
,
scheduler_config
:
SchedulerConfig
,
device_config
:
DeviceConfig
,
cache_config
:
CacheConfig
,
load_config
:
LoadConfig
,
local_rank
:
int
,
rank
:
int
,
distributed_init_method
:
str
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
vision_language_config
:
Optional
[
VisionLanguageConfig
]
=
None
,
is_driver_worker
:
bool
=
False
,
)
->
None
:
# self.model = model # will be replaced in the init_model
self
.
model_config
=
model_config
self
.
parallel_config
=
parallel_config
self
.
scheduler_config
=
scheduler_config
self
.
device_config
=
device_config
self
.
cache_config
=
cache_config
self
.
local_rank
=
local_rank
self
.
rank
=
rank
self
.
distributed_init_method
=
distributed_init_method
self
.
lora_config
=
lora_config
self
.
load_config
=
load_config
self
.
is_driver_worker
=
is_driver_worker
if
self
.
is_driver_worker
:
assert
self
.
rank
==
0
,
"The driver worker must have rank 0."
self
.
vision_language_config
=
vision_language_config
if
self
.
vision_language_config
:
assert
not
self
.
lora_config
,
(
"To be tested: vision language model with LoRA settings."
)
self
.
model_runner
=
ModelRunner
(
model
,
model_config
,
parallel_config
,
scheduler_config
,
device_config
,
load_config
=
load_config
,
lora_config
=
self
.
lora_config
,
kv_cache_dtype
=
self
.
cache_config
.
cache_dtype
,
vision_language_config
=
vision_language_config
,
)
# Uninitialized cache engine. Will be initialized by
# init_cache_engine.
self
.
cache_engine
:
CacheEngine
=
None
self
.
gpu_cache
:
List
[
torch
.
Tensor
]
=
None
# NOTE(sgm): For offloading inference engine params
self
.
cpu_model
=
None
def
init_device
(
self
)
->
None
:
if
self
.
device_config
.
device
.
type
==
"cuda"
:
# torch.distributed.all_reduce does not free the input tensor until
# the synchronization point. This causes the memory usage to grow
# as the number of all_reduce calls increases. This env var disables
# this behavior.
# Related issue:
# https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
os
.
environ
[
"TORCH_NCCL_AVOID_RECORD_STREAMS"
]
=
"1"
# NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
self
.
rank
=
self
.
rank
if
self
.
rank
is
not
None
else
int
(
os
.
getenv
(
"RANK"
,
"-1"
))
local_rank
=
int
(
os
.
getenv
(
"LOCAL_RANK"
,
"0"
))
self
.
device
=
torch
.
device
(
f
"cuda:
{
local_rank
}
"
)
if
self
.
rank
<
0
:
raise
ValueError
(
"Invalid or unspecified rank."
)
torch
.
cuda
.
set_device
(
self
.
device
)
# Use the world_size set by TORCHRUN
world_size
=
int
(
os
.
getenv
(
"WORLD_SIZE"
,
"-1"
))
assert
world_size
!=
-
1
,
"The world_size is set to -1, not initialized by TORCHRUN"
self
.
parallel_config
.
world_size
=
world_size
_check_if_gpu_supports_dtype
(
self
.
model_config
.
dtype
)
torch
.
cuda
.
empty_cache
()
self
.
init_gpu_memory
=
torch
.
cuda
.
mem_get_info
()[
0
]
else
:
raise
RuntimeError
(
f
"Not support device type:
{
self
.
device_config
.
device
}
"
)
# Initialize the distributed environment.
init_worker_distributed_environment
(
self
.
parallel_config
,
self
.
rank
,
self
.
distributed_init_method
,
self
.
local_rank
)
# Set random seed.
set_random_seed
(
self
.
model_config
.
seed
)
# self.model = get_model(actor_model=self.model, model_config=self.model_config)
@
torch
.
inference_mode
()
def
determine_num_available_blocks
(
self
)
->
Tuple
[
int
,
int
]:
"""Profiles the peak memory usage of the model to determine how many
KV blocks may be allocated without OOMs.
The engine will first conduct a profiling of the existing memory usage.
Then, it calculate the maximum possible number of GPU and CPU blocks
that can be allocated with the remaining free memory.
.. tip::
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
torch
.
cuda
.
empty_cache
()
# torch.cuda.reset_peak_memory_stats()
# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
self
.
model_runner
.
profile_run
()
# Calculate the number of blocks that can be allocated with the
# profiled peak memory.
torch
.
cuda
.
synchronize
()
free_gpu_memory
,
total_gpu_memory
=
torch
.
cuda
.
mem_get_info
()
peak_memory
=
total_gpu_memory
-
free_gpu_memory
assert
peak_memory
>
0
,
(
"Error in memory profiling. This happens when the GPU memory was "
"not properly cleaned up before initializing the vLLM instance."
)
cache_block_size
=
self
.
get_cache_block_size_bytes
()
# NOTE(sgm) use the remaining memory
num_gpu_blocks
=
int
((
free_gpu_memory
*
self
.
cache_config
.
gpu_memory_utilization
)
//
cache_block_size
)
# num_gpu_blocks = int((total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // cache_block_size)
num_cpu_blocks
=
int
(
self
.
cache_config
.
swap_space_bytes
//
cache_block_size
)
num_gpu_blocks
=
max
(
num_gpu_blocks
,
0
)
num_cpu_blocks
=
max
(
num_cpu_blocks
,
0
)
if
self
.
model_runner
.
lora_manager
:
self
.
model_runner
.
remove_all_loras
()
# NOTE(sgm): Add for verl, synchronize number of blocks with all the rank
num_gpu_blocks
=
torch
.
tensor
([
num_gpu_blocks
],
device
=
'cuda'
)
num_cpu_blocks
=
torch
.
tensor
([
num_cpu_blocks
],
device
=
'cuda'
)
torch
.
distributed
.
all_reduce
(
num_gpu_blocks
,
op
=
torch
.
distributed
.
ReduceOp
.
MIN
,
group
=
get_tensor_model_parallel_group
())
torch
.
distributed
.
all_reduce
(
num_cpu_blocks
,
op
=
torch
.
distributed
.
ReduceOp
.
MIN
,
group
=
get_tensor_model_parallel_group
())
num_gpu_blocks
=
num_gpu_blocks
.
item
()
num_cpu_blocks
=
num_cpu_blocks
.
item
()
gc
.
collect
()
torch
.
cuda
.
empty_cache
()
return
num_gpu_blocks
,
num_cpu_blocks
def
_init_cache_engine
(
self
):
if
self
.
cache_engine is None and self.gpu_cache is None:
            super()._init_cache_engine()

    def free_cache_engine(self):
        # ensure `enforce_eager=True`
        self.cache_engine = None
        self.gpu_cache = None

    @torch.inference_mode()
    def execute_model(self, execute_model_req: Optional[ExecuteModelRequest] = None) -> List[SamplerOutput]:
        if execute_model_req is None:
            seq_group_metadata_list = None
        else:
            seq_group_metadata_list = execute_model_req.seq_group_metadata_list

        # NOTE(sgm): each SPMD rank will have identical input
        assert seq_group_metadata_list is not None
        assert execute_model_req is not None
        num_seq_groups = len(seq_group_metadata_list)
        blocks_to_swap_in = execute_model_req.blocks_to_swap_in
        blocks_to_swap_out = execute_model_req.blocks_to_swap_out
        blocks_to_copy = execute_model_req.blocks_to_copy

        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)

        # If there is no input, we don't need to execute the model.
        if num_seq_groups == 0:
            return []

        output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache)

        # Worker only supports single-step execution. Wrap the output in a list
        # to conform to interface.
        return [output]

    # assume the input is .state_dict()
    def sync_model_weights(self, actor_weights: Dict, load_format: str):
        if load_format in [LoadFormat.MEGATRON, LoadFormat.AUTO]:
            load_megatron_weights(actor_weights, self.model_runner.model)
        elif load_format == LoadFormat.HF:
            # full model state dict without sharding
            load_hf_weights(actor_weights, self.model_runner.model)
        elif load_format == LoadFormat.DTENSOR:
            load_dtensor_weights(actor_weights, self.model_runner.model)

    def offload_model_weights(self) -> None:
        if self.cpu_model is None:
            self.cpu_model = {}
            for name, params in self.model_runner.model.named_parameters():
                self.cpu_model[name] = torch.empty_like(params, device='cpu')
                params.data = self.cpu_model[name]
        else:
            for name, params in self.model_runner.model.named_parameters():
                params.data = self.cpu_model[name]


def init_worker_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = "env://",
    local_rank: int = -1,
) -> None:
    """Initialize the distributed environment."""
    # NOTE(sgm): using tcp://localhost:xxxx will hang in the HF setting without megatron
    init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank)

    ensure_model_parallel_initialized(tensor_model_parallel_size=parallel_config.tensor_parallel_size,
                                      pipeline_model_parallel_size=parallel_config.pipeline_parallel_size)

    # TODO(sgm): check whether need this
    # if pynccl_utils.is_initialized():
    #     pynccl_world_size = pynccl_utils.get_world_size()
    #     if pynccl_world_size != parallel_config.world_size:
    #         raise RuntimeError(
    #             "pynccl is already initialized but the pynccl world "
    #             "size does not match parallel_config.world_size "
    #             f"({pynccl_world_size} vs. {parallel_config.world_size}).")
    # elif parallel_config.world_size > 1:
    #     # NOTE(woosuk): We don't initialize pynccl process group when world size
    #     # is 1.
    #     # NOTE(kaichao): By default, pynccl is initialized for tp group.
    #     pynccl_utils.init_process_group(
    #         group=get_tensor_model_parallel_cpu_group())

    # # Initialize a custom fast all-reduce implementation.
    # if not parallel_config.disable_custom_all_reduce:
    #     init_custom_ar()

    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cuda())
    # if pynccl_utils.is_initialized():
    #     pynccl_utils.all_reduce(torch.zeros(1).cuda())
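The `offload_model_weights` method above frees GPU weight memory between rollout phases by allocating CPU buffers once and repointing every parameter's `.data` at them; `sync_model_weights` later copies fresh actor weights back in. Below is a minimal standalone sketch of that offload pattern in plain PyTorch; `TinyModel` is a made-up stand-in for `self.model_runner.model`, not part of this repository.

# Standalone sketch of the CPU-offload pattern used by offload_model_weights():
# allocate CPU buffers once, then repoint param.data at them so the GPU copies
# can be released. TinyModel is a hypothetical placeholder model.
import torch
import torch.nn as nn

class TinyModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

model = TinyModel().cuda() if torch.cuda.is_available() else TinyModel()
cpu_buffers = {}
for name, param in model.named_parameters():
    cpu_buffers[name] = torch.empty_like(param, device='cpu')
    param.data = cpu_buffers[name]  # GPU storage is now unreferenced
# ... a later weight sync would copy the up-to-date actor weights back in.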
verl/third_party/vllm/vllm_v_0_5_4/__init__.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
verl/third_party/vllm/vllm_v_0_5_4/arg_utils.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
import os
import argparse
import dataclasses
import json
from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union

import torch.nn as nn
from transformers import PretrainedConfig

from .config import ModelConfig, LoadConfig
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoRAConfig, MultiModalConfig,
                         ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig,
                         SpeculativeConfig, TokenizerPoolConfig)
from vllm.executor.executor_base import ExecutorBase
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import str_to_int_tuple

if TYPE_CHECKING:
    from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (BaseTokenizerGroup)

logger = init_logger(__name__)


def nullable_str(val: str):
    if not val or val == "None":
        return None
    return val


@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model_hf_config: PretrainedConfig = None  # for verl
    served_model_name = None  # TODO(sgm): check this
    # tokenizer: Optional[str] = None  # TODO(sgm): check this
    skip_tokenizer_init: bool = False
    tokenizer_mode: str = 'auto'
    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    quantization_param_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    # Note: Specifying a custom executor backend by passing a class
    # is intended for expert use only. The API may change without
    # notice.
    distributed_executor_backend: Optional[Union[str, Type[ExecutorBase]]] = None
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    enable_prefix_caching: bool = False
    disable_sliding_window: bool = False
    use_v2_block_manager: bool = False
    swap_space: int = 4  # GiB
    cpu_offload_gb: int = 0  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_logprobs: int = 20  # Default value for OpenAI Chat Completions API
    disable_log_stats: bool = False
    revision: Optional[str] = None
    code_revision: Optional[str] = None
    rope_scaling: Optional[dict] = None
    rope_theta: Optional[float] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: Optional[int] = None
    max_seq_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    tokenizer_pool_size: int = 0
    # Note: Specifying a tokenizer pool by passing a class
    # is intended for expert use only. The API may change without
    # notice.
    tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    enable_prompt_adapter: bool = False
    max_prompt_adapters: int = 1
    max_prompt_adapter_token: int = 0
    fully_sharded_loras: bool = False
    lora_extra_vocab_size: int = 256
    long_lora_scaling_factors: Optional[Tuple[float]] = None
    lora_dtype: str = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'auto'
    ray_workers_use_nsight: bool = False
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0
    model_loader_extra_config: Optional[dict] = None
    ignore_patterns: Optional[Union[str, List[str]]] = None
    preemption_mode: Optional[str] = None

    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: Optional[bool] = None

    guided_decoding_backend: str = 'outlines'
    # Speculative decoding configuration.
    speculative_model: Optional[str] = None
    speculative_draft_tensor_parallel_size: Optional[int] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
    speculative_disable_by_batch_size: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None
    spec_decoding_acceptance_method: str = 'rejection_sampler'
    typical_acceptance_sampler_posterior_threshold: Optional[float] = None
    typical_acceptance_sampler_posterior_alpha: Optional[float] = None
    qlora_adapter_name_or_path: Optional[str] = None
    disable_logprobs_during_spec_decoding: Optional[bool] = None

    otlp_traces_endpoint: Optional[str] = None

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
        # Model arguments
        # TODO(shengguangming): delete the unused args
        parser.add_argument('--model',
                            type=str,
                            default='facebook/opt-125m',
                            help='name or path of the huggingface model to use')
        parser.add_argument('--tokenizer',
                            type=str,
                            default=EngineArgs.tokenizer,
                            help='name or path of the huggingface tokenizer to use')
        parser.add_argument('--revision',
                            type=str,
                            default=None,
                            help='the specific model version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-revision',
                            type=str,
                            default=None,
                            help='the specific tokenizer version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        parser.add_argument('--load-format',
                            type=str,
                            default=EngineArgs.load_format,
                            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
                            help='The format of the model weights to load. '
                            '"auto" will try to load the weights in the safetensors format '
                            'and fall back to the pytorch bin format if safetensors format '
                            'is not available. '
                            '"pt" will load the weights in the pytorch bin format. '
                            '"safetensors" will load the weights in the safetensors format. '
                            '"npcache" will load the weights in pytorch format and store '
                            'a numpy cache to speed up the loading. '
                            '"dummy" will initialize the weights with random values, '
                            'which is mainly for profiling.')
        parser.add_argument('--dtype',
                            type=str,
                            default=EngineArgs.dtype,
                            choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
                            help='data type for model weights and activations. '
                            'The "auto" option will use FP16 precision '
                            'for FP32 and FP16 models, and BF16 precision '
                            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization',
                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the percentage of GPU memory to be used for '
                            'the model executor')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', None],
                            default=None,
                            help='Method used to quantize the weights')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_config(self,) -> EngineConfig:
        # bitsandbytes quantization needs a specific model loader
        # so we make sure the quant method and the load format are consistent
        if (self.quantization == "bitsandbytes" or
                self.qlora_adapter_name_or_path is not None) and \
                self.load_format != "bitsandbytes":
            raise ValueError("BitsAndBytes quantization and QLoRA adapter only support "
                             f"'bitsandbytes' load format, but got {self.load_format}")

        if (self.load_format == "bitsandbytes" or
                self.qlora_adapter_name_or_path is not None) and \
                self.quantization != "bitsandbytes":
            raise ValueError("BitsAndBytes load format and QLoRA adapter only support "
                             f"'bitsandbytes' quantization, but got {self.quantization}")

        assert self.cpu_offload_gb >= 0, ("CPU offload space must be non-negative"
                                          f", but got {self.cpu_offload_gb}")

        multimodal_config = MultiModalConfig()
        device_config = DeviceConfig(self.device)
        # NOTE(sgm): we only modify ModelConfig; the other configs are imported from vllm
        model_config = ModelConfig(hf_config=self.model_hf_config,
                                   tokenizer_mode=self.tokenizer_mode,
                                   trust_remote_code=self.trust_remote_code,
                                   dtype=self.dtype,
                                   seed=self.seed,
                                   revision=self.revision,
                                   code_revision=self.code_revision,
                                   rope_scaling=self.rope_scaling,
                                   rope_theta=self.rope_theta,
                                   tokenizer_revision=self.tokenizer_revision,
                                   max_model_len=self.max_model_len,
                                   quantization=self.quantization,
                                   quantization_param_path=self.quantization_param_path,
                                   enforce_eager=self.enforce_eager,
                                   max_context_len_to_capture=self.max_context_len_to_capture,
                                   max_seq_len_to_capture=self.max_seq_len_to_capture,
                                   max_logprobs=self.max_logprobs,
                                   disable_sliding_window=self.disable_sliding_window,
                                   skip_tokenizer_init=self.skip_tokenizer_init,
                                   served_model_name=self.served_model_name,
                                   multimodal_config=multimodal_config)
        cache_config = CacheConfig(
            block_size=self.block_size,
            gpu_memory_utilization=self.gpu_memory_utilization,
            swap_space=self.swap_space,
            cache_dtype=self.kv_cache_dtype,
            num_gpu_blocks_override=self.num_gpu_blocks_override,
            sliding_window=model_config.get_sliding_window(),
            enable_prefix_caching=self.enable_prefix_caching,
            cpu_offload_gb=self.cpu_offload_gb,
        )
        parallel_config = ParallelConfig(
            pipeline_parallel_size=self.pipeline_parallel_size,
            tensor_parallel_size=self.tensor_parallel_size,
            worker_use_ray=self.worker_use_ray,
            max_parallel_loading_workers=self.max_parallel_loading_workers,
            disable_custom_all_reduce=self.disable_custom_all_reduce,
            tokenizer_pool_config=TokenizerPoolConfig.create_config(
                self.tokenizer_pool_size,
                self.tokenizer_pool_type,
                self.tokenizer_pool_extra_config,
            ),
            ray_workers_use_nsight=self.ray_workers_use_nsight,
            distributed_executor_backend=self.distributed_executor_backend)

        # NOTE[VERL]: Use the world_size set by TORCHRUN
        world_size = int(os.getenv("WORLD_SIZE", "-1"))
        assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
        parallel_config.world_size = world_size

        max_model_len = model_config.max_model_len
        use_long_context = max_model_len > 32768
        if self.enable_chunked_prefill is None:
            # If not explicitly set, enable chunked prefill by default for
            # long context (> 32K) models. This is to avoid OOM errors in the
            # initial memory profiling phase.
            if use_long_context:
                is_gpu = device_config.device_type == "cuda"
                use_sliding_window = (model_config.get_sliding_window() is not None)
                use_spec_decode = self.speculative_model is not None
                has_seqlen_agnostic_layers = (model_config.contains_seqlen_agnostic_layers(parallel_config))
                if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and
                        not self.enable_prompt_adapter and not self.enable_prefix_caching and
                        not has_seqlen_agnostic_layers):
                    self.enable_chunked_prefill = True
                    logger.warning("Chunked prefill is enabled by default for models with "
                                   "max_model_len > 32K. Currently, chunked prefill might "
                                   "not work with some features or models. If you "
                                   "encounter any issues, please disable chunked prefill "
                                   "by setting --enable-chunked-prefill=False.")
            if self.enable_chunked_prefill is None:
                self.enable_chunked_prefill = False

        if not self.enable_chunked_prefill and use_long_context:
            logger.warning(
                "The model has a long context length (%s). This may cause OOM "
                "errors during the initial memory profiling phase, or result "
                "in low performance due to small KV cache space. Consider "
                "setting --max-model-len to a smaller value.", max_model_len)

        # TODO: spec config
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            speculative_draft_tensor_parallel_size=self.speculative_draft_tensor_parallel_size,
            num_speculative_tokens=self.num_speculative_tokens,
            speculative_disable_by_batch_size=self.speculative_disable_by_batch_size,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,
            disable_log_stats=self.disable_log_stats,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
            draft_token_acceptance_method=self.spec_decoding_acceptance_method,
            typical_acceptance_sampler_posterior_threshold=self.typical_acceptance_sampler_posterior_threshold,
            typical_acceptance_sampler_posterior_alpha=self.typical_acceptance_sampler_posterior_alpha,
            disable_logprobs=self.disable_logprobs_during_spec_decoding,
        )

        scheduler_config = SchedulerConfig(
            max_num_batched_tokens=self.max_num_batched_tokens,
            max_num_seqs=self.max_num_seqs,
            max_model_len=model_config.max_model_len,
            use_v2_block_manager=self.use_v2_block_manager,
            num_lookahead_slots=(self.num_lookahead_slots
                                 if speculative_config is None else speculative_config.num_lookahead_slots),
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
            embedding_mode=model_config.embedding_mode,
            preemption_mode=self.preemption_mode,
        )

        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,
            fully_sharded_loras=self.fully_sharded_loras,
            lora_extra_vocab_size=self.lora_extra_vocab_size,
            long_lora_scaling_factors=self.long_lora_scaling_factors,
            lora_dtype=self.lora_dtype,
            max_cpu_loras=self.max_cpu_loras
            if self.max_cpu_loras and self.max_cpu_loras > 0 else None) if self.enable_lora else None

        if self.qlora_adapter_name_or_path is not None and \
                self.qlora_adapter_name_or_path != "":
            if self.model_loader_extra_config is None:
                self.model_loader_extra_config = {}
            self.model_loader_extra_config["qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path

        load_config = LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
            model_loader_extra_config=self.model_loader_extra_config,
            ignore_patterns=self.ignore_patterns,
        )

        prompt_adapter_config = PromptAdapterConfig(
            max_prompt_adapters=self.max_prompt_adapters,
            max_prompt_adapter_token=self.max_prompt_adapter_token) \
            if self.enable_prompt_adapter else None

        decoding_config = DecodingConfig(guided_decoding_backend=self.guided_decoding_backend)

        observability_config = ObservabilityConfig(otlp_traces_endpoint=self.otlp_traces_endpoint)

        if (model_config.get_sliding_window() is not None and scheduler_config.chunked_prefill_enabled and
                not scheduler_config.use_v2_block_manager):
            raise ValueError("Chunked prefill is not supported with sliding window. "
                             "Set --disable-sliding-window to disable sliding window.")

        return EngineConfig(
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            multimodal_config=multimodal_config,
            speculative_config=speculative_config,
            load_config=load_config,
            decoding_config=decoding_config,
            observability_config=observability_config,
            prompt_adapter_config=prompt_adapter_config,
        )
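Unlike upstream vLLM, this `EngineArgs` takes an already-instantiated HF `PretrainedConfig` (`model_hf_config`) instead of a model path, and `create_engine_config` reads `WORLD_SIZE` from the torchrun environment. The sketch below shows one way it might be driven; it assumes vLLM 0.5.4 plus this package are importable, a torchrun-style launch (so `WORLD_SIZE` is set), and the Qwen model name is only an illustrative choice.

# Hedged usage sketch for the verl EngineArgs above (not part of the repo).
import os
from transformers import AutoConfig
from verl.third_party.vllm.vllm_v_0_5_4.arg_utils import EngineArgs

os.environ.setdefault("WORLD_SIZE", "1")  # normally set by torchrun
hf_config = AutoConfig.from_pretrained("Qwen/Qwen2-0.5B-Instruct")  # example model
engine_args = EngineArgs(model_hf_config=hf_config,
                         tensor_parallel_size=1,
                         dtype="bfloat16",
                         gpu_memory_utilization=0.5,
                         enforce_eager=True,   # needed so the KV cache can be freed/rebuilt
                         load_format="dtensor")
engine_config = engine_args.create_engine_config()
print(engine_config.model_config.max_model_len)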
verl/third_party/vllm/vllm_v_0_5_4/config.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
import enum
import json
from typing import List, Optional, Union
from dataclasses import dataclass, field, fields

import torch
from transformers import PretrainedConfig

from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import get_quantization_config
from vllm.transformers_utils.config import get_hf_text_config
from vllm.utils import is_hip, print_warning_once
# Add for verl
from vllm.config import ModelConfig, _get_and_verify_dtype, _get_and_verify_max_len, get_served_model_name

GPTQMarlinConfig = get_quantization_config("gptq_marlin")

logger = init_logger(__name__)

_GB = 1 << 30


class ModelConfig(ModelConfig):
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        download_dir: Directory to download and load the weights, default to the
            default cache directory of huggingface.
        load_format: The format of the model weights to load:
            "auto" will try to load the weights in the safetensors format and
                fall back to the pytorch bin format if safetensors format is
                not available.
            "pt" will load the weights in the pytorch bin format.
            "safetensors" will load the weights in the safetensors format.
            "npcache" will load the weights in pytorch format and store
                a numpy cache to speed up the loading.
            "dummy" will initialize the weights with random values, which is
                mainly for profiling.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        code_revision: The specific revision to use for the model code on
            Hugging Face Hub. It can be a branch name, a tag name, or a
            commit id. If unspecified, will use the default version.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, will use
            the default version.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        quantization_param_path: Path to JSON file containing scaling factors.
            Used to load KV cache scaling factors into the model when KV cache
            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
            be used to load activation and weight scaling factors when the
            model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode (DEPRECATED. Use max_seq_len_to_capture instead).
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer.
        served_model_name: The model name used in metrics tag `model_name`,
            matches the model name exposed via the APIs. If multiple model
            names provided, the first name will be used. If not specified,
            the model name will be the same as `model`.
    """

    def __init__(
        self,
        hf_config: PretrainedConfig,
        tokenizer_mode: str,
        trust_remote_code: bool,
        dtype: Union[str, torch.dtype],
        seed: int,
        revision: Optional[str] = None,
        code_revision: Optional[str] = None,
        rope_scaling: Optional[dict] = None,
        rope_theta: Optional[float] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        quantization_param_path: Optional[str] = None,
        enforce_eager: bool = False,
        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 20,
        disable_sliding_window: bool = False,
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, List[str]]] = None,
        multimodal_config: Optional["MultiModalConfig"] = None,
    ) -> None:
        self.model = hf_config._name_or_path
        self.tokenizer = hf_config._name_or_path
        # NOTE(sgm): same as open-sourced
        self.tokenizer_mode = tokenizer_mode
        self.trust_remote_code = trust_remote_code
        self.seed = seed
        self.revision = revision
        self.code_revision = code_revision
        self.rope_scaling = rope_scaling
        self.rope_theta = rope_theta
        # The tokenizer version is consistent with the model version by default.
        if tokenizer_revision is None:
            self.tokenizer_revision = revision
        else:
            self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        if max_context_len_to_capture is not None:
            raise ValueError("`max_context_len_to_capture` is deprecated. "
                             "Use `max_seq_len_to_capture` instead.")
        self.max_seq_len_to_capture = max_seq_len_to_capture
        self.max_logprobs = max_logprobs
        self.disable_sliding_window = disable_sliding_window
        self.skip_tokenizer_init = skip_tokenizer_init

        # self.hf_config = get_config(model, trust_remote_code, revision)
        self.hf_config = hf_config
        self.hf_text_config = get_hf_text_config(hf_config)
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)

        # self.served_model_name = get_served_model_name(model,
        #                                                served_model_name)
        # self._verify_load_format()
        # self._verify_tokenizer_mode()

        if (not self.disable_sliding_window and self.hf_text_config.model_type == "gemma2" and
                self.hf_text_config.sliding_window is not None):
            print_warning_once(
                "Gemma 2 uses sliding window attention for every odd layer, "
                "which is currently not supported by vLLM. Disabling sliding "
                "window and capping the max length to the sliding window size "
                f"({self.hf_text_config.sliding_window}).")
            self.disable_sliding_window = True

        self.max_model_len = _get_and_verify_max_len(hf_config=self.hf_text_config,
                                                     max_model_len=max_model_len,
                                                     disable_sliding_window=self.disable_sliding_window,
                                                     sliding_window_len=self.get_hf_config_sliding_window())
        self.served_model_name = get_served_model_name(self.model,  # str
                                                       served_model_name)
        self.multimodal_config = multimodal_config

        if not self.skip_tokenizer_init:
            self._verify_tokenizer_mode()
        self._verify_embedding_mode()
        self._verify_quantization()
        self._verify_cuda_graph()


class LoadFormat(str, enum.Enum):
    AUTO = 'auto'
    MEGATRON = "megatron"
    HF = "hf"
    DTENSOR = 'dtensor'
    DUMMY_HF = 'dummy_hf'
    DUMMY_MEGATRON = 'dummy_megatron'
    DUMMY_DTENSOR = 'dummy_dtensor'


# TODO: check whether this is necessary
@dataclass
class LoadConfig:
    """
    download_dir: Directory to download and load the weights, default to the
        default cache directory of huggingface.
    load_format: The format of the model weights to load:
        "auto" will try to load the weights in the safetensors format and
            fall back to the pytorch bin format if safetensors format is
            not available.
        "pt" will load the weights in the pytorch bin format.
        "safetensors" will load the weights in the safetensors format.
        "npcache" will load the weights in pytorch format and store
            a numpy cache to speed up the loading.
        "dummy" will initialize the weights with random values, which is
            mainly for profiling.
        "tensorizer" will use CoreWeave's tensorizer library for
            fast weight loading.
        "bitsandbytes" will load nf4 type weights.
    ignore_patterns: The list of patterns to ignore when loading the model.
        Default to "original/**/*" to avoid repeated loading of llama's
        checkpoints.
    """

    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)
    ignore_patterns: Optional[Union[List[str], str]] = None

    def __post_init__(self):
        model_loader_extra_config = self.model_loader_extra_config or {}
        if isinstance(model_loader_extra_config, str):
            self.model_loader_extra_config = json.loads(model_loader_extra_config)
        self._verify_load_format()

        if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
            logger.info("Ignoring the following patterns when downloading weights: %s", self.ignore_patterns)
        else:
            self.ignore_patterns = ["original/**/*"]

    def _verify_load_format(self) -> None:
        if not isinstance(self.load_format, str):
            return

        load_format = self.load_format.lower()
        self.load_format = LoadFormat(load_format)

        rocm_not_supported_load_format: List[str] = []
        if is_hip() and load_format in rocm_not_supported_load_format:
            rocm_supported_load_format = [
                f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format)
            ]
            raise ValueError(f"load format '{load_format}' is not supported in ROCm. "
                             f"Supported load formats are "
                             f"{rocm_supported_load_format}")
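This `LoadConfig` extends the upstream one with verl-specific load formats (megatron, hf, dtensor and their dummy variants); `__post_init__` lower-cases a string input and converts it to the enum, and falls back to the default ignore pattern. A tiny usage sketch, assuming vLLM 0.5.4 and this package are importable:

# Small sketch of the LoadFormat / LoadConfig normalization above.
from verl.third_party.vllm.vllm_v_0_5_4.config import LoadConfig, LoadFormat

cfg = LoadConfig(load_format="DTENSOR")
assert cfg.load_format is LoadFormat.DTENSOR          # string was lower-cased and converted
assert cfg.ignore_patterns == ["original/**/*"]       # default set in __post_init__
print([fmt.value for fmt in LoadFormat])              # auto, megatron, hf, dtensor, dummy_*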
verl/third_party/vllm/vllm_v_0_5_4/dtensor_weight_loaders.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict, Iterable, Tuple

import torch
import torch.nn as nn
from torch.distributed._tensor import DTensor, Shard, Replicate

from vllm.model_executor.layers.linear import *
from vllm.model_executor.models import ModelRegistry
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.utils import is_pp_missing_parameter


def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        for (param_name, shard_name, shard_id) in stacked_params_mapping:
            if shard_name not in name:
                continue
            stacked_name = name.replace(shard_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if stacked_name.endswith(".bias") and stacked_name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[stacked_name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # lm_head is not used in vllm as it is tied with embed_token.
            # To prevent errors, skip loading lm_head.weight.
            if "lm_head.weight" in name:
                continue
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def gptbigcode_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "lm_head.weight" in name:
            continue
        if ".attn.bias" in name:
            # Skip attention mask.
            # NOTE: "c_attn.bias" should not be skipped.
            continue
        local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
        param = params_dict[name]
        weight_loader = getattr(param, "weight_loader", default_weight_loader)
        weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def starcoder2_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]

    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue

        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
                continue
            param = params_dict[name]
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def llama_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        (".qkv_proj", ".q_proj", "q"),
        (".qkv_proj", ".k_proj", "k"),
        (".qkv_proj", ".v_proj", "v"),
        (".gate_up_proj", ".gate_proj", 0),
        (".gate_up_proj", ".up_proj", 1),
    ]
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        if ("rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name):
            # Models trained using ColossalAI may include these tensors in
            # the checkpoint. Skip them.
            continue
        # With tie_word_embeddings, we can skip lm_head.weight
        # The weight might appear unnecessarily in the files if the model is
        # processed with quantization, LoRA, fine-tuning, etc.
        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight)


def qwen2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            param = params_dict[name]
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


from vllm.model_executor.layers.fused_moe import FusedMoE


def deepseekv2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    # Params for weights, fp8 weight scales, fp8 activation scales
    # (param_name, weight_name, expert_id, shard_id)
    expert_params_mapping = FusedMoE.make_expert_params_mapping(
        ckpt_gate_proj_name="gate_proj",
        ckpt_down_proj_name="down_proj",
        ckpt_up_proj_name="up_proj",
        num_experts=vllm_model.config.n_routed_experts)

    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            # Skip non-stacked layers and experts (experts handled below).
            if weight_name not in name:
                continue
            # We have mlp.experts[0].gate_proj in the checkpoint.
            # Since we handle the experts below in expert_params_mapping,
            # we need to skip here BEFORE we update the name, otherwise
            # name will be updated to mlp.experts[0].gate_up_proj, which
            # will then be updated below in expert_params_mapping
            # for mlp.experts[0].gate_gate_up_proj, which breaks load.
            if (("mlp.experts." in name) and name not in params_dict):
                continue
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            if is_pp_missing_parameter(name, vllm_model):
                continue
            param = params_dict[name]
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            for mapping in expert_params_mapping:
                param_name, weight_name, expert_id, shard_id = mapping
                if weight_name not in name:
                    continue
                name = name.replace(weight_name, param_name)

                if is_pp_missing_parameter(name, vllm_model):
                    continue
                param = params_dict[name]
                local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param,
                              local_loaded_weight.to(dtype=param.dtype),
                              weight_name,
                              shard_id=shard_id,
                              expert_id=expert_id)
                break
            else:
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                if is_pp_missing_parameter(name, vllm_model):
                    continue
                param = params_dict[name]
                local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
                weight_loader = getattr(param, "weight_loader", default_weight_loader)
                weight_loader(param, local_loaded_weight.to(dtype=param.dtype))


def gpt2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    pass


def redistribute_dtensor(param_name: str, loaded_weights: DTensor, parallelize_plan: Dict = None):
    param_name = _process_parameter_names(name=param_name)
    if parallelize_plan is not None:
        assert param_name in parallelize_plan.keys(), \
            f"param name: {param_name} not in parallelize_plan: {parallelize_plan.keys()}"
        placement = parallelize_plan[param_name]
        local_loaded_weights = loaded_weights.redistribute(device_mesh=loaded_weights.device_mesh,
                                                           placements=placement).to_local()
    else:
        local_loaded_weights = loaded_weights.full_tensor()
    return local_loaded_weights


def _process_parameter_names(name):
    # Remove '.weight' if it exists at the end of the string
    if name.endswith(".weight"):
        name = name[:-7]

    # Remove 'model.layers.x.' or 'model.' prefix
    if "model.layers" in name:
        parts = name.split('.')
        # Reconstruct the string without 'model.layers.x.'
        name = '.'.join(parts[3:])  # parts[0] is 'model', parts[1] is 'layers', parts[2] is 'x'
    elif name.startswith("model."):
        name = name[6:]  # Remove 'model.'

    return name


__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__ = {
    'GPT2LMHeadModel': gpt2_dtensor_weight_loader,
    'LlamaForCausalLM': llama_dtensor_weight_loader,
    'LLaMAForCausalLM': llama_dtensor_weight_loader,
    'MistralForCausalLM': llama_dtensor_weight_loader,  # mistral is the same as llama in vLLM
    'InternLMForCausalLM': llama_dtensor_weight_loader,
    'AquilaModel': llama_dtensor_weight_loader,
    'AquilaForCausalLM': llama_dtensor_weight_loader,
    'Phi3ForCausalLM': llama_dtensor_weight_loader,
    'GemmaForCausalLM': gemma_dtensor_weight_loader,
    'Gemma2ForCausalLM': gemma_dtensor_weight_loader,
    'GPTBigCodeForCausalLM': gptbigcode_dtensor_load_weights,
    'Starcoder2ForCausalLM': starcoder2_dtensor_load_weights,
    'Qwen2ForCausalLM': qwen2_dtensor_weight_loader,
    'DeepseekV2ForCausalLM': deepseekv2_dtensor_weight_loader
}


# the actor model is .state_dict()
# Load dtensor weights
def load_dtensor_weights(actor_weights: Dict, vllm_model: nn.Module):
    weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
    weight_loader(actor_weights, vllm_model)
    # NOTE(sgm): to reduce peak memory usage, we offload the vllm model to cpu
    # after init, so we need this after syncing the model weights in the first iteration.
    vllm_model = vllm_model.cuda()


def _get_model_weight_loader(arch: str):
    if arch in __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__:
        return __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__[arch]
    raise ValueError(f"Model architectures {arch} are not supported for now. "
                     f"Supported architectures: {__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__.keys()}")


# NOTE(sgm): we use a per-parameter weight loader in each vllm sub-module
def update_dtensor_weight_loader():
    pass
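Before gathering or redistributing a DTensor, `redistribute_dtensor` normalizes the FSDP/HF parameter name with `_process_parameter_names` so it matches the keys of an optional parallelize plan. The snippet below only exercises that string normalization (it needs vLLM installed for the import, but no distributed setup); the example parameter names are illustrative.

# Quick illustration of the name normalization used by redistribute_dtensor().
from verl.third_party.vllm.vllm_v_0_5_4.dtensor_weight_loaders import _process_parameter_names

print(_process_parameter_names("model.layers.0.self_attn.q_proj.weight"))  # -> self_attn.q_proj
print(_process_parameter_names("model.embed_tokens.weight"))               # -> embed_tokens
print(_process_parameter_names("lm_head.weight"))                          # -> lm_head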
verl/third_party/vllm/vllm_v_0_5_4/hf_weight_loader.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict, Union, Optional, Iterable, Tuple

import torch
import torch.nn as nn

from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.model_loader.weight_utils import default_weight_loader


def update_hf_weight_loader():
    print('no hf weight loader need to be updated')
    return


def load_hf_weights(actor_weights: Dict, vllm_model: nn.Module):
    assert isinstance(actor_weights, Dict)
    with set_default_torch_dtype(next(vllm_model.parameters()).dtype):  # TODO
        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in actor_weights.keys():
            del actor_weights["lm_head.weight"]
        vllm_model.load_weights(actor_weights.items())
    for _, module in vllm_model.named_modules():
        quant_method = getattr(module, "quant_method", None)
        if quant_method is not None:
            quant_method.process_weights_after_loading(module)
        # FIXME: Remove this after Mixtral is updated
        # to use quant_method.
        if hasattr(module, "process_weights_after_loading"):
            module.process_weights_after_loading()
    vllm_model = vllm_model.cuda()
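After `load_weights`, the loader walks every submodule and gives it a chance to finalize its weights (e.g. repacking for a quantization kernel). Below is a standalone sketch of that hook-scanning pattern in plain PyTorch; `ToyBlock` and its `finalized` flag are made-up stand-ins, not vLLM modules.

# Standalone sketch of the post-load hook scan used in load_hf_weights().
import torch.nn as nn

class ToyBlock(nn.Module):

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)
        self.finalized = False

    def process_weights_after_loading(self):
        # In vLLM this would repack/convert weights; here we just record the call.
        self.finalized = True

model = nn.Sequential(ToyBlock(), ToyBlock())
for _, module in model.named_modules():
    if hasattr(module, "process_weights_after_loading"):
        module.process_weights_after_loading()
print(all(block.finalized for block in model))  # True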
verl/third_party/vllm/vllm_v_0_5_4/llm.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
from contextlib import contextmanager
from typing import ClassVar, List, Optional, Sequence, Union, cast, overload, Dict, Tuple

from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import PretrainedConfig
import torch.nn as nn

from .arg_utils import EngineArgs
from .llm_engine_sp import LLMEngine
from vllm import LLM
from vllm.inputs import (PromptInputs, TextPrompt, TokensPrompt, parse_and_batch_prompt)
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.model_executor.guided_decoding import (GuidedDecodingRequest, get_local_guided_decoding_logits_processor)
from vllm.model_executor.guided_decoding.guided_fields import LLMGuidedOptions
from vllm.outputs import EmbeddingRequestOutput, RequestOutput
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import get_cached_tokenizer
from vllm.usage.usage_lib import UsageContext
from vllm.utils import Counter, deprecate_kwargs
import torch
from torch.nn.utils.rnn import pad_sequence
from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer


class LLM(LLM):
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    NOTE: This class is intended to be used for offline inference. For online
    serving, use the `AsyncLLMEngine` class instead.

    NOTE: For the comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: A HuggingFace Transformers model instance.
        tokenizer: A HuggingFace Transformers tokenizer instance.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq". If None, we assume the model weights are not
            quantized and use `dtype` to determine the data type of the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Otherwise, too small values may cause out-of-memory (OOM) errors.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode.
        disable_custom_all_reduce: See ParallelConfig
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
        model_hf_config: PretrainedConfig,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        skip_tokenizer_init: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        cpu_offload_gb: float = 0,
        enforce_eager: bool = False,
        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        load_format='auto',
        **kwargs,
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model_hf_config=model_hf_config,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            cpu_offload_gb=cpu_offload_gb,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            max_seq_len_to_capture=max_seq_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            load_format=load_format,
            skip_tokenizer_init=skip_tokenizer_init,
            **kwargs,
        )
        tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
        if not isinstance(tokenizer, tokenizer_cls):
            raise ValueError(f"Unexpected tokenizer type: {type(tokenizer)}. Must be "
                             "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, "
                             "verl.workers.rollout.HybridEngineBaseTokenizer")
        self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)  # TODO: check usagecontext
        self.request_counter = Counter()

    def init_cache_engine(self):
        self.llm_engine.init_cache_engine()

    def free_cache_engine(self):
        self.llm_engine.free_cache_engine()

    def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        self.llm_engine.tokenizer = tokenizer

    def _run_engine(self, *, use_tqdm: bool) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(
                total=num_requests,
                desc="Processed prompts",
                dynamic_ncols=True,
                postfix=(f"est. speed input: {0:.2f} toks/s, "
                         f"output: {0:.2f} toks/s"),
            )
        # Run the engine.
        outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = []
        total_in_toks = 0
        total_out_toks = 0
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        if isinstance(output, RequestOutput):
                            # Calculate tokens only for RequestOutput
                            total_in_toks += len(output.prompt_token_ids)
                            in_spd = total_in_toks / pbar.format_dict["elapsed"]
                            total_out_toks += sum(len(stp.token_ids) for stp in output.outputs)
                            out_spd = total_out_toks / pbar.format_dict["elapsed"]
                            pbar.postfix = (f"est. speed input: {in_spd:.2f} toks/s, "
                                            f"output: {out_spd:.2f} toks/s")
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # its previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        return self._post_process_outputs(outputs)

    # # NOTE(shengguangming): add for verl
    # # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
    # def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
    #     # remove the left padding in the prompt token_id
    #     pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
    #     non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
    #     token_ids = prompt_token_ids[non_pad_index:].tolist()
    #     return token_ids

    # NOTE(shengguangming): add for verl
    def _post_process_outputs(self, request_outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
        output_token_ids = []
        logprobs = []
        for request_output in request_outputs:  # List[RequestOutput]
            outputs = request_output.outputs
            for output in outputs:  # List[CompletionOutput], usually len == 1
                output_token_ids.append(torch.tensor(output.token_ids))
                # TODO(shengguangming): can be optimized by rewriting the Sampler._get_logprobs() logits
                logprobs_dicts = output.logprobs
                if logprobs_dicts is not None:
                    logprob = []
                    for logprobs_dict, id in zip(logprobs_dicts, output.token_ids):
                        logprob.append(logprobs_dict[id].logprob)
                    logprobs.append(torch.tensor(logprob))

        pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None \
            else self.llm_engine.tokenizer.eos_token_id
        output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
        if len(logprobs) > 0:
            logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
        return output_token_ids, logprobs

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
        self.llm_engine.sync_model_weights(actor_weights=actor_weights, load_format=load_format)

    def offload_model_weights(self) -> None:
        self.llm_engine.offload_model_weights()
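`_post_process_outputs` returns batched tensors rather than vLLM's `RequestOutput` objects: completions of different lengths are right-padded with the tokenizer's pad id (falling back to the eos id). A standalone sketch of just that padding step, with made-up token ids:

# Standalone sketch of the padding performed in _post_process_outputs().
import torch
from torch.nn.utils.rnn import pad_sequence

pad_token_id = 0  # would come from the tokenizer (pad_token_id or eos_token_id)
responses = [torch.tensor([11, 12, 13]), torch.tensor([21, 22])]
batch = pad_sequence(responses, batch_first=True, padding_value=pad_token_id)
print(batch)  # tensor([[11, 12, 13], [21, 22,  0]])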
verl/third_party/vllm/vllm_v_0_5_4/llm_engine_sp.py
0 → 100644
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
import torch
from typing import Dict, Optional, Union, Type

import vllm.envs as envs
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoRAConfig, MultiModalConfig,
                         ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig)
from vllm.core.scheduler import Scheduler
from vllm.engine.output_processor.interfaces import (SequenceGroupOutputProcessor)
from vllm.engine.output_processor.stop_checker import StopChecker
from vllm.executor.executor_base import ExecutorBase
from vllm.inputs import INPUT_REGISTRY, LLMInputs, PromptInputs
from vllm.logger import init_logger
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.engine.metrics import (LoggingStatLogger, PrometheusStatLogger, StatLoggerBase, Stats)
from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, init_tracer)
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message)
from vllm.utils import Counter
from vllm.engine.llm_engine import _load_generation_config_dict
from vllm.engine.llm_engine import LLMEngine
from vllm.version import __version__ as VLLM_VERSION
import torch.nn as nn

from .arg_utils import EngineArgs
from .tokenizer import TokenizerGroup
from .config import ModelConfig, LoadConfig

logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5


class LLMEngine(LLMEngine):
"""An LLM engine that receives requests and generates texts.
This is the main class for the vLLM engine. It receives requests
from clients and generates texts from the LLM. It includes a tokenizer, a
language model (possibly distributed across multiple GPUs), and GPU memory
space allocated for intermediate states (aka KV cache). This class utilizes
iteration-level scheduling and efficient memory management to maximize the
serving throughput.
The `LLM` class wraps this class for offline batched inference and the
`AsyncLLMEngine` class wraps this class for online serving.
NOTE: The config arguments are derived from the `EngineArgs` class. For the
comprehensive list of arguments, see `EngineArgs`.
Args:
model: the actor model initialize outside vllm (add for verl)
tokenizer: the initialized tokenizer (add for verl)
model_config: The configuration related to the LLM model.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
distributed_init_method: The initialization method for distributed
execution. See `torch.distributed.init_process_group` for details.
placement_group: Ray placement group for distributed execution.
Required for distributed execution.
log_stats: Whether to log statistics.
"""
    def __init__(
        self,
        # NOTE(sgm): first two arguments are added for verl
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: nn.Module,
        # NOTE(sgm): vllm original arguments
        model_config: ModelConfig,
        cache_config: CacheConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        multimodal_config: Optional[MultiModalConfig],
        speculative_config: Optional[SpeculativeConfig],
        decoding_config: Optional[DecodingConfig],
        observability_config: Optional[ObservabilityConfig],
        prompt_adapter_config: Optional[PromptAdapterConfig],
        executor_class: Type[ExecutorBase],
        log_stats: bool,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    ) -> None:
        logger.info(
            "Initializing an LLM engine (v%s) with config: "
            "model=%r, speculative_config=%r, tokenizer=%r, "
            "skip_tokenizer_init=%s, revision=%s, "
            "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, "
            "trust_remote_code=%s, dtype=%s, max_seq_len=%d, "
            "download_dir=%r, load_format=%s, tensor_parallel_size=%d, "
            "pipeline_parallel_size=%d, "
            "disable_custom_all_reduce=%s, quantization=%s, "
            "enforce_eager=%s, kv_cache_dtype=%s, "
            "quantization_param_path=%s, device_config=%s, "
            "decoding_config=%r, observability_config=%r, "
            "seed=%d, served_model_name=%s, use_v2_block_manager=%s, "
            "enable_prefix_caching=%s)",
            VLLM_VERSION,
            model_config.model,
            speculative_config,
            model_config.tokenizer,
            model_config.skip_tokenizer_init,
            model_config.revision,
            model_config.rope_scaling,
            model_config.rope_theta,
            model_config.tokenizer_revision,
            model_config.trust_remote_code,
            model_config.dtype,
            model_config.max_model_len,
            load_config.download_dir,
            load_config.load_format,
            parallel_config.tensor_parallel_size,
            parallel_config.pipeline_parallel_size,
            parallel_config.disable_custom_all_reduce,
            model_config.quantization,
            model_config.enforce_eager,
            cache_config.cache_dtype,
            model_config.quantization_param_path,
            device_config.device,
            decoding_config,
            observability_config,
            model_config.seed,
            model_config.served_model_name,
            scheduler_config.use_v2_block_manager,
            cache_config.enable_prefix_caching,
        )
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config
        self.cache_config = cache_config
        self.lora_config = lora_config
        self.multimodal_config = multimodal_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.speculative_config = speculative_config
        self.load_config = load_config
        self.decoding_config = decoding_config or DecodingConfig()
        self.prompt_adapter_config = prompt_adapter_config
        self.observability_config = observability_config or ObservabilityConfig()
        self.log_stats = log_stats

        # self.model = model  # should not store the model, it should be deleted
        # TODO(shengguangming): maybe we can choose init here or from arguments
        if not self.model_config.skip_tokenizer_init:
            self.tokenizer = self._init_tokenizer(tokenizer)
            self.detokenizer = Detokenizer(self.tokenizer)
        else:
            self.tokenizer = None
            self.detokenizer = None

        self.seq_counter = Counter()
        self.generation_config_fields = _load_generation_config_dict(model_config)

        self.input_processor = INPUT_REGISTRY.create_input_processor(self.model_config)

        self.model_executor = executor_class(
            model=model,  # add for spmd_gpu_executor
            model_config=model_config,
            cache_config=cache_config,
            parallel_config=parallel_config,
            scheduler_config=scheduler_config,
            device_config=device_config,
            lora_config=lora_config,
            multimodal_config=multimodal_config,
            speculative_config=speculative_config,
            load_config=load_config,
            prompt_adapter_config=prompt_adapter_config,
        )

        # Profile the memory usage and initialize the cache.
        if not self.model_config.embedding_mode:
            self._initialize_kv_caches()

        # If usage stat is enabled, collect relevant info.
        if is_usage_stats_enabled():
            from vllm.model_executor.model_loader import (get_architecture_class_name)
            usage_message.report_usage(
                get_architecture_class_name(model_config),
                usage_context,
                extra_kvs={
                    # Common configuration
                    "dtype": str(model_config.dtype),
                    "tensor_parallel_size": parallel_config.tensor_parallel_size,
                    "block_size": cache_config.block_size,
                    "gpu_memory_utilization": cache_config.gpu_memory_utilization,
                    # Quantization
                    "quantization": model_config.quantization,
                    "kv_cache_dtype": str(cache_config.cache_dtype),
                    # Feature flags
                    "enable_lora": bool(lora_config),
                    "enable_prompt_adapter": bool(prompt_adapter_config),
                    "enable_prefix_caching": cache_config.enable_prefix_caching,
                    "enforce_eager": model_config.enforce_eager,
                    "disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
                })

        if self.tokenizer:
            # Ping the tokenizer to ensure liveness if it runs in a
            # different process.
            self.tokenizer.ping()

        # Create the scheduler.
        # NOTE: the cache_config here have been updated with the numbers of
        # GPU and CPU blocks, which are profiled in the distributed executor.
        self.scheduler = [
            Scheduler(scheduler_config, cache_config, lora_config, parallel_config.pipeline_parallel_size)
            for _ in range(parallel_config.pipeline_parallel_size)
        ]

        # Metric Logging.
        if self.log_stats:
            if stat_loggers is not None:
                self.stat_loggers = stat_loggers
            else:
                self.stat_loggers = {
                    "logging": LoggingStatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC),
                    "prometheus": PrometheusStatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
                                                       labels=dict(model_name=model_config.served_model_name),
                                                       max_model_len=self.model_config.max_model_len),
                }
                self.stat_loggers["prometheus"].info("cache_config", self.cache_config)

        self.tracer = None
        if self.observability_config.otlp_traces_endpoint:
            self.tracer = init_tracer("vllm.llm_engine", self.observability_config.otlp_traces_endpoint)

        # Create sequence output processor, e.g. for beam search or
        # speculative decoding.
        self.output_processor = (SequenceGroupOutputProcessor.create_output_processor(
            self.scheduler_config,
            self.detokenizer,
            self.scheduler,
            self.seq_counter,
            self.get_tokenizer_for_seq,
            stop_checker=StopChecker(
                self.scheduler_config.max_model_len,
                self.get_tokenizer_for_seq,
            ),
        ))
    # TODO(sgm): added for verl, but we may not need a tokenizer in Rollout
    def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
        init_kwargs = dict(enable_lora=bool(self.lora_config),
                           max_num_seqs=self.scheduler_config.max_num_seqs,
                           max_input_length=None)
        init_kwargs.update(tokenizer_init_kwargs)
        return TokenizerGroup(tokenizer, **init_kwargs)

    def init_cache_engine(self):
        # TODO: check whether we should rebuild the CUDAGraph every iter when offload/load KVCache
        # Re-capturing the CUDAGraph would be time-consuming
        self.model_executor.init_cache_engine()

    def free_cache_engine(self):
        self.model_executor.free_cache_engine()

    # NOTE(sgm): currently, we only support the GPU executor
    # The GPUExecutor removes the Ray dependency
    @classmethod
    def _get_executor_cls(cls, engine_config: EngineConfig) -> Type[ExecutorBase]:
        assert engine_config.device_config.device_type == "cuda", \
            "Currently, the vllm in verl only supports running on GPU"

        if engine_config.parallel_config.world_size == 1:
            engine_config.load_config.load_format = "dummy_hf"

        from .spmd_gpu_executor import SPMDGPUExecutor
        executor_class = SPMDGPUExecutor
        return executor_class

    @classmethod
    def from_engine_args(
        cls,
        model,
        tokenizer,
        engine_args: EngineArgs,
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
    ) -> "LLMEngine":
        """Creates an LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_config = engine_args.create_engine_config()
        executor_class = cls._get_executor_cls(engine_config)

        # Initialize the cluster and specify the executor class.
        assert engine_config.device_config.device_type == "cuda", \
            "Currently, the vllm in verl only supports running on GPU"

        from .spmd_gpu_executor import SPMDGPUExecutor
        executor_class = SPMDGPUExecutor

        # Create the LLM engine.
        engine = cls(
            model,
            tokenizer,
            **engine_config.to_dict(),
            executor_class=executor_class,
            log_stats=not engine_args.disable_log_stats,
            usage_context=usage_context,
            stat_loggers=stat_loggers,
        )
        return engine

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
        self.model_executor.sync_model_weights(actor_weights=actor_weights, load_format=load_format)

    def offload_model_weights(self) -> None:
        self.model_executor.offload_model_weights()
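This subclass keeps the upstream `LLMEngine` control flow but threads an externally constructed actor model and tokenizer down to the SPMD GPU executor, and exposes `sync_model_weights` / `offload_model_weights` so the trainer can swap rollout weights in and out. A hedged usage sketch of the intended call pattern; `actor_model`, `tokenizer`, `engine_args`, and `actor_state_dict` are assumed to be built elsewhere (e.g. by a verl rollout worker), and the "megatron" load format string is only an illustrative assumption:

# Hypothetical wiring; none of these objects are constructed in this file.
engine = LLMEngine.from_engine_args(actor_model, tokenizer, engine_args)

# Refresh the rollout weights from the trainer, then release them afterwards.
engine.sync_model_weights(actor_weights=actor_state_dict, load_format="megatron")
engine.offload_model_weights()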
verl/third_party/vllm/vllm_v_0_5_4/megatron_weight_loaders.py
0 → 100644
View file @
f87b35b2
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
from typing import Dict

import torch
import torch.nn as nn
from vllm.model_executor.layers.linear import *
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
from vllm.model_executor.layers.activation import ScaledActivation
from vllm.model_executor.models import ModelRegistry


# NOTE(shengguangming): replace the origin weight loader function in the class
def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Parallel Linear weight loader."""
    assert param.size() == loaded_weight.size(
    ), 'the parameter size is not aligned with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
        param.size(), loaded_weight.size())
    assert param.data.dtype == loaded_weight.data.dtype, "if we want to share weights, the data type should also be the same"

    param.data = loaded_weight.data


def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Default weight loader."""
    assert param.size() == loaded_weight.size()
    assert param.data.dtype == loaded_weight.data.dtype, "if we want to share weights, the data type should also be the same"

    param.data = loaded_weight.data


def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "lm_head.weight" in name:
            # GPT-2 ties the weights of the embedding layer and the final
            # linear layer.
            continue
        if ".attn.bias" in name or ".attn.masked_bias" in name:
            # Skip attention mask.
            # NOTE: "c_attn.bias" should not be skipped.
            continue
        if not name.startswith("transformer."):
            name = "transformer." + name
        param = params_dict[name]
        # The HF's GPT-2 implementation uses Conv1D instead of Linear.
        # Because of this, we need to transpose the weights.
        # Note(zhuohan): the logic below might break quantized models.
        for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
            if conv1d_weight_name not in name:
                continue
            if not name.endswith(".weight"):
                continue
            # TODO: check megatron
            loaded_weight = loaded_weight.t()
        weight_loader = getattr(param, "weight_loader", default_weight_loader)
        weight_loader(param, loaded_weight)


def llama_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
        ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
        ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('input_layernorm', 'input_layernorm'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def _replace_name(megatron_name, name_mapping):
    for m_name, v_name in name_mapping:
        if m_name not in megatron_name:
            continue
        if 'layers' in megatron_name:  # deal with decoder layers
            megatron_name = megatron_name.replace('decoder', 'model')
            megatron_name_list = megatron_name.split('.')
            if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
                param_name_list = megatron_name_list[:3]
                param_name_list.append(v_name)
                param_name = '.'.join(param_name_list)
            else:
                param_name_list = megatron_name_list[:3]
                weight_or_bias = megatron_name_list[-1]
                param_name_list.append(v_name)
                param_name_list.append(weight_or_bias)
                param_name = '.'.join(param_name_list)
            return param_name
        else:
            param_name = megatron_name.replace(m_name, v_name)
            return param_name
def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
        ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
        ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    params_mapping = [
        # (megatron core gpt model name, vllm model name)
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('input_layernorm', 'input_layernorm'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        name = _replace_name(name, params_mapping)
        if name.endswith('.bias') and name not in params_dict:
            continue
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def _replace_name(megatron_name, name_mapping):
    for m_name, v_name in name_mapping:
        if m_name not in megatron_name:
            continue
        if 'layers' in megatron_name:  # deal with decoder layers
            megatron_name = megatron_name.replace('decoder', 'model')
            megatron_name_list = megatron_name.split('.')
            if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
                param_name_list = megatron_name_list[:3]
                param_name_list.append(v_name)
                param_name = '.'.join(param_name_list)
            else:
                param_name_list = megatron_name_list[:3]
                weight_or_bias = megatron_name_list[-1]
                param_name_list.append(v_name)
                param_name_list.append(weight_or_bias)
                param_name = '.'.join(param_name_list)
            return param_name
        else:
            param_name = megatron_name.replace(m_name, v_name)
            return param_name
def mistral_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    # TODO: need to implement a general way to deal with prefix
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__ = {
    ColumnParallelLinear: parallel_weight_loader,
    MergedColumnParallelLinear: parallel_weight_loader,
    QKVParallelLinear: parallel_weight_loader,
    RowParallelLinear: parallel_weight_loader,
    VocabParallelEmbedding: parallel_weight_loader,
    ParallelLMHead: parallel_weight_loader
    # "ScaledActivation.weight_loader": ScaledActivation,  # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
    # "default_weight_loader": default_weight_loader
}

# for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
#     # setattr(layer_class, 'megatron_weight_loader', weight_loader)
#     layer_class.weight_loader = weight_loader

__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__ = {
    'GPT2LMHeadModel': gpt2_weight_loader,
    'LlamaForCausalLM': llama_megatron_weight_loader,
    # use te backend for open-source megatron
    'LLaMAForCausalLM': llama_megatron_weight_loader,
    'MistralForCausalLM': mistral_megatron_weight_loader,
}


# the actor model is .state_dict()
# Load megatron weights
def load_megatron_weights(actor_weights: Dict, vllm_model: nn.Module):
    weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
    weight_loader(actor_weights, vllm_model)
    # NOTE(sgm) to reduce peak memory usage, we offload the vllm model to cpu
    # after init, and we need this after syncing model weights in the first iter.
    vllm_model = vllm_model.cuda()


def _get_model_weight_loader(arch: str):
    if arch in __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__:
        return __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__[arch]
    raise ValueError(f"Model architectures {arch} are not supported for now. "
                     f"Supported architectures: {ModelRegistry.get_supported_archs()}")


def update_megatron_weight_loader():
    for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
        layer_class.weight_loader = weight_loader
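The `_replace_name` helper drives all of the Megatron-to-vLLM mappings above: for decoder-layer parameters it keeps the `model.layers.<i>` prefix and the trailing `.weight`/`.bias`, while for everything else it falls back to a plain substring substitution. A small illustrative sketch using a trimmed-down mapping (the strings below are examples, not the full table):

mapping = [
    ("embedding.word_embeddings", "model.embed_tokens"),
    ("self_attention.linear_qkv", "self_attn.qkv_proj"),
]

# Decoder-layer parameter: layer index and ".weight" suffix are preserved.
_replace_name("decoder.layers.0.self_attention.linear_qkv.weight", mapping)
# -> "model.layers.0.self_attn.qkv_proj.weight"

# Non-layer parameter: simple substring replacement.
_replace_name("embedding.word_embeddings.weight", mapping)
# -> "model.embed_tokens.weight"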
verl/third_party/vllm/vllm_v_0_5_4/model_loader.py
0 → 100644
View file @
f87b35b2
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
from typing import Dict, Union, Optional, Iterable, Tuple

import torch
import torch.nn as nn
from transformers import PreTrainedModel

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig,
                         ParallelConfig, SchedulerConfig)
from vllm.model_executor.model_loader import BaseModelLoader
from vllm.model_executor.model_loader.loader import _initialize_model
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.distributed.communication_op import tensor_model_parallel_all_gather

from .config import ModelConfig, LoadFormat, LoadConfig
from .megatron_weight_loaders import load_megatron_weights, update_megatron_weight_loader
from .dtensor_weight_loaders import load_dtensor_weights, update_dtensor_weight_loader
from .hf_weight_loader import update_hf_weight_loader


def get_model(actor_model: Union[PreTrainedModel, Dict],
              model_config: ModelConfig,
              load_config: LoadConfig,
              device_config: DeviceConfig,
              parallel_config: ParallelConfig,
              scheduler_config: SchedulerConfig,
              lora_config: Optional[LoRAConfig],
              multimodal_config: Optional[MultiModalConfig],
              cache_config: CacheConfig = None) -> nn.Module:
    loader = get_model_loader(load_config)
    if load_config.load_format.startswith('dummy'):
        return loader.load_model(model_config=model_config,
                                 device_config=device_config,
                                 lora_config=lora_config,
                                 multimodal_config=multimodal_config,
                                 parallel_config=parallel_config,
                                 scheduler_config=scheduler_config,
                                 cache_config=cache_config)
    else:
        return loader.load_model(actor_model=actor_model,
                                 model_config=model_config,
                                 device_config=device_config,
                                 lora_config=lora_config,
                                 multimodal_config=multimodal_config,
                                 parallel_config=parallel_config,
                                 scheduler_config=scheduler_config,
                                 cache_config=cache_config)


def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
    """Get a model loader based on the load format."""
    if isinstance(load_config.load_format, type):
        return load_config.load_format(load_config)

    if load_config.load_format == LoadFormat.AUTO:
        update_megatron_weight_loader()
        return MegatronLoader(load_config)

    # NOTE(sgm): change the weight_loader function in runtime
    if load_config.load_format == LoadFormat.MEGATRON:
        update_megatron_weight_loader()
        return MegatronLoader(load_config)

    if load_config.load_format == LoadFormat.HF:
        update_hf_weight_loader()
        return HFLoader(load_config)

    if load_config.load_format == LoadFormat.DTENSOR:
        update_dtensor_weight_loader()
        return DTensorLoader(load_config)

    if load_config.load_format == LoadFormat.DUMMY_HF:
        update_hf_weight_loader()
        return DummyModelLoader(load_config)

    if load_config.load_format == LoadFormat.DUMMY_MEGATRON:
        update_megatron_weight_loader()
        return DummyModelLoader(load_config)

    if load_config.load_format == LoadFormat.DUMMY_DTENSOR:
        update_dtensor_weight_loader()
        return DummyModelLoader(load_config)

    raise ValueError('load format not supported in verl: {}, only support {} and {}'.format(
        load_config.load_format, LoadFormat.MEGATRON, LoadFormat.HF))
class DummyModelLoader(BaseModelLoader):
    """Model loader that will set model weights to random values."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def load_model(self, *, model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config,
                                          cache_config, scheduler_config)
        # NOTE(woosuk): For accurate performance evaluation, we assign
        # random values to the weights.
        # initialize_dummy_weights(model)
        return model.eval()


class MegatronLoader(BaseModelLoader):
    """Model loader that can load the model weights from partitioned megatron model."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def _get_weights_iterator(actor_model: Union[PreTrainedModel, Dict]):
        # NOTE(shengguangming) Load the weights from the actor model
        pass
        # if isinstance(actor_model, nn.Module):
        #     load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
        # else:
        #     load_weights(actor_weights=actor_model, vllm_model=model)
        # return actor_model

    def load_model(self, actor_model: Union[PreTrainedModel, Dict],
                   model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config,
                                          cache_config, scheduler_config)

                # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
                if isinstance(actor_model, nn.Module):
                    load_megatron_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
                                          vllm_model=model)
                else:
                    load_megatron_weights(actor_weights=actor_model, vllm_model=model)

            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()

        # NOTE(sgm) Some weights are point to gpu, but still need this.
        model = model.cuda()
        # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
        return model.eval()
class HFLoader(BaseModelLoader):
    """Model loader that can load the model weights from model's full params."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
        if isinstance(actor_model, Dict):
            return actor_model.items()
        elif isinstance(actor_model, nn.Module):
            return dict(actor_model.named_parameters()).items()
        else:
            raise ValueError(f'actor model should be Dict or nn.Module, but got {type(actor_model)}')

    def load_model(self, actor_model: Union[PreTrainedModel, Dict],
                   model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            # with torch.device(device_config.device):
            # NOTE(sgm): init the model in cpu
            model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config,
                                      cache_config, scheduler_config)
            model.load_weights(self._get_weights_iterator(actor_model))
            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()
        # NOTE(sgm) Some weights are point to gpu, but still need this.
        model = model.cuda()
        # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
        return model.eval()
class DTensorLoader(BaseModelLoader):
    """Model loader that can load the model weights from a partitioned DTensor (FSDP) model."""

    def __init__(self, load_config: LoadConfig):
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    def _get_weights_iterator(actor_model: Union[PreTrainedModel, Dict]):
        # NOTE(shengguangming) Load the weights from the actor model
        pass
        # if isinstance(actor_model, nn.Module):
        #     load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
        # else:
        #     load_weights(actor_weights=actor_model, vllm_model=model)
        # return actor_model

    def load_model(self, actor_model: Union[PreTrainedModel, Dict],
                   model_config: ModelConfig,
                   device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig],
                   multimodal_config: Optional[MultiModalConfig],
                   parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig,
                   cache_config: CacheConfig) -> nn.Module:
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config,
                                          cache_config, scheduler_config)

                # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
                if isinstance(actor_model, nn.Module):
                    load_dtensor_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
                                         vllm_model=model)
                else:
                    load_dtensor_weights(actor_weights=actor_model, vllm_model=model)

            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()

        # NOTE(sgm) Some weights are point to gpu, but still need this.
        model = model.cuda()
        # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
        return model.eval()
# FIXME(sgm): hack the _get_logits function in vllm v0.4.2
# as they use ray, the _get_logits result will only need to return to the driver node,
# therefore gather is enough. However, we use SPMD instead of a central scheduler,
# all_gather is required (aligned with v0.2.6)
def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
                embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
    # Get the logits for the next tokens.
    logits = torch.matmul(hidden_states, embedding.t())
    if embedding_bias is not None:
        logits += embedding_bias
    logits = tensor_model_parallel_all_gather(logits)
    # Remove paddings in vocab (if any).
    if logits is not None:
        logits = logits[:, :self.org_vocab_size]
    return logits


from vllm.model_executor.layers.logits_processor import LogitsProcessor


def logitsprocessor_init(self,
                         vocab_size: int,
                         org_vocab_size: Optional[int] = None,
                         scale: float = 1.0,
                         logits_as_input: bool = False,
                         soft_cap: Optional[float] = None) -> None:
    """
    Args:
        scale: A scaling factor to apply to the logits.
    """
    super(LogitsProcessor, self).__init__()
    self.scale = scale
    self.vocab_size = vocab_size
    # Whether the input is logits (default is hidden states).
    self.logits_as_input = logits_as_input
    # original vocabulary size (without LoRA).
    self.org_vocab_size = org_vocab_size or vocab_size
    # Soft cap the logits. Used in Gemma 2.
    self.soft_cap = soft_cap
    # Whether to use gather or all-gather to gather the logits.
    self.use_gather = False


LogitsProcessor.__init__ = logitsprocessor_init  # use all_gather
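The module ends by monkey-patching `LogitsProcessor.__init__` so that every instance is created with `use_gather = False`, forcing the all-gather path each SPMD rank needs in order to see the full vocabulary logits. A minimal sketch of the same patching mechanism on a stand-in class (the class below is hypothetical and only illustrates the pattern, not vLLM's API):

class StandIn:
    def __init__(self) -> None:
        self.use_gather = True  # upstream default: gather to the driver only


def patched_init(self) -> None:
    self.use_gather = False  # SPMD: every rank keeps the full logits via all-gather


# Applied once at import time, before any instance is constructed.
StandIn.__init__ = patched_init
assert StandIn().use_gather is False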